All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 48/52] xfs_scrub: trim realtime volumes too
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-27 13:00   ` Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 49/52] xfs_scrub: use histograms to speed up phase 8 on the realtime volume Darrick J. Wong
                     ` (50 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:00 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

On the kernel side, the XFS realtime groups patchset added support for
FITRIM of the realtime volume.  This support doesn't actually require
there to be any realtime groups, so teach scrub to run through the whole
region.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase8.c |   32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)


diff --git a/scrub/phase8.c b/scrub/phase8.c
index c6845555579..25d1ec3693e 100644
--- a/scrub/phase8.c
+++ b/scrub/phase8.c
@@ -59,7 +59,8 @@ fstrim_fsblocks(
 	struct scrub_ctx	*ctx,
 	uint64_t		start_fsb,
 	uint64_t		fsbcount,
-	uint64_t		minlen_fsb)
+	uint64_t		minlen_fsb,
+	bool			ignore_einval)
 {
 	uint64_t		start = cvt_off_fsb_to_b(&ctx->mnt, start_fsb);
 	uint64_t		len = cvt_off_fsb_to_b(&ctx->mnt, fsbcount);
@@ -72,6 +73,8 @@ fstrim_fsblocks(
 		run = min(len, FSTRIM_MAX_BYTES);
 
 		error = fstrim(ctx, start, run, minlen);
+		if (error == EINVAL && ignore_einval)
+			error = EOPNOTSUPP;
 		if (error == EOPNOTSUPP) {
 			/* Pretend we finished all the work. */
 			progress_add(len);
@@ -177,7 +180,8 @@ fstrim_datadev(
 		 */
 		progress_add(geo->blocksize);
 		fsbcount = min(geo->datablocks - fsbno + 1, geo->agblocks);
-		error = fstrim_fsblocks(ctx, fsbno + 1, fsbcount, minlen_fsb);
+		error = fstrim_fsblocks(ctx, fsbno + 1, fsbcount, minlen_fsb,
+				false);
 		if (error)
 			return error;
 	}
@@ -185,15 +189,35 @@ fstrim_datadev(
 	return 0;
 }
 
+/* Trim the realtime device. */
+static int
+fstrim_rtdev(
+	struct scrub_ctx	*ctx)
+{
+	struct xfs_fsop_geom	*geo = &ctx->mnt.fsgeom;
+
+	/*
+	 * The fstrim ioctl pretends that the realtime volume is in the address
+	 * space immediately after the data volume.  Ignore EINVAL if someone
+	 * tries to run us on an older kernel.
+	 */
+	return fstrim_fsblocks(ctx, geo->datablocks, geo->rtblocks, 0, true);
+}
+
 /* Trim the filesystem, if desired. */
 int
 phase8_func(
 	struct scrub_ctx	*ctx)
 {
+	int			error;
+
 	if (!fstrim_ok(ctx))
 		return 0;
 
-	return fstrim_datadev(ctx);
+	error = fstrim_datadev(ctx);
+	if (error)
+		return error;
+	return fstrim_rtdev(ctx);
 }
 
 /* Estimate how much work we're going to do. */
@@ -207,6 +231,8 @@ phase8_estimate(
 	if (fstrim_ok(ctx)) {
 		*items = cvt_off_fsb_to_b(&ctx->mnt,
 				ctx->mnt.fsgeom.datablocks);
+		*items += cvt_off_fsb_to_b(&ctx->mnt,
+				ctx->mnt.fsgeom.rtblocks);
 	} else {
 		*items = 0;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 49/52] xfs_scrub: use histograms to speed up phase 8 on the realtime volume
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 48/52] xfs_scrub: trim realtime volumes too Darrick J. Wong
@ 2023-12-27 13:00   ` Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 50/52] mkfs: add headers to realtime bitmap blocks Darrick J. Wong
                     ` (49 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:00 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the same statistical methods that we use on the data volume to
compute the minimum threshold size for fstrims on the realtime volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase7.c    |    7 +++++++
 scrub/phase8.c    |    6 +++++-
 scrub/xfs_scrub.c |    2 ++
 scrub/xfs_scrub.h |    1 +
 4 files changed, 15 insertions(+), 1 deletion(-)


diff --git a/scrub/phase7.c b/scrub/phase7.c
index 475d8f157ee..01097b67879 100644
--- a/scrub/phase7.c
+++ b/scrub/phase7.c
@@ -31,6 +31,7 @@ struct summary_counts {
 
 	/* Free space histogram, in fsb */
 	struct histogram	datadev_hist;
+	struct histogram	rtdev_hist;
 };
 
 /*
@@ -56,6 +57,7 @@ summary_count_init(
 	struct summary_counts	*counts = data;
 
 	init_freesp_hist(&counts->datadev_hist);
+	init_freesp_hist(&counts->rtdev_hist);
 }
 
 /* Record block usage. */
@@ -83,6 +85,8 @@ count_block_summary(
 		blocks = cvt_b_to_off_fsbt(&ctx->mnt, fsmap->fmr_length);
 		if (fsmap->fmr_device == ctx->fsinfo.fs_datadev)
 			hist_add(&counts->datadev_hist, blocks);
+		else if (fsmap->fmr_device == ctx->fsinfo.fs_rtdev)
+			hist_add(&counts->rtdev_hist, blocks);
 		return 0;
 	}
 
@@ -124,7 +128,9 @@ add_summaries(
 	total->agbytes += item->agbytes;
 
 	hist_import(&total->datadev_hist, &item->datadev_hist);
+	hist_import(&total->rtdev_hist, &item->rtdev_hist);
 	hist_free(&item->datadev_hist);
+	hist_free(&item->rtdev_hist);
 	return 0;
 }
 
@@ -195,6 +201,7 @@ phase7_func(
 
 	/* Preserve free space histograms for phase 8. */
 	hist_move(&ctx->datadev_hist, &totalcount.datadev_hist);
+	hist_move(&ctx->rtdev_hist, &totalcount.rtdev_hist);
 
 	/* Scan the whole fs. */
 	error = scrub_count_all_inodes(ctx, &counted_inodes);
diff --git a/scrub/phase8.c b/scrub/phase8.c
index 25d1ec3693e..d59bab7009c 100644
--- a/scrub/phase8.c
+++ b/scrub/phase8.c
@@ -195,13 +195,17 @@ fstrim_rtdev(
 	struct scrub_ctx	*ctx)
 {
 	struct xfs_fsop_geom	*geo = &ctx->mnt.fsgeom;
+	uint64_t		minlen_fsb;
+
+	minlen_fsb = fstrim_compute_minlen(ctx, &ctx->rtdev_hist);
 
 	/*
 	 * The fstrim ioctl pretends that the realtime volume is in the address
 	 * space immediately after the data volume.  Ignore EINVAL if someone
 	 * tries to run us on an older kernel.
 	 */
-	return fstrim_fsblocks(ctx, geo->datablocks, geo->rtblocks, 0, true);
+	return fstrim_fsblocks(ctx, geo->datablocks, geo->rtblocks,
+			minlen_fsb, true);
 }
 
 /* Trim the filesystem, if desired. */
diff --git a/scrub/xfs_scrub.c b/scrub/xfs_scrub.c
index 7c73e4d3cca..c510d9534a8 100644
--- a/scrub/xfs_scrub.c
+++ b/scrub/xfs_scrub.c
@@ -713,6 +713,7 @@ main(
 	int			error;
 
 	hist_init(&ctx.datadev_hist);
+	hist_init(&ctx.rtdev_hist);
 
 	fprintf(stdout, "EXPERIMENTAL xfs_scrub program in use! Use at your own risk!\n");
 	fflush(stdout);
@@ -944,6 +945,7 @@ main(
 	unicrash_unload();
 
 	hist_free(&ctx.datadev_hist);
+	hist_free(&ctx.rtdev_hist);
 
 	/*
 	 * If we're being run as a service, the return code must fit the LSB
diff --git a/scrub/xfs_scrub.h b/scrub/xfs_scrub.h
index 4d9a028921b..8389551c067 100644
--- a/scrub/xfs_scrub.h
+++ b/scrub/xfs_scrub.h
@@ -94,6 +94,7 @@ struct scrub_ctx {
 
 	/* Free space histograms, in fsb */
 	struct histogram	datadev_hist;
+	struct histogram	rtdev_hist;
 
 	/*
 	 * Pick the largest value for fstrim minlen such that we trim at least


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 50/52] mkfs: add headers to realtime bitmap blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 48/52] xfs_scrub: trim realtime volumes too Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 49/52] xfs_scrub: use histograms to speed up phase 8 on the realtime volume Darrick J. Wong
@ 2023-12-27 13:00   ` Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 51/52] mkfs: add headers to realtime summary blocks Darrick J. Wong
                     ` (48 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:00 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When the rtgroups feature is enabled, format rtbitmap blocks with the
appropriate block headers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c    |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mkfs/xfs_mkfs.c |    6 +++++-
 2 files changed, 56 insertions(+), 1 deletion(-)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index 36df148018f..f5b7859f9a9 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -852,6 +852,50 @@ rtsummary_create(
 	mp->m_rsumip = rsumip;
 }
 
+/* Initialize block headers of rt free space files. */
+static int
+init_rtblock_headers(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		nrblocks,
+	const struct xfs_buf_ops *ops,
+	uint32_t		magic)
+{
+	struct xfs_bmbt_irec	map;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_rtbuf_blkinfo *hdr;
+	xfs_fileoff_t		off = 0;
+	int			error;
+
+	while (off < nrblocks) {
+		struct xfs_buf	*bp;
+		xfs_daddr_t	daddr;
+		int		nimaps = 1;
+
+		error = -libxfs_bmapi_read(ip, off, 1, &map, &nimaps, 0);
+		if (error)
+			return error;
+
+		daddr = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		error = -libxfs_buf_get(mp->m_ddev_targp, daddr,
+				XFS_FSB_TO_BB(mp, map.br_blockcount), &bp);
+		if (error)
+			return error;
+
+		bp->b_ops = ops;
+		hdr = bp->b_addr;
+		hdr->rt_magic = cpu_to_be32(magic);
+		hdr->rt_owner = cpu_to_be64(ip->i_ino);
+		hdr->rt_blkno = cpu_to_be64(daddr);
+		platform_uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid);
+		libxfs_buf_mark_dirty(bp);
+		libxfs_buf_relse(bp);
+
+		off = map.br_startoff + map.br_blockcount;
+	}
+
+	return 0;
+}
+
 /* Zero the realtime bitmap. */
 static void
 rtbitmap_init(
@@ -895,6 +939,13 @@ rtbitmap_init(
 	if (error)
 		fail(_("Block allocation of the realtime bitmap inode failed"),
 				error);
+
+	if (xfs_has_rtgroups(mp)) {
+		error = init_rtblock_headers(mp->m_rbmip, mp->m_sb.sb_rbmblocks,
+				&xfs_rtbitmap_buf_ops, XFS_RTBITMAP_MAGIC);
+		if (error)
+			fail(_("Initialization of rtbitmap failed"), error);
+	}
 }
 
 /* Zero the realtime summary file. */
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 2330ebebfae..aab1d9130b2 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -909,6 +909,7 @@ struct sb_feat_args {
 	bool	nodalign;
 	bool	nortalign;
 	bool	nrext64;
+	bool	rtgroups;		/* XFS_SB_FEAT_COMPAT_RTGROUPS */
 };
 
 struct cli_params {
@@ -3124,6 +3125,7 @@ validate_rtdev(
 	struct cli_params	*cli)
 {
 	struct libxfs_init	*xi = cli->xi;
+	unsigned int		rbmblocksize = cfg->blocksize;
 
 	if (!xi->rt.dev) {
 		if (cli->rtsize) {
@@ -3167,8 +3169,10 @@ reported by the device (%u).\n"),
 _("cannot have an rt subvolume with zero extents\n"));
 		usage();
 	}
+	if (cfg->sb_feat.rtgroups)
+		rbmblocksize -= sizeof(struct xfs_rtbuf_blkinfo);
 	cfg->rtbmblocks = (xfs_extlen_t)howmany(cfg->rtextents,
-						NBBY * cfg->blocksize);
+						NBBY * rbmblocksize);
 }
 
 static bool


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 51/52] mkfs: add headers to realtime summary blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:00   ` [PATCH 50/52] mkfs: add headers to realtime bitmap blocks Darrick J. Wong
@ 2023-12-27 13:00   ` Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 52/52] mkfs: format realtime groups Darrick J. Wong
                     ` (47 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:00 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When the rtgroups feature is enabled, format rtsummary blocks with the
appropriate block headers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c |    8 ++++++++
 1 file changed, 8 insertions(+)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index f5b7859f9a9..b89b114d0d6 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -991,6 +991,14 @@ rtsummary_init(
 	if (error)
 		fail(_("Block allocation of the realtime summary inode failed"),
 				error);
+
+	if (xfs_has_rtgroups(mp)) {
+		error = init_rtblock_headers(mp->m_rsumip,
+				XFS_B_TO_FSB(mp, mp->m_rsumsize),
+				&xfs_rtsummary_buf_ops, XFS_RTSUMMARY_MAGIC);
+		if (error)
+			fail(_("Initialization of rtsummary failed"), error);
+	}
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 52/52] mkfs: format realtime groups
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:00   ` [PATCH 51/52] mkfs: add headers to realtime summary blocks Darrick J. Wong
@ 2023-12-27 13:01   ` Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 01/52] xfs: create incore realtime group structures Darrick J. Wong
                     ` (46 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:01 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create filesystems with the realtime group feature enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/div64.h          |    6 +
 libfrog/util.c           |   12 ++
 libfrog/util.h           |    1 
 libxfs/libxfs_api_defs.h |    1 
 libxfs/libxfs_priv.h     |    6 -
 libxfs/topology.c        |   42 +++++++
 libxfs/topology.h        |    3 +
 libxfs/xfs_format.h      |    1 
 man/man8/mkfs.xfs.8.in   |   44 +++++++
 mkfs/proto.c             |   45 +++++++-
 mkfs/xfs_mkfs.c          |  272 ++++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 425 insertions(+), 8 deletions(-)


diff --git a/libfrog/div64.h b/libfrog/div64.h
index 673b01cbab3..4b0d4c3b3c7 100644
--- a/libfrog/div64.h
+++ b/libfrog/div64.h
@@ -93,4 +93,10 @@ howmany_64(uint64_t x, uint32_t y)
 	return x;
 }
 
+static inline __attribute__((const))
+int is_power_of_2(unsigned long n)
+{
+	return (n != 0 && ((n & (n - 1)) == 0));
+}
+
 #endif /* LIBFROG_DIV64_H_ */
diff --git a/libfrog/util.c b/libfrog/util.c
index 46047571a55..4e130c884c1 100644
--- a/libfrog/util.c
+++ b/libfrog/util.c
@@ -36,3 +36,15 @@ memchr_inv(const void *start, int c, size_t bytes)
 
 	return NULL;
 }
+
+unsigned int
+log2_rounddown(unsigned long long i)
+{
+	int	rval;
+
+	for (rval = NBBY * sizeof(i) - 1; rval >= 0; rval--) {
+		if ((1ULL << rval) < i)
+			break;
+	}
+	return rval;
+}
diff --git a/libfrog/util.h b/libfrog/util.h
index ac2f331c93e..b0715576e8d 100644
--- a/libfrog/util.h
+++ b/libfrog/util.h
@@ -7,6 +7,7 @@
 #define __LIBFROG_UTIL_H__
 
 unsigned int	log2_roundup(unsigned int i);
+unsigned int	log2_rounddown(unsigned long long i);
 
 void *memchr_inv(const void *start, int c, size_t bytes);
 
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index a3503e07984..c5dad34f3d2 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -80,6 +80,7 @@
 #define xfs_btree_update		libxfs_btree_update
 #define xfs_btree_space_to_height	libxfs_btree_space_to_height
 #define xfs_btree_visit_blocks		libxfs_btree_visit_blocks
+#define xfs_buf_delwri_queue		libxfs_buf_delwri_queue
 #define xfs_buf_delwri_submit		libxfs_buf_delwri_submit
 #define xfs_buf_get			libxfs_buf_get
 #define xfs_buf_get_uncached		libxfs_buf_get_uncached
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 4e4a51637e6..120a41e20a7 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -337,12 +337,6 @@ find_next_zero_bit(const unsigned long *addr, unsigned long size,
 }
 #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
 
-static inline __attribute__((const))
-int is_power_of_2(unsigned long n)
-{
-	return (n != 0 && ((n & (n - 1)) == 0));
-}
-
 /*
  * xfs_iroundup: round up argument to next power of two
  */
diff --git a/libxfs/topology.c b/libxfs/topology.c
index 06013d42945..b87731820c4 100644
--- a/libxfs/topology.c
+++ b/libxfs/topology.c
@@ -89,6 +89,48 @@ calc_default_ag_geometry(
 	*agcount = dblocks / blocks + (dblocks % blocks != 0);
 }
 
+void
+calc_default_rtgroup_geometry(
+	int		blocklog,
+	uint64_t	rblocks,
+	uint64_t	*rgsize,
+	uint64_t	*rgcount)
+{
+	uint64_t	blocks = 0;
+	int		shift = 0;
+
+	/*
+	 * For a single underlying storage device over 4TB in size use the
+	 * maximum rtgroup size.  Between 128MB and 4TB, just use 4 rtgroups
+	 * and scale up smoothly between min/max rtgroup sizes.
+	 */
+	if (rblocks >= TERABYTES(4, blocklog)) {
+		blocks = XFS_MAX_RGBLOCKS;
+		goto done;
+	}
+	if (rblocks >= MEGABYTES(128, blocklog)) {
+		shift = XFS_NOMULTIDISK_AGLOG;
+		goto calc_blocks;
+	}
+
+	/*
+	 * If rblocks is not evenly divisible by the number of desired rt
+	 * groups, round "blocks" up so we don't lose the last bit of the
+	 * filesystem. The same principle applies to the rt group count, so we
+	 * don't lose the last rt group!
+	 */
+calc_blocks:
+	ASSERT(shift >= 0 && shift <= XFS_MULTIDISK_AGLOG);
+	blocks = rblocks >> shift;
+	if (rblocks & xfs_mask32lo(shift)) {
+		if (blocks < XFS_MAX_RGBLOCKS)
+		    blocks++;
+	}
+done:
+	*rgsize = blocks;
+	*rgcount = rblocks / blocks + (rblocks % blocks != 0);
+}
+
 /*
  * Check for existing filesystem or partition table on device.
  * Returns:
diff --git a/libxfs/topology.h b/libxfs/topology.h
index 1af5b054947..f1174bb4bab 100644
--- a/libxfs/topology.h
+++ b/libxfs/topology.h
@@ -32,6 +32,9 @@ calc_default_ag_geometry(
 	uint64_t	*agsize,
 	uint64_t	*agcount);
 
+void calc_default_rtgroup_geometry(int blocklog, uint64_t rblocks,
+		uint64_t *rgsize, uint64_t *rgcount);
+
 extern int
 check_overwrite(
 	const char	*device);
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 59ba13db53e..87476c6bb6c 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -418,6 +418,7 @@ xfs_sb_has_ro_compat_feature(
 		 XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
 		 XFS_SB_FEAT_INCOMPAT_NREXT64| \
 		 XFS_SB_FEAT_INCOMPAT_PARENT | \
+		 XFS_SB_FEAT_INCOMPAT_RTGROUPS | \
 		 XFS_SB_FEAT_INCOMPAT_METADIR)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
diff --git a/man/man8/mkfs.xfs.8.in b/man/man8/mkfs.xfs.8.in
index 587754ff95b..e0175ca04b4 100644
--- a/man/man8/mkfs.xfs.8.in
+++ b/man/man8/mkfs.xfs.8.in
@@ -1116,6 +1116,50 @@ or logical volume containing the section.
 .BI noalign
 This option disables stripe size detection, enforcing a realtime device with no
 stripe geometry.
+.TP
+.BI rtgroups= value
+This feature breaks the realtime section into multiple allocation groups for
+improved scalability.
+This feature is only available if the metadata directory tree feature is
+enabled.
+.IP
+By default,
+.B mkfs.xfs
+will not enable this feature.
+If the option
+.B \-r rtgroups=0
+is used, the rt group feature is not supported and is disabled.
+.TP
+.BI rgcount=
+This is used to specify the number of allocation groups in the realtime
+section.
+The realtime section of the filesystem can be divided into allocation groups to
+improve the performance of XFS.
+More allocation groups imply that more parallelism can be achieved when
+allocating blocks.
+The minimum allocation group size is 2 realtime extents; the maximum size is
+2^31 blocks.
+The rt section of the filesystem is divided into
+.I value
+allocation groups (default value is scaled automatically based
+on the underlying device size).
+.TP
+.BI rgsize= value
+This is an alternative to using the
+.B rgcount
+suboption. The
+.I value
+is the desired size of the realtime allocation group expressed in bytes
+(usually using the
+.BR m " or " g
+suffixes).
+This value must be a multiple of the realtime extent size,
+must be at least two realtime extents, and no more than 2^31 blocks.
+The
+.B rgcount
+and
+.B rgsize
+suboptions are mutually exclusive.
 .RE
 .PP
 .PD 0
diff --git a/mkfs/proto.c b/mkfs/proto.c
index b89b114d0d6..5239f9ec413 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -1001,6 +1001,46 @@ rtsummary_init(
 	}
 }
 
+static void
+rtfreesp_init_groups(
+	struct xfs_mount	*mp)
+{
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		struct xfs_trans	*tp;
+		xfs_rtblock_t	rtbno;
+		xfs_rtxnum_t	start_rtx;
+		xfs_rtxnum_t	next_rtx;
+
+		rtbno = xfs_rgbno_to_rtb(mp, rgno, mp->m_sb.sb_rextsize);
+		start_rtx = xfs_rtb_to_rtx(mp, rtbno);
+
+		rtbno = xfs_rgbno_to_rtb(mp, rgno + 1, 0);
+		next_rtx = xfs_rtb_to_rtx(mp, rtbno);
+		next_rtx = min(next_rtx, mp->m_sb.sb_rextents);
+
+		error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+				0, 0, 0, &tp);
+		if (error)
+			res_failed(error);
+
+		libxfs_trans_ijoin(tp, mp->m_rbmip, 0);
+		error = -libxfs_rtfree_extent(tp, start_rtx,
+				next_rtx - start_rtx);
+		if (error) {
+			fail(_("Error initializing the realtime space"),
+				error);
+		}
+		error = -libxfs_trans_commit(tp);
+		if (error)
+			fail(_("Initialization of the realtime space failed"),
+					error);
+
+	}
+}
+
 /*
  * Free the whole realtime area using transactions.
  * Do one transaction per bitmap block.
@@ -1049,7 +1089,10 @@ rtinit(
 
 	rtbitmap_init(mp);
 	rtsummary_init(mp);
-	rtfreesp_init(mp);
+	if (xfs_has_rtgroups(mp))
+		rtfreesp_init_groups(mp);
+	else
+		rtfreesp_init(mp);
 }
 
 static long
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index aab1d9130b2..66532b8c9b6 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -131,6 +131,9 @@ enum {
 	R_FILE,
 	R_NAME,
 	R_NOALIGN,
+	R_RTGROUPS,
+	R_RGCOUNT,
+	R_RGSIZE,
 	R_MAX_OPTS,
 };
 
@@ -718,6 +721,9 @@ static struct opt_params ropts = {
 		[R_FILE] = "file",
 		[R_NAME] = "name",
 		[R_NOALIGN] = "noalign",
+		[R_RTGROUPS] = "rtgroups",
+		[R_RGCOUNT] = "rgcount",
+		[R_RGSIZE] = "rgsize",
 		[R_MAX_OPTS] = NULL,
 	},
 	.subopt_params = {
@@ -757,6 +763,27 @@ static struct opt_params ropts = {
 		  .defaultval = 1,
 		  .conflicts = { { NULL, LAST_CONFLICT } },
 		},
+		{ .index = R_RTGROUPS,
+		  .conflicts = { { NULL, LAST_CONFLICT } },
+		  .minval = 0,
+		  .maxval = 1,
+		  .defaultval = 1,
+		},
+		{ .index = R_RGCOUNT,
+		  .conflicts = { { &dopts, R_RGSIZE },
+				 { NULL, LAST_CONFLICT } },
+		  .minval = 1,
+		  .maxval = XFS_MAX_RGNUMBER,
+		  .defaultval = SUBOPT_NEEDS_VAL,
+		},
+		{ .index = R_RGSIZE,
+		  .conflicts = { { &dopts, R_RGCOUNT },
+				 { NULL, LAST_CONFLICT } },
+		  .convert = true,
+		  .minval = 0,
+		  .maxval = (unsigned long long)XFS_MAX_RGBLOCKS << XFS_MAX_BLOCKSIZE_LOG,
+		  .defaultval = SUBOPT_NEEDS_VAL,
+		},
 	},
 };
 
@@ -922,6 +949,7 @@ struct cli_params {
 	/* parameters that depend on sector/block size being validated. */
 	char	*dsize;
 	char	*agsize;
+	char	*rgsize;
 	char	*dsu;
 	char	*dirblocksize;
 	char	*logsize;
@@ -943,6 +971,7 @@ struct cli_params {
 
 	/* parameters where 0 is not a valid value */
 	int64_t	agcount;
+	int64_t	rgcount;
 	int	inodesize;
 	int	inopblock;
 	int	imaxpct;
@@ -999,6 +1028,9 @@ struct mkfs_params {
 	uint64_t	agsize;
 	uint64_t	agcount;
 
+	uint64_t	rgsize;
+	uint64_t	rgcount;
+
 	int		imaxpct;
 
 	bool		loginternal;
@@ -1055,7 +1087,8 @@ usage( void )
 /* no-op info only */	[-N]\n\
 /* prototype file */	[-p fname]\n\
 /* quiet */		[-q]\n\
-/* realtime subvol */	[-r extsize=num,size=num,rtdev=xxx]\n\
+/* realtime subvol */	[-r extsize=num,size=num,rtdev=xxx,rtgroups=0|1,\n\
+			    rgcount=n,rgsize=n]\n\
 /* sectorsize */	[-s size=num]\n\
 /* version */		[-V]\n\
 			devicename\n\
@@ -1952,6 +1985,15 @@ rtdev_opts_parser(
 	case R_NOALIGN:
 		cli->sb_feat.nortalign = getnum(value, opts, subopt);
 		break;
+	case R_RTGROUPS:
+		cli->sb_feat.rtgroups = getnum(value, opts, subopt);
+		break;
+	case R_RGCOUNT:
+		cli->rgcount = getnum(value, opts, subopt);
+		break;
+	case R_RGSIZE:
+		cli->rgsize = getstr(value, opts, subopt);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2447,6 +2489,15 @@ _("cowextsize not supported without reflink support\n"));
 		usage();
 	}
 
+	if (cli->sb_feat.rtgroups && !cli->sb_feat.metadir) {
+		if (cli_opt_set(&mopts, M_METADIR)) {
+			fprintf(stderr,
+_("realtime groups not supported without metadata directory support\n"));
+			usage();
+		}
+		cli->sb_feat.metadir = true;
+	}
+
 	/*
 	 * Copy features across to config structure now.
 	 */
@@ -3413,6 +3464,181 @@ an AG size that is one stripe unit smaller or larger, for example %llu.\n"),
 			     cfg->agsize, cfg->agcount);
 }
 
+static uint64_t
+calc_rgsize_extsize_nonpower(
+	struct mkfs_params	*cfg)
+{
+	uint64_t		try_rgsize, rgsize, rgcount;
+
+	/*
+	 * For non-power-of-two rt extent sizes, round the rtgroup size down to
+	 * the nearest extent.
+	 */
+	calc_default_rtgroup_geometry(cfg->blocklog, cfg->rtblocks, &rgsize,
+			&rgcount);
+	rgsize -= rgsize % cfg->rtextblocks;
+	rgsize = min(XFS_MAX_RGBLOCKS, rgsize);
+
+	/*
+	 * If we would be left with a too-small rtgroup, increase or decrease
+	 * the size of the group until we have a working geometry.
+	 */
+	for (try_rgsize = rgsize;
+	     try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+	     try_rgsize += cfg->rtextblocks) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+	for (try_rgsize = rgsize;
+	     try_rgsize > (2 * cfg->rtextblocks);
+	     try_rgsize -= cfg->rtextblocks) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+
+	fprintf(stderr,
+_("realtime group size (%llu) not at all congruent with extent size (%llu)\n"),
+			(unsigned long long)rgsize,
+			(unsigned long long)cfg->rtextblocks);
+	usage();
+	return 0;
+}
+
+static uint64_t
+calc_rgsize_extsize_power(
+	struct mkfs_params	*cfg)
+{
+	uint64_t		try_rgsize, rgsize, rgcount;
+	unsigned int		rgsizelog;
+
+	/*
+	 * Find the rt group size that is both a power of two and yields at
+	 * least as many rt groups as the default geometry specified.
+	 */
+	calc_default_rtgroup_geometry(cfg->blocklog, cfg->rtblocks, &rgsize,
+			&rgcount);
+	rgsizelog = log2_rounddown(rgsize);
+	rgsize = min(XFS_MAX_RGBLOCKS, 1U << rgsizelog);
+
+	/*
+	 * If we would be left with a too-small rtgroup, increase or decrease
+	 * the size of the group by powers of 2 until we have a working
+	 * geometry.  If that doesn't work, try bumping by the extent size.
+	 */
+	for (try_rgsize = rgsize;
+	     try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+	     try_rgsize <<= 2) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+	for (try_rgsize = rgsize;
+	     try_rgsize > (2 * cfg->rtextblocks);
+	     try_rgsize >>= 2) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+	for (try_rgsize = rgsize;
+	     try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+	     try_rgsize += cfg->rtextblocks) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+	for (try_rgsize = rgsize;
+	     try_rgsize > (2 * cfg->rtextblocks);
+	     try_rgsize -= cfg->rtextblocks) {
+		if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+			return try_rgsize;
+	}
+
+	fprintf(stderr,
+_("realtime group size (%llu) not at all congruent with extent size (%llu)\n"),
+			(unsigned long long)rgsize,
+			(unsigned long long)cfg->rtextblocks);
+	usage();
+	return 0;
+}
+
+static void
+calculate_rtgroup_geometry(
+	struct mkfs_params	*cfg,
+	struct cli_params	*cli)
+{
+	if (!cli->sb_feat.rtgroups) {
+		cfg->rgcount = 0;
+		cfg->rgsize = 0;
+		return;
+	}
+
+	if (cli->rgsize) {	/* User-specified rtgroup size */
+		cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE);
+
+		/*
+		 * Check specified agsize is a multiple of blocksize.
+		 */
+		if (cfg->rgsize % cfg->blocksize) {
+			fprintf(stderr,
+_("rgsize (%s) not a multiple of fs blk size (%d)\n"),
+				cli->rgsize, cfg->blocksize);
+			usage();
+		}
+		cfg->rgsize /= cfg->blocksize;
+		cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+				(cfg->rtblocks % cfg->rgsize != 0);
+
+	} else if (cli->rgcount) {	/* User-specified rtgroup count */
+		cfg->rgcount = cli->rgcount;
+		cfg->rgsize = cfg->rtblocks / cfg->rgcount +
+				(cfg->rtblocks % cfg->rgcount != 0);
+	} else if (cfg->rtblocks == 0) {
+		/*
+		 * If nobody specified a realtime device or the rtgroup size,
+		 * try 1TB, rounded down to the nearest rt extent.
+		 */
+		cfg->rgsize = TERABYTES(1, cfg->blocklog);
+		cfg->rgsize -= cfg->rgsize % cfg->rtextblocks;
+		cfg->rgcount = 0;
+	} else if (!is_power_of_2(cfg->rtextblocks)) {
+		cfg->rgsize = calc_rgsize_extsize_nonpower(cfg);
+		cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+				(cfg->rtblocks % cfg->rgsize != 0);
+	} else {
+		cfg->rgsize = calc_rgsize_extsize_power(cfg);
+		cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+				(cfg->rtblocks % cfg->rgsize != 0);
+	}
+
+	if (cfg->rgsize > XFS_MAX_RGBLOCKS) {
+		fprintf(stderr,
+_("realtime group size (%llu) must be less than the maximum (%u)\n"),
+				(unsigned long long)cfg->rgsize,
+				XFS_MAX_RGBLOCKS);
+		usage();
+	}
+
+	if (cfg->rgsize % cfg->rtextblocks != 0) {
+		fprintf(stderr,
+_("realtime group size (%llu) not a multiple of rt extent size (%llu)\n"),
+				(unsigned long long)cfg->rgsize,
+				(unsigned long long)cfg->rtextblocks);
+		usage();
+	}
+
+	if (cfg->rgsize <= cfg->rtextblocks) {
+		fprintf(stderr,
+_("realtime group size (%llu) must be at least two realtime extents\n"),
+				(unsigned long long)cfg->rgsize);
+		usage();
+	}
+
+	if (cfg->rgcount > XFS_MAX_RGNUMBER) {
+		fprintf(stderr,
+_("realtime group count (%llu) must be less than the maximum (%u)\n"),
+				(unsigned long long)cfg->rgcount,
+				XFS_MAX_RGNUMBER);
+		usage();
+	}
+}
+
 static void
 calculate_imaxpct(
 	struct mkfs_params	*cfg,
@@ -3552,6 +3778,12 @@ sb_set_features(
 
 	if (fp->nrext64)
 		sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_NREXT64;
+
+	if (fp->rtgroups) {
+		sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_RTGROUPS;
+		sbp->sb_rgcount = cfg->rgcount;
+		sbp->sb_rgblocks = cfg->rgsize;
+	}
 }
 
 /*
@@ -4327,6 +4559,7 @@ main(
 	char			**argv)
 {
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	struct xfs_buf		*buf;
 	int			c;
 	int			dry_run = 0;
@@ -4536,6 +4769,7 @@ main(
 	 */
 	calculate_initial_ag_geometry(&cfg, &cli, &xi);
 	align_ag_geometry(&cfg);
+	calculate_rtgroup_geometry(&cfg, &cli);
 
 	calculate_imaxpct(&cfg, &cli);
 
@@ -4636,6 +4870,42 @@ main(
 		exit(1);
 	}
 
+	/* Write all the realtime group superblocks. */
+	for (rgno = 0; rgno < cfg.rgcount; rgno++) {
+		struct xfs_buf	*rtsb_bp;
+		struct xfs_buf	*sb_bp = libxfs_getsb(mp);
+
+		if (!sb_bp) {
+			fprintf(stderr,
+ _("%s: couldn't grab buffers to write primary rt superblock\n"), progname);
+			exit(1);
+		}
+
+		error = -libxfs_buf_get_uncached(mp->m_rtdev_targp,
+				XFS_FSB_TO_BB(mp, 1), 0,
+				&rtsb_bp);
+		if (error) {
+			fprintf(stderr,
+ _("%s: couldn't grab primary rt superblock\n"), progname);
+			exit(1);
+		}
+		rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR;
+		rtsb_bp->b_ops = &xfs_rtsb_buf_ops;
+
+		libxfs_rtgroup_update_super(rtsb_bp, sb_bp);
+		libxfs_buf_mark_dirty(rtsb_bp);
+		libxfs_buf_relse(rtsb_bp);
+		libxfs_buf_relse(sb_bp);
+
+		error = -libxfs_rtgroup_update_secondary_sbs(mp);
+		if (error) {
+			fprintf(stderr,
+	_("%s: writing secondary rtgroup headers failed, err=%d\n"),
+					progname, error);
+			exit(1);
+		}
+	}
+
 	/*
 	 * Initialise the freespace freelists (i.e. AGFLs) in each AG.
 	 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
@ 2023-12-27 13:01   ` Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
                     ` (12 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:01 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Replace all the shouty bmap btree and bmap disk root macros with actual
functions, and fix a type handling error in the xattr code that the
macros previously didn't care about.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/bmap.c               |   10 +-
 db/bmap_inflate.c       |    2 
 db/bmroot.c             |    8 +-
 db/check.c              |    8 +-
 db/frag.c               |    8 +-
 db/metadump.c           |   16 ++--
 libxfs/xfs_attr_leaf.c  |    8 +-
 libxfs/xfs_bmap.c       |   40 +++++----
 libxfs/xfs_bmap_btree.c |   18 ++--
 libxfs/xfs_bmap_btree.h |  204 ++++++++++++++++++++++++++++++++---------------
 libxfs/xfs_inode_fork.c |   30 +++----
 libxfs/xfs_trans_resv.c |    2 
 repair/bmap_repair.c    |    2 
 repair/dinode.c         |   10 +-
 repair/prefetch.c       |    8 +-
 repair/scan.c           |    6 +
 16 files changed, 228 insertions(+), 152 deletions(-)


diff --git a/db/bmap.c b/db/bmap.c
index 874135f001e..7915772aaee 100644
--- a/db/bmap.c
+++ b/db/bmap.c
@@ -78,8 +78,8 @@ bmap(
 		push_cur();
 		rblock = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 		fsize = XFS_DFORK_SIZE(dip, mp, whichfork);
-		pp = XFS_BMDR_PTR_ADDR(rblock, 1, libxfs_bmdr_maxrecs(fsize, 0));
-		kp = XFS_BMDR_KEY_ADDR(rblock, 1);
+		pp = xfs_bmdr_ptr_addr(rblock, 1, libxfs_bmdr_maxrecs(fsize, 0));
+		kp = xfs_bmdr_key_addr(rblock, 1);
 		bno = select_child(curoffset, kp, pp,
 					be16_to_cpu(rblock->bb_numrecs));
 		for (;;) {
@@ -88,9 +88,9 @@ bmap(
 			block = (struct xfs_btree_block *)iocur_top->data;
 			if (be16_to_cpu(block->bb_level) == 0)
 				break;
-			pp = XFS_BMBT_PTR_ADDR(mp, block, 1,
+			pp = xfs_bmbt_ptr_addr(mp, block, 1,
 				libxfs_bmbt_maxrecs(mp, mp->m_sb.sb_blocksize, 0));
-			kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+			kp = xfs_bmbt_key_addr(mp, block, 1);
 			bno = select_child(curoffset, kp, pp,
 					be16_to_cpu(block->bb_numrecs));
 		}
@@ -98,7 +98,7 @@ bmap(
 			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 			nextents = be16_to_cpu(block->bb_numrecs);
 			xp = (xfs_bmbt_rec_t *)
-				XFS_BMBT_REC_ADDR(mp, block, 1);
+				xfs_bmbt_rec_addr(mp, block, 1);
 			for (ep = xp; ep < &xp[nextents] && n < nex; ep++) {
 				if (!bmap_one_extent(ep, &curoffset, eoffset,
 						&n, bep)) {
diff --git a/db/bmap_inflate.c b/db/bmap_inflate.c
index a3ad6ad3832..118d911a1db 100644
--- a/db/bmap_inflate.c
+++ b/db/bmap_inflate.c
@@ -282,7 +282,7 @@ iroot_size(
 	unsigned int		nr_this_level,
 	void			*priv)
 {
-	return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
 }
 
 static int
diff --git a/db/bmroot.c b/db/bmroot.c
index 246e390a8a3..7ef07da181e 100644
--- a/db/bmroot.c
+++ b/db/bmroot.c
@@ -89,7 +89,7 @@ bmroota_key_offset(
 	block = (xfs_bmdr_block_t *)((char *)obj + byteize(startoff));
 	ASSERT(dip->di_forkoff != 0 && (char *)block == XFS_DFORK_APTR(dip));
 	ASSERT(be16_to_cpu(block->bb_level) > 0);
-	kp = XFS_BMDR_KEY_ADDR(block, idx);
+	kp = xfs_bmdr_key_addr(block, idx);
 	return bitize((int)((char *)kp - (char *)block));
 }
 
@@ -127,7 +127,7 @@ bmroota_ptr_offset(
 	block = (xfs_bmdr_block_t *)((char *)obj + byteize(startoff));
 	ASSERT(dip->di_forkoff != 0 && (char *)block == XFS_DFORK_APTR(dip));
 	ASSERT(be16_to_cpu(block->bb_level) > 0);
-	pp = XFS_BMDR_PTR_ADDR(block, idx,
+	pp = xfs_bmdr_ptr_addr(block, idx,
 		libxfs_bmdr_maxrecs(XFS_DFORK_ASIZE(dip, mp), 0));
 	return bitize((int)((char *)pp - (char *)block));
 }
@@ -185,7 +185,7 @@ bmrootd_key_offset(
 	ASSERT(obj == iocur_top->data);
 	block = (xfs_bmdr_block_t *)((char *)obj + byteize(startoff));
 	ASSERT(be16_to_cpu(block->bb_level) > 0);
-	kp = XFS_BMDR_KEY_ADDR(block, idx);
+	kp = xfs_bmdr_key_addr(block, idx);
 	return bitize((int)((char *)kp - (char *)block));
 }
 
@@ -222,7 +222,7 @@ bmrootd_ptr_offset(
 	dip = obj;
 	block = (xfs_bmdr_block_t *)((char *)obj + byteize(startoff));
 	ASSERT(be16_to_cpu(block->bb_level) > 0);
-	pp = XFS_BMDR_PTR_ADDR(block, idx,
+	pp = xfs_bmdr_ptr_addr(block, idx,
 		libxfs_bmdr_maxrecs(XFS_DFORK_DSIZE(dip, mp), 0));
 	return bitize((int)((char *)pp - (char *)block));
 }
diff --git a/db/check.c b/db/check.c
index 7d0687a9db7..d1c86206c08 100644
--- a/db/check.c
+++ b/db/check.c
@@ -2366,13 +2366,13 @@ process_btinode(
 		return;
 	}
 	if (be16_to_cpu(dib->bb_level) == 0) {
-		xfs_bmbt_rec_t	*rp = XFS_BMDR_REC_ADDR(dib, 1);
+		xfs_bmbt_rec_t	*rp = xfs_bmdr_rec_addr(dib, 1);
 		process_bmbt_reclist(rp, be16_to_cpu(dib->bb_numrecs), type,
 							id, totd, blkmapp);
 		*nex += be16_to_cpu(dib->bb_numrecs);
 		return;
 	} else {
-		pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(
+		pp = xfs_bmdr_ptr_addr(dib, 1, libxfs_bmdr_maxrecs(
 				XFS_DFORK_SIZE(dip, mp, whichfork), 0));
 		for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++)
 			scan_lbtree(get_unaligned_be64(&pp[i]),
@@ -4422,7 +4422,7 @@ scanfunc_bmap(
 			error++;
 			return;
 		}
-		rp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		rp = xfs_bmbt_rec_addr(mp, block, 1);
 		*nex += be16_to_cpu(block->bb_numrecs);
 		process_bmbt_reclist(rp, be16_to_cpu(block->bb_numrecs), type, id, totd,
 			blkmapp);
@@ -4438,7 +4438,7 @@ scanfunc_bmap(
 		error++;
 		return;
 	}
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[0]);
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[0]);
 	for (i = 0; i < be16_to_cpu(block->bb_numrecs); i++)
 		scan_lbtree(be64_to_cpu(pp[i]), level, scanfunc_bmap, type, id,
 					totd, toti, nex, blkmapp, 0, btype);
diff --git a/db/frag.c b/db/frag.c
index 4efc6ad07f8..1165e824a37 100644
--- a/db/frag.c
+++ b/db/frag.c
@@ -243,11 +243,11 @@ process_btinode(
 
 	dib = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	if (be16_to_cpu(dib->bb_level) == 0) {
-		xfs_bmbt_rec_t		*rp = XFS_BMDR_REC_ADDR(dib, 1);
+		xfs_bmbt_rec_t		*rp = xfs_bmdr_rec_addr(dib, 1);
 		process_bmbt_reclist(rp, be16_to_cpu(dib->bb_numrecs), extmapp);
 		return;
 	}
-	pp = XFS_BMDR_PTR_ADDR(dib, 1,
+	pp = xfs_bmdr_ptr_addr(dib, 1,
 		libxfs_bmdr_maxrecs(XFS_DFORK_SIZE(dip, mp, whichfork), 0));
 	for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++)
 		scan_lbtree(get_unaligned_be64(&pp[i]),
@@ -437,7 +437,7 @@ scanfunc_bmap(
 				   nrecs, typtab[btype].name);
 			return;
 		}
-		rp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		rp = xfs_bmbt_rec_addr(mp, block, 1);
 		process_bmbt_reclist(rp, nrecs, extmapp);
 		return;
 	}
@@ -447,7 +447,7 @@ scanfunc_bmap(
 			   nrecs, typtab[btype].name);
 		return;
 	}
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[0]);
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[0]);
 	for (i = 0; i < nrecs; i++)
 		scan_lbtree(be64_to_cpu(pp[i]), level, scanfunc_bmap, extmapp,
 									btype);
diff --git a/db/metadump.c b/db/metadump.c
index be4cc01ff26..ccf7b89ccd5 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -250,8 +250,8 @@ zero_btree_node(
 		if (nrecs > mp->m_bmap_dmxr[1])
 			return;
 
-		bkp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-		bpp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bkp = xfs_bmbt_key_addr(mp, block, 1);
+		bpp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
 		zp1 = (char *)&bkp[nrecs];
 		zp2 = (char *)&bpp[nrecs];
 		key_end = (char *)bpp;
@@ -316,7 +316,7 @@ zero_btree_leaf(
 		if (nrecs > mp->m_bmap_dmxr[0])
 			return;
 
-		brp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		brp = xfs_bmbt_rec_addr(mp, block, 1);
 		zp = (char *)&brp[nrecs];
 		break;
 	case TYP_INOBT:
@@ -2156,7 +2156,7 @@ scanfunc_bmap(
 					typtab[btype].name, agno, agbno);
 			return 1;
 		}
-		return process_bmbt_reclist(XFS_BMBT_REC_ADDR(mp, block, 1),
+		return process_bmbt_reclist(xfs_bmbt_rec_addr(mp, block, 1),
 					    nrecs, sbm->typ, sbm->is_meta);
 	}
 
@@ -2166,7 +2166,7 @@ scanfunc_bmap(
 					nrecs, typtab[btype].name, agno, agbno);
 		return 1;
 	}
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
 	for (i = 0; i < nrecs; i++) {
 		xfs_agnumber_t	ag;
 		xfs_agblock_t	bno;
@@ -2229,7 +2229,7 @@ process_btinode(
 	}
 
 	if (level == 0) {
-		return process_bmbt_reclist(XFS_BMDR_REC_ADDR(dib, 1),
+		return process_bmbt_reclist(xfs_bmdr_rec_addr(dib, 1),
 					    nrecs, itype, is_meta);
 	}
 
@@ -2242,13 +2242,13 @@ process_btinode(
 		return 1;
 	}
 
-	pp = XFS_BMDR_PTR_ADDR(dib, 1, maxrecs);
+	pp = xfs_bmdr_ptr_addr(dib, 1, maxrecs);
 
 	if (metadump.zero_stale_data) {
 		char	*top;
 
 		/* Unused btree key space */
-		top = (char*)XFS_BMDR_KEY_ADDR(dib, nrecs + 1);
+		top = (char*)xfs_bmdr_key_addr(dib, nrecs + 1);
 		memset(top, 0, (char*)pp - top);
 
 		/* Unused btree ptr space */
diff --git a/libxfs/xfs_attr_leaf.c b/libxfs/xfs_attr_leaf.c
index 14020c09146..28bb72f7a8a 100644
--- a/libxfs/xfs_attr_leaf.c
+++ b/libxfs/xfs_attr_leaf.c
@@ -669,7 +669,7 @@ xfs_attr_shortform_bytesfit(
 		 */
 		if (!dp->i_forkoff && dp->i_df.if_bytes >
 		    xfs_default_attroffset(dp))
-			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+			dsize = xfs_bmdr_space_calc(MINDBTPTRS);
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		/*
@@ -683,7 +683,7 @@ xfs_attr_shortform_bytesfit(
 				return 0;
 			return dp->i_forkoff;
 		}
-		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+		dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot);
 		break;
 	}
 
@@ -691,11 +691,11 @@ xfs_attr_shortform_bytesfit(
 	 * A data fork btree root must have space for at least
 	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
 	 */
-	minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+	minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS));
 	minforkoff = roundup(minforkoff, 8) >> 3;
 
 	/* attr fork btree root can have at least this many key/ptr pairs */
-	maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
 	maxforkoff = maxforkoff >> 3;	/* rounded down */
 
 	if (offset >= maxforkoff)
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index c69cb5c66df..d7cbef76067 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -73,9 +73,9 @@ xfs_bmap_compute_maxlevels(
 	maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
 				whichfork);
 	if (whichfork == XFS_DATA_FORK)
-		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+		sz = xfs_bmdr_space_calc(MINDBTPTRS);
 	else
-		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+		sz = xfs_bmdr_space_calc(MINABTPTRS);
 
 	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
@@ -96,8 +96,8 @@ xfs_bmap_compute_attr_offset(
 	struct xfs_mount	*mp)
 {
 	if (mp->m_sb.sb_inodesize == 256)
-		return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+		return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
+	return xfs_bmdr_space_calc(6 * MINABTPTRS);
 }
 
 STATIC int				/* error */
@@ -270,7 +270,7 @@ xfs_check_block(
 	prevp = NULL;
 	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
 		dmxr = mp->m_bmap_dmxr[0];
-		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+		keyp = xfs_bmbt_key_addr(mp, block, i);
 
 		if (prevp) {
 			ASSERT(be64_to_cpu(prevp->br_startoff) <
@@ -282,15 +282,15 @@ xfs_check_block(
 		 * Compare the block numbers to see if there are dups.
 		 */
 		if (root)
-			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+			pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz);
 		else
-			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+			pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr);
 
 		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
 			if (root)
-				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+				thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz);
 			else
-				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+				thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr);
 			if (*thispa == *pp) {
 				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
 					__func__, j, i,
@@ -345,7 +345,7 @@ xfs_bmap_check_leaf_extents(
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
 	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 
 	ASSERT(bno != NULLFSBLOCK);
@@ -380,7 +380,7 @@ xfs_bmap_check_leaf_extents(
 		 */
 
 		xfs_check_block(block, mp, 0, 0);
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
 		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
 			xfs_btree_mark_sick(cur);
@@ -420,14 +420,14 @@ xfs_bmap_check_leaf_extents(
 		 * conform with the first entry in this one.
 		 */
 
-		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		ep = xfs_bmbt_rec_addr(mp, block, 1);
 		if (i) {
 			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
 			       xfs_bmbt_disk_get_blockcount(&last) <=
 			       xfs_bmbt_disk_get_startoff(ep));
 		}
 		for (j = 1; j < num_recs; j++) {
-			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			nextp = xfs_bmbt_rec_addr(mp, block, j + 1);
 			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
 			       xfs_bmbt_disk_get_blockcount(ep) <=
 			       xfs_bmbt_disk_get_startoff(nextp));
@@ -562,7 +562,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
 	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
 
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
 #ifdef DEBUG
 	if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) {
@@ -690,7 +690,7 @@ xfs_bmap_extents_to_btree(
 	for_each_xfs_iext(ifp, &icur, &rec) {
 		if (isnullstartblock(rec.br_startblock))
 			continue;
-		arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+		arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt);
 		xfs_bmbt_disk_set_all(arp, &rec);
 		cnt++;
 	}
@@ -700,10 +700,10 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the root key and pointer.
 	 */
-	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp = xfs_bmbt_key_addr(mp, block, 1);
+	arp = xfs_bmbt_rec_addr(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
 						be16_to_cpu(block->bb_level)));
 	*pp = cpu_to_be64(args.fsbno);
 
@@ -872,7 +872,7 @@ xfs_bmap_add_attrfork_btree(
 
 	mp = ip->i_mount;
 
-	if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
+	if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip))
 		*flags |= XFS_ILOG_DBROOT;
 	else {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -1139,7 +1139,7 @@ xfs_iread_bmbt_block(
 	}
 
 	/* Copy records into the incore cache. */
-	frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+	frp = xfs_bmbt_rec_addr(mp, block, 1);
 	for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
 		struct xfs_bmbt_irec	new;
 		xfs_failaddr_t		fa;
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index 160f7b08ffd..be4979894a0 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -47,10 +47,10 @@ xfs_bmdr_to_bmbt(
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
 	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	fkp = xfs_bmdr_key_addr(dblock, 1);
+	tkp = xfs_bmbt_key_addr(mp, rblock, 1);
+	fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
+	tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -150,10 +150,10 @@ xfs_bmbt_to_bmdr(
 	dblock->bb_level = rblock->bb_level;
 	dblock->bb_numrecs = rblock->bb_numrecs;
 	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	fkp = xfs_bmbt_key_addr(mp, rblock, 1);
+	tkp = xfs_bmdr_key_addr(dblock, 1);
+	fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+	tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -670,7 +670,7 @@ xfs_bmbt_maxrecs(
 	int			blocklen,
 	int			leaf)
 {
-	blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+	blocklen -= xfs_bmbt_block_len(mp);
 	return xfs_bmbt_block_maxrecs(blocklen, leaf);
 }
 
diff --git a/libxfs/xfs_bmap_btree.h b/libxfs/xfs_bmap_btree.h
index 151b8491f60..62fbc4f7c2c 100644
--- a/libxfs/xfs_bmap_btree.h
+++ b/libxfs/xfs_bmap_btree.h
@@ -13,70 +13,6 @@ struct xfs_inode;
 struct xfs_trans;
 struct xbtree_ifakeroot;
 
-/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_BMBT_BLOCK_LEN(mp) \
-	(xfs_has_crc(((mp))) ? \
-		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
-
-#define XFS_BMBT_REC_ADDR(mp, block, index) \
-	((xfs_bmbt_rec_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
-
-#define XFS_BMBT_KEY_ADDR(mp, block, index) \
-	((xfs_bmbt_key_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
-
-#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
-	((xfs_bmbt_ptr_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
-
-#define XFS_BMDR_REC_ADDR(block, index) \
-	((xfs_bmdr_rec_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-	         ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
-
-#define XFS_BMDR_KEY_ADDR(block, index) \
-	((xfs_bmdr_key_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-		 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
-
-#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
-	((xfs_bmdr_ptr_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-		 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
-		 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
-
-/*
- * These are to be used when we know the size of the block and
- * we don't have a cursor.
- */
-#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
-	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-
-#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
-	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
-	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-
-#define XFS_BMAP_BROOT_SPACE(mp, bb) \
-	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
-#define XFS_BMDR_SPACE_CALC(nrecs) \
-	(int)(sizeof(xfs_bmdr_block_t) + \
-	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BMDR_SPACE(bb) \
-	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
-
 /*
  * Maximum number of bmap btree levels.
  */
@@ -120,4 +56,144 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
 int __init xfs_bmbt_init_cur_cache(void);
 void xfs_bmbt_destroy_cur_cache(void);
 
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+static inline size_t
+xfs_bmbt_block_len(struct xfs_mount *mp)
+{
+	return xfs_has_crc(mp) ?
+			XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN;
+}
+
+/* Addresses of key, pointers, and records within an incore bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmbt_rec_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmbt_key_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key *));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmbt_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/* Addresses of key, pointers, and records within an ondisk bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmdr_rec_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmdr_key_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmdr_ptr_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		i,
+	unsigned int		sz)
+{
+	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_bmap_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		nrecs)
+{
+	return xfs_bmbt_block_len(mp) + \
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_bmap_broot_space(
+	struct xfs_mount	*mp,
+	struct xfs_bmdr_block	*bb)
+{
+	return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_bmdr_space_calc(unsigned int nrecs)
+{
+	return sizeof(struct xfs_bmdr_block) +
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
+{
+	return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
+}
+
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 46da5edfb11..2d67e35c7e3 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -179,7 +179,7 @@ xfs_iformat_btree(
 
 	ifp = xfs_ifork_ptr(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+	size = xfs_bmap_broot_space(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
 	level = be16_to_cpu(dfp->bb_level);
 
@@ -192,7 +192,7 @@ xfs_iformat_btree(
 	 */
 	if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
 		     nrecs == 0 ||
-		     XFS_BMDR_SPACE_CALC(nrecs) >
+		     xfs_bmdr_space_calc(nrecs) >
 					XFS_DFORK_SIZE(dip, mp, whichfork) ||
 		     ifp->if_nextents > ip->i_nblocks) ||
 		     level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
@@ -403,7 +403,7 @@ xfs_iroot_realloc(
 		 * allocate it now and get out.
 		 */
 		if (ifp->if_broot_bytes == 0) {
-			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
 			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
@@ -417,15 +417,15 @@ xfs_iroot_realloc(
 		 */
 		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
 		new_max = cur_max + rec_diff;
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     (int)new_size);
 		ifp->if_broot_bytes = (int)new_size;
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			xfs_inode_fork_size(ip, whichfork));
 		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
 		return;
@@ -441,7 +441,7 @@ xfs_iroot_realloc(
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	if (new_max > 0)
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	else
 		new_size = 0;
 	if (new_size > 0) {
@@ -450,7 +450,7 @@ xfs_iroot_realloc(
 		 * First copy over the btree block header.
 		 */
 		memcpy(new_broot, ifp->if_broot,
-			XFS_BMBT_BLOCK_LEN(ip->i_mount));
+			xfs_bmbt_block_len(ip->i_mount));
 	} else {
 		new_broot = NULL;
 	}
@@ -462,16 +462,16 @@ xfs_iroot_realloc(
 		/*
 		 * First copy the records.
 		 */
-		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+		op = (char *)xfs_bmbt_rec_addr(mp, ifp->if_broot, 1);
+		np = (char *)xfs_bmbt_rec_addr(mp, new_broot, 1);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
 
 		/*
 		 * Then copy the pointers.
 		 */
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+		np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
 						     (int)new_size);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
 	}
@@ -479,7 +479,7 @@ xfs_iroot_realloc(
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
 	if (ifp->if_broot)
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			xfs_inode_fork_size(ip, whichfork));
 	return;
 }
@@ -652,7 +652,7 @@ xfs_iflush_fork(
 		if ((iip->ili_fields & brootflag[whichfork]) &&
 		    (ifp->if_broot_bytes > 0)) {
 			ASSERT(ifp->if_broot != NULL);
-			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			        xfs_inode_fork_size(ip, whichfork));
 			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
 				(xfs_bmdr_block_t *)cp,
diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index 74f46539179..800a2f9ecb8 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -128,7 +128,7 @@ xfs_calc_inode_res(
 		(4 * sizeof(struct xlog_op_header) +
 		 sizeof(struct xfs_inode_log_format) +
 		 mp->m_sb.sb_inodesize +
-		 2 * XFS_BMBT_BLOCK_LEN(mp));
+		 2 * xfs_bmbt_block_len(mp));
 }
 
 /*
diff --git a/repair/bmap_repair.c b/repair/bmap_repair.c
index 7705980621c..a8cbff67ceb 100644
--- a/repair/bmap_repair.c
+++ b/repair/bmap_repair.c
@@ -285,7 +285,7 @@ xrep_bmap_iroot_size(
 {
 	ASSERT(level > 0);
 
-	return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
 }
 
 /* Update the inode counters. */
diff --git a/repair/dinode.c b/repair/dinode.c
index 5a57069c29e..31b3cd74139 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -842,19 +842,19 @@ _("bad numrecs 0 in inode %" PRIu64 " bmap btree root block\n"),
 	/*
 	 * use bmdr/dfork_dsize since the root block is in the data fork
 	 */
-	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_SIZE(dip, mp, whichfork)) {
+	if (xfs_bmdr_space_calc(numrecs) > XFS_DFORK_SIZE(dip, mp, whichfork)) {
 		do_warn(
-	_("indicated size of %s btree root (%d bytes) greater than space in "
+	_("indicated size of %s btree root (%zu bytes) greater than space in "
 	  "inode %" PRIu64 " %s fork\n"),
-			forkname, XFS_BMDR_SPACE_CALC(numrecs), lino, forkname);
+			forkname, xfs_bmdr_space_calc(numrecs), lino, forkname);
 		return(1);
 	}
 
 	init_bm_cursor(&cursor, level + 1);
 
-	pp = XFS_BMDR_PTR_ADDR(dib, 1,
+	pp = xfs_bmdr_ptr_addr(dib, 1,
 		libxfs_bmdr_maxrecs(XFS_DFORK_SIZE(dip, mp, whichfork), 0));
-	pkey = XFS_BMDR_KEY_ADDR(dib, 1);
+	pkey = xfs_bmdr_key_addr(dib, 1);
 	last_key = NULLFILEOFF;
 
 	for (i = 0; i < numrecs; i++)  {
diff --git a/repair/prefetch.c b/repair/prefetch.c
index 58fc2dac1a8..5caa41beb74 100644
--- a/repair/prefetch.c
+++ b/repair/prefetch.c
@@ -328,13 +328,13 @@ pf_scanfunc_bmap(
 		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
 			return 0;
 		return pf_read_bmbt_reclist(args,
-			XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
+			xfs_bmbt_rec_addr(mp, block, 1), numrecs);
 	}
 
 	if (numrecs > mp->m_bmap_dmxr[1])
 		return 0;
 
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
 
 	for (i = 0; i < numrecs; i++) {
 		dbno = get_unaligned_be64(&pp[i]);
@@ -372,11 +372,11 @@ pf_read_btinode(
 	/*
 	 * use bmdr/dfork_dsize since the root block is in the data fork
 	 */
-	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
+	if (xfs_bmdr_space_calc(numrecs) > XFS_DFORK_DSIZE(dino, mp))
 		return;
 
 	dsize = XFS_DFORK_DSIZE(dino, mp);
-	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));
+	pp = xfs_bmdr_ptr_addr(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));
 
 	for (i = 0; i < numrecs; i++) {
 		dbno = get_unaligned_be64(&pp[i]);
diff --git a/repair/scan.c b/repair/scan.c
index 3857593b165..1cd4d0ad2e1 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -445,7 +445,7 @@ _("inode %" PRIu64 " bad # of bmap records (%" PRIu64 ", min - %u, max - %u)\n")
 					mp->m_bmap_dmxr[0]);
 			return(1);
 		}
-		rp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		rp = xfs_bmbt_rec_addr(mp, block, 1);
 		*nex += numrecs;
 		/*
 		 * XXX - if we were going to fix up the btree record,
@@ -496,8 +496,8 @@ _("inode %" PRIu64 " bad # of bmap records (%" PRIu64 ", min - %u, max - %u)\n")
 			ino, numrecs, mp->m_bmap_dmnr[1], mp->m_bmap_dmxr[1]);
 		return(1);
 	}
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-	pkey = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
+	pkey = xfs_bmbt_key_addr(mp, block, 1);
 
 	last_key = NULLFILEOFF;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
@ 2023-12-27 13:01   ` Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
                     ` (11 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:01 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor the code that allocates and freese the incore inode fork btree
roots.  This will help us disentangle some of the weird logic when we're
creating and tearing down inode-based btrees.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_fork.c |   53 +++++++++++++++++++++++++++++++++--------------
 libxfs/xfs_inode_fork.h |    3 +++
 2 files changed, 40 insertions(+), 16 deletions(-)


diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 2d67e35c7e3..05f7ada0ae3 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -205,8 +205,7 @@ xfs_iformat_btree(
 		return -EFSCORRUPTED;
 	}
 
-	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_NOFS);
+	xfs_iroot_alloc(ip, whichfork, size);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -356,6 +355,32 @@ xfs_iformat_attr_fork(
 	return error;
 }
 
+/* Allocate a new incore ifork btree root. */
+void
+xfs_iroot_alloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	size_t			bytes)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	ifp->if_broot = kmem_alloc(bytes, KM_NOFS);
+	ifp->if_broot_bytes = bytes;
+}
+
+/* Free all the memory and state associated with an incore ifork btree root. */
+void
+xfs_iroot_free(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	ifp->if_broot_bytes = 0;
+	kmem_free(ifp->if_broot);
+	ifp->if_broot = NULL;
+}
+
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -404,8 +429,7 @@ xfs_iroot_realloc(
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
-			ifp->if_broot_bytes = (int)new_size;
+			xfs_iroot_alloc(ip, whichfork, new_size);
 			return;
 		}
 
@@ -444,17 +468,15 @@ xfs_iroot_realloc(
 		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	else
 		new_size = 0;
-	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_NOFS);
-		/*
-		 * First copy over the btree block header.
-		 */
-		memcpy(new_broot, ifp->if_broot,
-			xfs_bmbt_block_len(ip->i_mount));
-	} else {
-		new_broot = NULL;
+	if (new_size == 0) {
+		xfs_iroot_free(ip, whichfork);
+		return;
 	}
 
+	/* First copy over the btree block header. */
+	new_broot = kmem_alloc(new_size, KM_NOFS);
+	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
+
 	/*
 	 * Only copy the records and pointers if there are any.
 	 */
@@ -478,9 +500,8 @@ xfs_iroot_realloc(
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
-	if (ifp->if_broot)
-		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-			xfs_inode_fork_size(ip, whichfork));
+	ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
+		xfs_inode_fork_size(ip, whichfork));
 	return;
 }
 
diff --git a/libxfs/xfs_inode_fork.h b/libxfs/xfs_inode_fork.h
index ebeb925be09..18ea2d27777 100644
--- a/libxfs/xfs_inode_fork.h
+++ b/libxfs/xfs_inode_fork.h
@@ -172,6 +172,9 @@ void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
 void		xfs_idestroy_fork(struct xfs_ifork *ifp);
 void		xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
 				int whichfork);
+void		xfs_iroot_alloc(struct xfs_inode *ip, int whichfork,
+				size_t bytes);
+void		xfs_iroot_free(struct xfs_inode *ip, int whichfork);
 void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/14] xfs: refactor creation of bmap btree roots
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
@ 2023-12-27 13:01   ` Darrick J. Wong
  2023-12-27 13:02   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
                     ` (10 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:01 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we've created inode fork helpers to allocate and free btree
roots, create a new bmap btree helper to create a new bmbt root, and
refactor the extents <-> btree conversion functions to use our new
helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap.c       |   17 ++++-------------
 libxfs/xfs_bmap_btree.c |   16 ++++++++++++++++
 libxfs/xfs_bmap_btree.h |    2 ++
 3 files changed, 22 insertions(+), 13 deletions(-)


diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index d7cbef76067..5935f87833b 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -591,7 +591,7 @@ xfs_bmap_btree_to_extents(
 	xfs_trans_binval(tp, cbp);
 	if (cur->bc_levels[0].bp == cbp)
 		cur->bc_levels[0].bp = NULL;
-	xfs_iroot_realloc(ip, -1, whichfork);
+	xfs_iroot_free(ip, whichfork);
 	ASSERT(ifp->if_broot == NULL);
 	ifp->if_format = XFS_DINODE_FMT_EXTENTS;
 	*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -631,20 +631,10 @@ xfs_bmap_extents_to_btree(
 	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
 
-	/*
-	 * Make space in the inode incore. This needs to be undone if we fail
-	 * to expand the root.
-	 */
-	xfs_iroot_realloc(ip, 1, whichfork);
-
-	/*
-	 * Fill in the root.
-	 */
-	block = ifp->if_broot;
-	xfs_btree_init_block(mp, block, &xfs_bmbt_ops, 1, 1, ip->i_ino);
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
+	xfs_bmbt_iroot_alloc(ip, whichfork);
 	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 	cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
 	/*
@@ -700,6 +690,7 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the root key and pointer.
 	 */
+	block = ifp->if_broot;
 	kp = xfs_bmbt_key_addr(mp, block, 1);
 	arp = xfs_bmbt_rec_addr(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
@@ -721,7 +712,7 @@ xfs_bmap_extents_to_btree(
 out_unreserve_dquot:
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 out_root_realloc:
-	xfs_iroot_realloc(ip, -1, whichfork);
+	xfs_iroot_free(ip, whichfork);
 	ifp->if_format = XFS_DINODE_FMT_EXTENTS;
 	ASSERT(ifp->if_broot == NULL);
 	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index be4979894a0..1dd8d12af8f 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -778,3 +778,19 @@ xfs_bmbt_destroy_cur_cache(void)
 	kmem_cache_destroy(xfs_bmbt_cur_cache);
 	xfs_bmbt_cur_cache = NULL;
 }
+
+/* Create an incore bmbt btree root block. */
+void
+xfs_bmbt_iroot_alloc(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	xfs_iroot_alloc(ip, whichfork,
+			xfs_bmap_broot_space_calc(ip->i_mount, 1));
+
+	/* Fill in the root. */
+	xfs_btree_init_block(ip->i_mount, ifp->if_broot, &xfs_bmbt_ops, 1, 1,
+			ip->i_ino);
+}
diff --git a/libxfs/xfs_bmap_btree.h b/libxfs/xfs_bmap_btree.h
index 62fbc4f7c2c..3fe9c4f7f1a 100644
--- a/libxfs/xfs_bmap_btree.h
+++ b/libxfs/xfs_bmap_btree.h
@@ -196,4 +196,6 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
 	return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
 }
 
+void xfs_bmbt_iroot_alloc(struct xfs_inode *ip, int whichfork);
+
 #endif	/* __XFS_BMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:01   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
@ 2023-12-27 13:02   ` Darrick J. Wong
  2023-12-27 13:02   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
                     ` (9 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:02 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

While refactoring code, I noticed that when xfs_iroot_realloc tries to
shrink a bmbt root block, it allocates a smaller new block and then
copies "records" and pointers to the new block.  However, bmbt root
blocks cannot ever be leaves, which means that it's not technically
correct to copy records.  We /should/ be copying keys.

Note that this has never resulted in actual memory corruption because
sizeof(bmbt_rec) == (sizeof(bmbt_key) + sizeof(bmbt_ptr)).  However,
this will no longer be true when we start adding realtime rmap stuff,
so fix this now.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_fork.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)


diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 05f7ada0ae3..765e174999d 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -478,15 +478,15 @@ xfs_iroot_realloc(
 	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
 
 	/*
-	 * Only copy the records and pointers if there are any.
+	 * Only copy the keys and pointers if there are any.
 	 */
 	if (new_max > 0) {
 		/*
-		 * First copy the records.
+		 * First copy the keys.
 		 */
-		op = (char *)xfs_bmbt_rec_addr(mp, ifp->if_broot, 1);
-		np = (char *)xfs_bmbt_rec_addr(mp, new_broot, 1);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+		op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
+		np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
 
 		/*
 		 * Then copy the pointers.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:02   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
@ 2023-12-27 13:02   ` Darrick J. Wong
  2023-12-27 13:02   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
                     ` (8 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:02 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we change the size of the memory buffer holding an inode fork
btree root block, we have to copy the contents over.  Refactor all this
into a single function that handles both, in preparation for making
xfs_iroot_realloc more generic.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_fork.c |   99 +++++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 42 deletions(-)


diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 765e174999d..edf6dff28b4 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -381,6 +381,50 @@ xfs_iroot_free(
 	ifp->if_broot = NULL;
 }
 
+/* Move the bmap btree root from one incore buffer to another. */
+static void
+xfs_ifork_move_broot(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs) {
+		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys that come after it.
+	 */
+	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
+
+	/* Now copy the keys, which come right after the header. */
+	if (numrecs) {
+		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
+		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
+	}
+}
+
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -407,12 +451,11 @@ xfs_iroot_realloc(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	int			cur_max;
-	struct xfs_ifork	*ifp;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*new_broot;
 	int			new_max;
 	size_t			new_size;
-	char			*np;
-	char			*op;
+	size_t			old_size = ifp->if_broot_bytes;
 
 	/*
 	 * Handle the degenerate case quietly.
@@ -421,13 +464,12 @@ xfs_iroot_realloc(
 		return;
 	}
 
-	ifp = xfs_ifork_ptr(ip, whichfork);
 	if (rec_diff > 0) {
 		/*
 		 * If there wasn't any memory allocated before, just
 		 * allocate it now and get out.
 		 */
-		if (ifp->if_broot_bytes == 0) {
+		if (old_size == 0) {
 			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
 			xfs_iroot_alloc(ip, whichfork, new_size);
 			return;
@@ -436,22 +478,16 @@ xfs_iroot_realloc(
 		/*
 		 * If there is already an existing if_broot, then we need
 		 * to realloc() it and shift the pointers to their new
-		 * location.  The records don't change location because
-		 * they are kept butted up against the btree block header.
+		 * location.
 		 */
-		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+		cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 		new_max = cur_max + rec_diff;
 		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
-		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     (int)new_size);
-		ifp->if_broot_bytes = (int)new_size;
-		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-			xfs_inode_fork_size(ip, whichfork));
-		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+		ifp->if_broot_bytes = new_size;
+		xfs_ifork_move_broot(ip, whichfork, ifp->if_broot, new_size,
+				ifp->if_broot, old_size, cur_max);
 		return;
 	}
 
@@ -460,8 +496,8 @@ xfs_iroot_realloc(
 	 * if_broot buffer.  It must already exist.  If we go to zero
 	 * records, just get rid of the root and clear the status bit.
 	 */
-	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+	ASSERT((ifp->if_broot != NULL) && (old_size > 0));
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	if (new_max > 0)
@@ -473,35 +509,14 @@ xfs_iroot_realloc(
 		return;
 	}
 
-	/* First copy over the btree block header. */
+	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
-	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
+	xfs_ifork_move_broot(ip, whichfork, new_broot, new_size, ifp->if_broot,
+			old_size, new_max);
 
-	/*
-	 * Only copy the keys and pointers if there are any.
-	 */
-	if (new_max > 0) {
-		/*
-		 * First copy the keys.
-		 */
-		op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
-		np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
-
-		/*
-		 * Then copy the pointers.
-		 */
-		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
-						     (int)new_size);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
-	}
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
-	ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-		xfs_inode_fork_size(ip, whichfork));
 	return;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:02   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
@ 2023-12-27 13:02   ` Darrick J. Wong
  2023-12-27 13:02   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
                     ` (7 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:02 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The bmap btree cannot ever have zero records in an incore btree block.
If the number of records drops to zero, that means we're converting the
fork to extents format and are trying to remove the tree.  This logic
won't hold for the future realtime rmap btree, so move the logic into
the bmbt code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap_btree.h |    7 +++++++
 libxfs/xfs_inode_fork.c |    6 ++----
 2 files changed, 9 insertions(+), 4 deletions(-)


diff --git a/libxfs/xfs_bmap_btree.h b/libxfs/xfs_bmap_btree.h
index 3fe9c4f7f1a..5a3bae94deb 100644
--- a/libxfs/xfs_bmap_btree.h
+++ b/libxfs/xfs_bmap_btree.h
@@ -162,6 +162,13 @@ xfs_bmap_broot_space_calc(
 	struct xfs_mount	*mp,
 	unsigned int		nrecs)
 {
+	/*
+	 * If the bmbt root block is empty, we should be converting the fork
+	 * to extents format.  Hence, the size is zero.
+	 */
+	if (nrecs == 0)
+		return 0;
+
 	return xfs_bmbt_block_len(mp) + \
 	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
 }
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index edf6dff28b4..81f054cd212 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -500,10 +500,8 @@ xfs_iroot_realloc(
 	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	if (new_max > 0)
-		new_size = xfs_bmap_broot_space_calc(mp, new_max);
-	else
-		new_size = 0;
+
+	new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	if (new_size == 0) {
 		xfs_iroot_free(ip, whichfork);
 		return;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:02   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
@ 2023-12-27 13:02   ` Darrick J. Wong
  2023-12-27 13:03   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
                     ` (6 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:02 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rearrange the innards of xfs_iroot_realloc so that we can reduce
duplicated code prior to genericizing the function.  No functional
changes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_fork.c |   49 +++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)


diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 81f054cd212..d070e0524b9 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -445,44 +445,46 @@ xfs_ifork_move_broot(
  */
 void
 xfs_iroot_realloc(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	int			rec_diff,
 	int			whichfork)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	int			cur_max;
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*new_broot;
-	int			new_max;
 	size_t			new_size;
 	size_t			old_size = ifp->if_broot_bytes;
+	int			cur_max;
+	int			new_max;
+
+	/* Handle degenerate cases. */
+	if (rec_diff == 0)
+		return;
 
 	/*
-	 * Handle the degenerate case quietly.
+	 * If there wasn't any memory allocated before, just allocate it now
+	 * and get out.
 	 */
-	if (rec_diff == 0) {
+	if (old_size == 0) {
+		ASSERT(rec_diff > 0);
+
+		new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
+		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
+	/* Compute the new and old record count and space requirements. */
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	new_size = xfs_bmap_broot_space_calc(mp, new_max);
+
 	if (rec_diff > 0) {
-		/*
-		 * If there wasn't any memory allocated before, just
-		 * allocate it now and get out.
-		 */
-		if (old_size == 0) {
-			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
-			xfs_iroot_alloc(ip, whichfork, new_size);
-			return;
-		}
-
 		/*
 		 * If there is already an existing if_broot, then we need
 		 * to realloc() it and shift the pointers to their new
 		 * location.
 		 */
-		cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
-		new_max = cur_max + rec_diff;
-		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
@@ -494,14 +496,8 @@ xfs_iroot_realloc(
 	/*
 	 * rec_diff is less than 0.  In this case, we are shrinking the
 	 * if_broot buffer.  It must already exist.  If we go to zero
-	 * records, just get rid of the root and clear the status bit.
+	 * bytes, just get rid of the root and clear the status bit.
 	 */
-	ASSERT((ifp->if_broot != NULL) && (old_size > 0));
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
-	new_max = cur_max + rec_diff;
-	ASSERT(new_max >= 0);
-
-	new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	if (new_size == 0) {
 		xfs_iroot_free(ip, whichfork);
 		return;
@@ -514,8 +510,7 @@ xfs_iroot_realloc(
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
-	ifp->if_broot_bytes = (int)new_size;
-	return;
+	ifp->if_broot_bytes = new_size;
 }
 
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/14] xfs: standardize the btree maxrecs function parameters
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:02   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
@ 2023-12-27 13:03   ` Darrick J. Wong
  2023-12-27 13:03   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
                     ` (5 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:03 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Standardize the parameters in xfs_{alloc,bm,ino,rmap,refcount}bt_maxrecs
so that we have consistent calling conventions.  This doesn't affect the
kernel that much, but enables us to clean up userspace a bit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/btheight.c               |   18 ++++--------------
 libxfs/xfs_alloc_btree.c    |    6 +++---
 libxfs/xfs_alloc_btree.h    |    3 ++-
 libxfs/xfs_bmap.c           |    2 +-
 libxfs/xfs_bmap_btree.c     |    6 +++---
 libxfs/xfs_bmap_btree.h     |    5 +++--
 libxfs/xfs_ialloc.c         |    4 ++--
 libxfs/xfs_ialloc_btree.c   |    6 +++---
 libxfs/xfs_ialloc_btree.h   |    3 ++-
 libxfs/xfs_inode_fork.c     |    2 +-
 libxfs/xfs_refcount_btree.c |    5 +++--
 libxfs/xfs_refcount_btree.h |    3 ++-
 libxfs/xfs_rmap_btree.c     |    9 +++++----
 libxfs/xfs_rmap_btree.h     |    3 ++-
 libxfs/xfs_sb.c             |   16 ++++++++--------
 repair/phase5.c             |   16 ++++++++--------
 16 files changed, 52 insertions(+), 55 deletions(-)


diff --git a/db/btheight.c b/db/btheight.c
index 0b421ab50a3..6643489c82c 100644
--- a/db/btheight.c
+++ b/db/btheight.c
@@ -12,21 +12,11 @@
 #include "input.h"
 #include "libfrog/convert.h"
 
-static int refc_maxrecs(struct xfs_mount *mp, int blocklen, int leaf)
-{
-	return libxfs_refcountbt_maxrecs(blocklen, leaf != 0);
-}
-
-static int rmap_maxrecs(struct xfs_mount *mp, int blocklen, int leaf)
-{
-	return libxfs_rmapbt_maxrecs(blocklen, leaf);
-}
-
 struct btmap {
 	const char	*tag;
 	unsigned int	(*maxlevels)(void);
-	int		(*maxrecs)(struct xfs_mount *mp, int blocklen,
-				   int leaf);
+	unsigned int	(*maxrecs)(struct xfs_mount *mp, unsigned int blocklen,
+				   bool leaf);
 } maps[] = {
 	{
 		.tag		= "bnobt",
@@ -56,12 +46,12 @@ struct btmap {
 	{
 		.tag		= "refcountbt",
 		.maxlevels	= libxfs_refcountbt_maxlevels_ondisk,
-		.maxrecs	= refc_maxrecs,
+		.maxrecs	= libxfs_refcountbt_maxrecs,
 	},
 	{
 		.tag		= "rmapbt",
 		.maxlevels	= libxfs_rmapbt_maxlevels_ondisk,
-		.maxrecs	= rmap_maxrecs,
+		.maxrecs	= libxfs_rmapbt_maxrecs,
 	},
 };
 
diff --git a/libxfs/xfs_alloc_btree.c b/libxfs/xfs_alloc_btree.c
index 93faa832e5b..a03a8776c21 100644
--- a/libxfs/xfs_alloc_btree.c
+++ b/libxfs/xfs_alloc_btree.c
@@ -609,11 +609,11 @@ xfs_allocbt_block_maxrecs(
 /*
  * Calculate number of records in an alloc btree block.
  */
-int
+unsigned int
 xfs_allocbt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
 	return xfs_allocbt_block_maxrecs(blocklen, leaf);
diff --git a/libxfs/xfs_alloc_btree.h b/libxfs/xfs_alloc_btree.h
index 45df893ef6b..f61f51d0bd7 100644
--- a/libxfs/xfs_alloc_btree.h
+++ b/libxfs/xfs_alloc_btree.h
@@ -53,7 +53,8 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
 struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag,
 		xfs_btnum_t btnum);
-extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 5935f87833b..7fefbb7d21c 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -560,7 +560,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
 	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1);
 
 	pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index 1dd8d12af8f..1e7b89e7730 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -664,11 +664,11 @@ xfs_bmbt_commit_staged_btree(
 /*
  * Calculate number of records in a bmap btree block.
  */
-int
+unsigned int
 xfs_bmbt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= xfs_bmbt_block_len(mp);
 	return xfs_bmbt_block_maxrecs(blocklen, leaf);
diff --git a/libxfs/xfs_bmap_btree.h b/libxfs/xfs_bmap_btree.h
index 5a3bae94deb..a9ddc9b42e6 100644
--- a/libxfs/xfs_bmap_btree.h
+++ b/libxfs/xfs_bmap_btree.h
@@ -35,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
 
 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
-extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 
 extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 				 int whichfork, xfs_ino_t new_owner,
@@ -150,7 +151,7 @@ xfs_bmap_broot_ptr_addr(
 	unsigned int		i,
 	unsigned int		sz)
 {
-	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0));
+	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false));
 }
 
 /*
diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c
index 19543f76994..8aae4b79c85 100644
--- a/libxfs/xfs_ialloc.c
+++ b/libxfs/xfs_ialloc.c
@@ -2914,8 +2914,8 @@ xfs_ialloc_setup_geometry(
 
 	/* Compute inode btree geometry. */
 	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
+	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
 	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
 	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
 
diff --git a/libxfs/xfs_ialloc_btree.c b/libxfs/xfs_ialloc_btree.c
index 4275244b15c..80d28d3fea5 100644
--- a/libxfs/xfs_ialloc_btree.c
+++ b/libxfs/xfs_ialloc_btree.c
@@ -558,11 +558,11 @@ xfs_inobt_block_maxrecs(
 /*
  * Calculate number of records in an inobt btree block.
  */
-int
+unsigned int
 xfs_inobt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
 	return xfs_inobt_block_maxrecs(blocklen, leaf);
diff --git a/libxfs/xfs_ialloc_btree.h b/libxfs/xfs_ialloc_btree.h
index 3262c3fe5eb..ed0f619fd33 100644
--- a/libxfs/xfs_ialloc_btree.h
+++ b/libxfs/xfs_ialloc_btree.h
@@ -50,7 +50,8 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
 		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
 struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
 		struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
-extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 
 /* ir_holemask to inode allocation bitmap conversion */
 uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index d070e0524b9..bb66028bff0 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -474,7 +474,7 @@ xfs_iroot_realloc(
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, false);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	new_size = xfs_bmap_broot_space_calc(mp, new_max);
diff --git a/libxfs/xfs_refcount_btree.c b/libxfs/xfs_refcount_btree.c
index ab8925051a9..1fbd250c1a8 100644
--- a/libxfs/xfs_refcount_btree.c
+++ b/libxfs/xfs_refcount_btree.c
@@ -433,9 +433,10 @@ xfs_refcountbt_block_maxrecs(
 /*
  * Calculate the number of records in a refcount btree block.
  */
-int
+unsigned int
 xfs_refcountbt_maxrecs(
-	int			blocklen,
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
 	bool			leaf)
 {
 	blocklen -= XFS_REFCOUNT_BLOCK_LEN;
diff --git a/libxfs/xfs_refcount_btree.h b/libxfs/xfs_refcount_btree.h
index d66b37259be..fe3c20d6779 100644
--- a/libxfs/xfs_refcount_btree.h
+++ b/libxfs/xfs_refcount_btree.h
@@ -50,7 +50,8 @@ extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
-extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
+unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
diff --git a/libxfs/xfs_rmap_btree.c b/libxfs/xfs_rmap_btree.c
index 7342623ed5e..23ea9cd992f 100644
--- a/libxfs/xfs_rmap_btree.c
+++ b/libxfs/xfs_rmap_btree.c
@@ -587,7 +587,7 @@ xfs_rmapbt_mem_verify(
 	}
 
 	return xfbtree_sblock_verify(bp,
-			xfs_rmapbt_maxrecs(xfo_to_b(1), level == 0));
+			xfs_rmapbt_maxrecs(mp, xfo_to_b(1), level == 0));
 }
 
 static void
@@ -715,10 +715,11 @@ xfs_rmapbt_block_maxrecs(
 /*
  * Calculate number of records in an rmap btree block.
  */
-int
+unsigned int
 xfs_rmapbt_maxrecs(
-	int			blocklen,
-	int			leaf)
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_RMAP_BLOCK_LEN;
 	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
diff --git a/libxfs/xfs_rmap_btree.h b/libxfs/xfs_rmap_btree.h
index 5d0454fd052..415fad8dad7 100644
--- a/libxfs/xfs_rmap_btree.h
+++ b/libxfs/xfs_rmap_btree.h
@@ -48,7 +48,8 @@ struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
 void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
-int xfs_rmapbt_maxrecs(int blocklen, int leaf);
+unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index d04a8e15331..aff2ab79f9b 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -1108,23 +1108,23 @@ xfs_sb_mount_common(
 	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
 	mp->m_rgblkmask = mask64_if_power2(sbp->sb_rgblocks);
 
-	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
-	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
 	mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
 
-	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
-	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
 	mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 
-	mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
-	mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
+	mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
-	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
-	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
+	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
 	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
 
diff --git a/repair/phase5.c b/repair/phase5.c
index 983f2169228..74594d53a87 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -644,21 +644,21 @@ phase5(xfs_mount_t *mp)
 
 #ifdef XR_BLD_FREE_TRACE
 	fprintf(stderr, "inobt level 1, maxrec = %d, minrec = %d\n",
-		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, 0),
-		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, 0) / 2);
+		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, false),
+		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, false) / 2);
 	fprintf(stderr, "inobt level 0 (leaf), maxrec = %d, minrec = %d\n",
-		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, 1),
-		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, 1) / 2);
+		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, true),
+		libxfs_inobt_maxrecs(mp, mp->m_sb.sb_blocksize, true) / 2);
 	fprintf(stderr, "xr inobt level 0 (leaf), maxrec = %d\n",
 		XR_INOBT_BLOCK_MAXRECS(mp, 0));
 	fprintf(stderr, "xr inobt level 1 (int), maxrec = %d\n",
 		XR_INOBT_BLOCK_MAXRECS(mp, 1));
 	fprintf(stderr, "bnobt level 1, maxrec = %d, minrec = %d\n",
-		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, 0),
-		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, 0) / 2);
+		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, false),
+		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, false) / 2);
 	fprintf(stderr, "bnobt level 0 (leaf), maxrec = %d, minrec = %d\n",
-		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, 1),
-		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, 1) / 2);
+		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, true),
+		libxfs_allocbt_maxrecs(mp, mp->m_sb.sb_blocksize, true) / 2);
 #endif
 	/*
 	 * make sure the root and realtime inodes show up allocated


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/14] xfs: generalize the btree root reallocation function
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:03   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
@ 2023-12-27 13:03   ` Darrick J. Wong
  2023-12-27 13:03   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
                     ` (4 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:03 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for storing realtime rmap btree roots in an inode fork,
make xfs_iroot_realloc take an ops structure that takes care of all the
btree-specific geometry pieces.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap_btree.c |   51 +++++++++++++++++++++++++++++
 libxfs/xfs_btree.c      |   22 ++++++++-----
 libxfs/xfs_btree.h      |    3 ++
 libxfs/xfs_inode_fork.c |   82 ++++++++++-------------------------------------
 libxfs/xfs_inode_fork.h |   23 +++++++++++++
 5 files changed, 107 insertions(+), 74 deletions(-)


diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index 1e7b89e7730..4156e23a2da 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -511,6 +511,56 @@ xfs_bmbt_keys_contiguous(
 				 be64_to_cpu(key2->bmbt.br_startoff));
 }
 
+/* Move the bmap btree root from one incore buffer to another. */
+static void
+xfs_bmbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs) {
+		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys that come after it.
+	 */
+	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
+
+	/* Now copy the keys, which come right after the header. */
+	if (numrecs) {
+		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
+		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_bmbt_iroot_ops = {
+	.maxrecs		= xfs_bmbt_maxrecs,
+	.size			= xfs_bmap_broot_space_calc,
+	.move			= xfs_bmbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_bmbt_ops = {
 	.rec_len		= sizeof(xfs_bmbt_rec_t),
 	.key_len		= sizeof(xfs_bmbt_key_t),
@@ -534,6 +584,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
 	.keys_contiguous	= xfs_bmbt_keys_contiguous,
+	.iroot_ops		= &xfs_bmbt_iroot_ops,
 };
 
 static struct xfs_btree_cur *
diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index 7cc6379a113..b6f73fcc6d6 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -3068,6 +3068,16 @@ xfs_btree_split(
 #define xfs_btree_split	__xfs_btree_split
 #endif /* __KERNEL__ */
 
+static inline void
+xfs_btree_iroot_realloc(
+	struct xfs_btree_cur		*cur,
+	int				rec_diff)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork,
+			cur->bc_ops->iroot_ops, rec_diff);
+}
 
 /*
  * Copy the old inode root contents into a real block and make the
@@ -3152,9 +3162,7 @@ xfs_btree_new_iroot(
 
 	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
 
-	xfs_iroot_realloc(cur->bc_ino.ip,
-			  1 - xfs_btree_get_numrecs(cblock),
-			  cur->bc_ino.whichfork);
+	xfs_btree_iroot_realloc(cur, 1 - xfs_btree_get_numrecs(cblock));
 
 	xfs_btree_setbuf(cur, level, cbp);
 
@@ -3324,7 +3332,7 @@ xfs_btree_make_block_unfull(
 
 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
 			/* A root block that can be made bigger. */
-			xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+			xfs_btree_iroot_realloc(cur, 1);
 			*stat = 1;
 		} else {
 			/* A root block that needs replacing */
@@ -3732,8 +3740,7 @@ xfs_btree_kill_iroot(
 
 	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
 	if (index) {
-		xfs_iroot_realloc(cur->bc_ino.ip, index,
-				  cur->bc_ino.whichfork);
+		xfs_btree_iroot_realloc(cur, index);
 		block = ifp->if_broot;
 	}
 
@@ -3930,8 +3937,7 @@ xfs_btree_delrec(
 	 */
 	if (level == cur->bc_nlevels - 1) {
 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-			xfs_iroot_realloc(cur->bc_ino.ip, -1,
-					  cur->bc_ino.whichfork);
+			xfs_btree_iroot_realloc(cur, -1);
 
 			error = xfs_btree_kill_iroot(cur);
 			if (error)
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index 339b5561e5b..7872fc1739b 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -205,6 +205,9 @@ struct xfs_btree_ops {
 			       const union xfs_btree_key *key1,
 			       const union xfs_btree_key *key2,
 			       const union xfs_btree_key *mask);
+
+	/* Functions for manipulating the btree root block. */
+	const struct xfs_ifork_broot_ops *iroot_ops;
 };
 
 /*
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index bb66028bff0..50422bbeb8f 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -381,50 +381,6 @@ xfs_iroot_free(
 	ifp->if_broot = NULL;
 }
 
-/* Move the bmap btree root from one incore buffer to another. */
-static void
-xfs_ifork_move_broot(
-	struct xfs_inode	*ip,
-	int			whichfork,
-	struct xfs_btree_block	*dst_broot,
-	size_t			dst_bytes,
-	struct xfs_btree_block	*src_broot,
-	size_t			src_bytes,
-	unsigned int		numrecs)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	void			*dptr;
-	void			*sptr;
-
-	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
-
-	/*
-	 * We always have to move the pointers because they are not butted
-	 * against the btree block header.
-	 */
-	if (numrecs) {
-		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
-		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
-		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
-	}
-
-	if (src_broot == dst_broot)
-		return;
-
-	/*
-	 * If the root is being totally relocated, we have to migrate the block
-	 * header and the keys that come after it.
-	 */
-	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
-
-	/* Now copy the keys, which come right after the header. */
-	if (numrecs) {
-		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
-		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
-		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
-	}
-}
-
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -438,24 +394,21 @@ xfs_ifork_move_broot(
  * if we are adding records, one will be allocated.  The caller must also
  * not request that the number of records go below zero, although
  * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- *	 requested for the if_broot array.
  */
 void
 xfs_iroot_realloc(
-	struct xfs_inode	*ip,
-	int			rec_diff,
-	int			whichfork)
+	struct xfs_inode		*ip,
+	int				whichfork,
+	const struct xfs_ifork_broot_ops *ops,
+	int				rec_diff)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
-	struct xfs_btree_block	*new_broot;
-	size_t			new_size;
-	size_t			old_size = ifp->if_broot_bytes;
-	int			cur_max;
-	int			new_max;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, whichfork);
+	struct xfs_btree_block		*new_broot;
+	size_t				new_size;
+	size_t				old_size = ifp->if_broot_bytes;
+	int				cur_max;
+	int				new_max;
 
 	/* Handle degenerate cases. */
 	if (rec_diff == 0)
@@ -468,16 +421,16 @@ xfs_iroot_realloc(
 	if (old_size == 0) {
 		ASSERT(rec_diff > 0);
 
-		new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
+		new_size = ops->size(mp, rec_diff);
 		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, false);
+	cur_max = ops->maxrecs(mp, old_size, false);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	new_size = xfs_bmap_broot_space_calc(mp, new_max);
+	new_size = ops->size(mp, new_max);
 
 	if (rec_diff > 0) {
 		/*
@@ -488,7 +441,7 @@ xfs_iroot_realloc(
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
-		xfs_ifork_move_broot(ip, whichfork, ifp->if_broot, new_size,
+		ops->move(ip, whichfork, ifp->if_broot, new_size,
 				ifp->if_broot, old_size, cur_max);
 		return;
 	}
@@ -505,15 +458,14 @@ xfs_iroot_realloc(
 
 	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
-	xfs_ifork_move_broot(ip, whichfork, new_broot, new_size, ifp->if_broot,
-			old_size, new_max);
+	ops->move(ip, whichfork, new_broot, new_size, ifp->if_broot,
+			ifp->if_broot_bytes, new_max);
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = new_size;
 }
 
-
 /*
  * This is called when the amount of space needed for if_data
  * is increased or decreased.  The change in size is indicated by
diff --git a/libxfs/xfs_inode_fork.h b/libxfs/xfs_inode_fork.h
index 18ea2d27777..1ac9a7a8b5f 100644
--- a/libxfs/xfs_inode_fork.h
+++ b/libxfs/xfs_inode_fork.h
@@ -175,7 +175,6 @@ void		xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
 void		xfs_iroot_alloc(struct xfs_inode *ip, int whichfork,
 				size_t bytes);
 void		xfs_iroot_free(struct xfs_inode *ip, int whichfork);
-void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
 				  int);
@@ -274,4 +273,26 @@ static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
 	return smp_load_acquire(&ifp->if_needextents) != 0;
 }
 
+struct xfs_ifork_broot_ops {
+	/* Calculate the number of records/keys in the incore btree block. */
+	unsigned int (*maxrecs)(struct xfs_mount *mp, unsigned int blocksize,
+			bool leaf);
+
+	/* Calculate the bytes required for the incore btree root block. */
+	size_t (*size)(struct xfs_mount *mp, unsigned int nrecs);
+
+	/*
+	 * Move an incore btree root from one buffer to another.  Note that
+	 * src_broot and dst_broot could be the same or they could be totally
+	 * separate memory regions.
+	 */
+	void (*move)(struct xfs_inode *ip, int whichfork,
+			struct xfs_btree_block *dst_broot, size_t dst_bytes,
+			struct xfs_btree_block *src_broot, size_t src_bytes,
+			unsigned int numrecs);
+};
+
+void xfs_iroot_realloc(struct xfs_inode *ip, int whichfork,
+		const struct xfs_ifork_broot_ops *ops, int rec_diff);
+
 #endif	/* __XFS_INODE_FORK_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 13:03   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
@ 2023-12-27 13:03   ` Darrick J. Wong
  2023-12-27 13:03   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
                     ` (3 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:03 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add some logic to xfs_iroot_realloc so that we can handle leaf records
in the btree root block correctly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/bmap_inflate.c       |    2 +-
 libxfs/xfs_bmap_btree.c |    4 +++-
 libxfs/xfs_bmap_btree.h |    5 ++++-
 libxfs/xfs_inode_fork.c |   12 +++++++-----
 libxfs/xfs_inode_fork.h |    5 +++--
 repair/bmap_repair.c    |    2 +-
 6 files changed, 19 insertions(+), 11 deletions(-)


diff --git a/db/bmap_inflate.c b/db/bmap_inflate.c
index 118d911a1db..b08204201c2 100644
--- a/db/bmap_inflate.c
+++ b/db/bmap_inflate.c
@@ -282,7 +282,7 @@ iroot_size(
 	unsigned int		nr_this_level,
 	void			*priv)
 {
-	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
 }
 
 static int
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index 4156e23a2da..8658e7c390a 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -520,6 +520,7 @@ xfs_bmbt_broot_move(
 	size_t			dst_bytes,
 	struct xfs_btree_block	*src_broot,
 	size_t			src_bytes,
+	unsigned int		level,
 	unsigned int		numrecs)
 {
 	struct xfs_mount	*mp = ip->i_mount;
@@ -527,6 +528,7 @@ xfs_bmbt_broot_move(
 	void			*sptr;
 
 	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+	ASSERT(level > 0);
 
 	/*
 	 * We always have to move the pointers because they are not butted
@@ -839,7 +841,7 @@ xfs_bmbt_iroot_alloc(
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
 	xfs_iroot_alloc(ip, whichfork,
-			xfs_bmap_broot_space_calc(ip->i_mount, 1));
+			xfs_bmap_broot_space_calc(ip->i_mount, 1, 1));
 
 	/* Fill in the root. */
 	xfs_btree_init_block(ip->i_mount, ifp->if_broot, &xfs_bmbt_ops, 1, 1,
diff --git a/libxfs/xfs_bmap_btree.h b/libxfs/xfs_bmap_btree.h
index a9ddc9b42e6..d20321bfe2f 100644
--- a/libxfs/xfs_bmap_btree.h
+++ b/libxfs/xfs_bmap_btree.h
@@ -161,8 +161,11 @@ xfs_bmap_broot_ptr_addr(
 static inline size_t
 xfs_bmap_broot_space_calc(
 	struct xfs_mount	*mp,
+	unsigned int		level,
 	unsigned int		nrecs)
 {
+	ASSERT(level > 0);
+
 	/*
 	 * If the bmbt root block is empty, we should be converting the fork
 	 * to extents format.  Hence, the size is zero.
@@ -183,7 +186,7 @@ xfs_bmap_broot_space(
 	struct xfs_mount	*mp,
 	struct xfs_bmdr_block	*bb)
 {
-	return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+	return xfs_bmap_broot_space_calc(mp, 1, be16_to_cpu(bb->bb_numrecs));
 }
 
 /* Compute the space required for the ondisk root block. */
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 50422bbeb8f..ec3a399e798 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -407,6 +407,7 @@ xfs_iroot_realloc(
 	struct xfs_btree_block		*new_broot;
 	size_t				new_size;
 	size_t				old_size = ifp->if_broot_bytes;
+	unsigned int			level;
 	int				cur_max;
 	int				new_max;
 
@@ -421,16 +422,17 @@ xfs_iroot_realloc(
 	if (old_size == 0) {
 		ASSERT(rec_diff > 0);
 
-		new_size = ops->size(mp, rec_diff);
+		new_size = ops->size(mp, 0, rec_diff);
 		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = ops->maxrecs(mp, old_size, false);
+	level = be16_to_cpu(ifp->if_broot->bb_level);
+	cur_max = ops->maxrecs(mp, old_size, level == 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	new_size = ops->size(mp, new_max);
+	new_size = ops->size(mp, level, new_max);
 
 	if (rec_diff > 0) {
 		/*
@@ -442,7 +444,7 @@ xfs_iroot_realloc(
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
 		ops->move(ip, whichfork, ifp->if_broot, new_size,
-				ifp->if_broot, old_size, cur_max);
+				ifp->if_broot, old_size, level, cur_max);
 		return;
 	}
 
@@ -459,7 +461,7 @@ xfs_iroot_realloc(
 	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
 	ops->move(ip, whichfork, new_broot, new_size, ifp->if_broot,
-			ifp->if_broot_bytes, new_max);
+			ifp->if_broot_bytes, level, new_max);
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
diff --git a/libxfs/xfs_inode_fork.h b/libxfs/xfs_inode_fork.h
index 1ac9a7a8b5f..9a0136f8273 100644
--- a/libxfs/xfs_inode_fork.h
+++ b/libxfs/xfs_inode_fork.h
@@ -279,7 +279,8 @@ struct xfs_ifork_broot_ops {
 			bool leaf);
 
 	/* Calculate the bytes required for the incore btree root block. */
-	size_t (*size)(struct xfs_mount *mp, unsigned int nrecs);
+	size_t (*size)(struct xfs_mount *mp, unsigned int level,
+			unsigned int nrecs);
 
 	/*
 	 * Move an incore btree root from one buffer to another.  Note that
@@ -289,7 +290,7 @@ struct xfs_ifork_broot_ops {
 	void (*move)(struct xfs_inode *ip, int whichfork,
 			struct xfs_btree_block *dst_broot, size_t dst_bytes,
 			struct xfs_btree_block *src_broot, size_t src_bytes,
-			unsigned int numrecs);
+			unsigned int level, unsigned int numrecs);
 };
 
 void xfs_iroot_realloc(struct xfs_inode *ip, int whichfork,
diff --git a/repair/bmap_repair.c b/repair/bmap_repair.c
index a8cbff67ceb..dfd1405cca2 100644
--- a/repair/bmap_repair.c
+++ b/repair/bmap_repair.c
@@ -285,7 +285,7 @@ xrep_bmap_iroot_size(
 {
 	ASSERT(level > 0);
 
-	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
 }
 
 /* Update the inode counters. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 13:03   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
@ 2023-12-27 13:03   ` Darrick J. Wong
  2023-12-27 13:04   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
                     ` (2 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:03 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for allowing records in an inode btree root, hoist the
code that copies keyptrs from an existing node root into a child block
to a separate function.  Note that the new function explicitly computes
the keys of the new child block and stores that in the root block; while
the bmap btree could rely on leaving the key alone, realtime rmap needs
to set the new high key.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c |  113 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 74 insertions(+), 39 deletions(-)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index b6f73fcc6d6..5a21788c707 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -3079,6 +3079,77 @@ xfs_btree_iroot_realloc(
 			cur->bc_ops->iroot_ops, rec_diff);
 }
 
+/*
+ * Move the keys and pointers from a root block to a separate block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_promote_node_iroot(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	int			level,
+	struct xfs_buf		*cbp,
+	union xfs_btree_ptr	*cptr,
+	struct xfs_btree_block	*cblock)
+{
+	union xfs_btree_key	*ckp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*cpp;
+	union xfs_btree_ptr	*pp;
+	int			i;
+	int			error;
+	int			numrecs = xfs_btree_get_numrecs(block);
+
+	/*
+	 * Increase tree height, adjusting the root block level to match.
+	 * We cannot change the root btree node size until we've copied the
+	 * block contents to the new child block.
+	 */
+	be16_add_cpu(&block->bb_level, 1);
+	cur->bc_nlevels++;
+	cur->bc_levels[level + 1].ptr = 1;
+
+	/*
+	 * Adjust the root btree record count, then copy the keys from the old
+	 * root to the new child block.
+	 */
+	xfs_btree_set_numrecs(block, 1);
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, ckp, kp, numrecs);
+
+	/* Check the pointers and copy them to the new child block. */
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+	for (i = 0; i < numrecs; i++) {
+		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+		if (error)
+			return error;
+	}
+	xfs_btree_copy_ptrs(cur, cpp, pp, numrecs);
+
+	/*
+	 * Set the first keyptr to point to the new child block, then shrink
+	 * the memory buffer for the root block.
+	 */
+	error = xfs_btree_debug_check_ptr(cur, cptr, 0, level);
+	if (error)
+		return error;
+	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+	xfs_btree_get_keys(cur, cblock, kp);
+	xfs_btree_iroot_realloc(cur, 1 - numrecs);
+
+	/* Attach the new block to the cursor and log it. */
+	xfs_btree_setbuf(cur, level, cbp);
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_keys(cur, cbp, 1, numrecs);
+	xfs_btree_log_ptrs(cur, cbp, 1, numrecs);
+	return 0;
+}
+
 /*
  * Copy the old inode root contents into a real block and make the
  * broot point to it.
@@ -3092,14 +3163,10 @@ xfs_btree_new_iroot(
 	struct xfs_buf		*cbp;		/* buffer for cblock */
 	struct xfs_btree_block	*block;		/* btree block */
 	struct xfs_btree_block	*cblock;	/* child btree block */
-	union xfs_btree_key	*ckp;		/* child key pointer */
-	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
-	union xfs_btree_key	*kp;		/* pointer to btree key */
-	union xfs_btree_ptr	*pp;		/* pointer to block addr */
+	union xfs_btree_ptr	*pp;
 	union xfs_btree_ptr	nptr;		/* new block addr */
 	int			level;		/* btree level */
 	int			error;		/* error return code */
-	int			i;		/* loop counter */
 
 	XFS_BTREE_STATS_INC(cur, newroot);
 
@@ -3137,43 +3204,11 @@ xfs_btree_new_iroot(
 			cblock->bb_u.s.bb_blkno = bno;
 	}
 
-	be16_add_cpu(&block->bb_level, 1);
-	xfs_btree_set_numrecs(block, 1);
-	cur->bc_nlevels++;
-	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
-	cur->bc_levels[level + 1].ptr = 1;
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
-		if (error)
-			goto error0;
-	}
-
-	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
-	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
+	error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr,
+			cblock);
 	if (error)
 		goto error0;
 
-	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
-	xfs_btree_iroot_realloc(cur, 1 - xfs_btree_get_numrecs(cblock));
-
-	xfs_btree_setbuf(cur, level, cbp);
-
-	/*
-	 * Do all this logging at the end so that
-	 * the root is at the right level.
-	 */
-	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
-	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
 	*logflags |=
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
 	*stat = 1;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-27 13:03   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
@ 2023-12-27 13:04   ` Darrick J. Wong
  2023-12-27 13:04   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
  2023-12-27 13:04   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:04 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for allowing records in an inode btree root, hoist the
code that copies keyptrs from an existing node child into the root block
to a separate function.  Remove some unnecessary conditionals and clean
up a few function calls in the new function.  Note that this change
reorders the ->free_block call with respect to the change in bc_nlevels
to make it easier to support inode root leaf blocks in the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c |   94 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 34 deletions(-)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index 5a21788c707..0f0198ae0cd 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -3704,6 +3704,63 @@ xfs_btree_insert(
 	return error;
 }
 
+/*
+ * Move the keyptrs from a child node block to the root block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_demote_node_child(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*cblock,
+	int			level,
+	int			numrecs)
+{
+	struct xfs_btree_block	*block;
+	union xfs_btree_key	*ckp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*cpp;
+	union xfs_btree_ptr	*pp;
+	int			i;
+	int			error;
+	int			diff;
+
+	/*
+	 * Adjust the root btree node size and the record count to match the
+	 * doomed child so that we can copy the keyptrs ahead of changing the
+	 * tree shape.
+	 */
+	diff = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+	xfs_btree_iroot_realloc(cur, diff);
+	block = xfs_btree_get_iroot(cur);
+
+	xfs_btree_set_numrecs(block, numrecs);
+	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+	/* Copy keys from the doomed block. */
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+	/* Copy pointers from the doomed block. */
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+	for (i = 0; i < numrecs; i++) {
+		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+		if (error)
+			return error;
+	}
+	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+	/* Decrease tree height, adjusting the root block level to match. */
+	cur->bc_levels[level - 1].bp = NULL;
+	be16_add_cpu(&block->bb_level, -1);
+	cur->bc_nlevels--;
+	return 0;
+}
+
 /*
  * Try to merge a non-leaf block back into the inode root.
  *
@@ -3716,24 +3773,16 @@ STATIC int
 xfs_btree_kill_iroot(
 	struct xfs_btree_cur	*cur)
 {
-	int			whichfork = cur->bc_ino.whichfork;
 	struct xfs_inode	*ip = cur->bc_ino.ip;
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*block;
 	struct xfs_btree_block	*cblock;
-	union xfs_btree_key	*kp;
-	union xfs_btree_key	*ckp;
-	union xfs_btree_ptr	*pp;
-	union xfs_btree_ptr	*cpp;
 	struct xfs_buf		*cbp;
 	int			level;
-	int			index;
 	int			numrecs;
 	int			error;
 #ifdef DEBUG
 	union xfs_btree_ptr	ptr;
 #endif
-	int			i;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 	ASSERT(cur->bc_nlevels > 1);
@@ -3773,39 +3822,16 @@ xfs_btree_kill_iroot(
 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 #endif
 
-	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
-	if (index) {
-		xfs_btree_iroot_realloc(cur, index);
-		block = ifp->if_broot;
-	}
-
-	be16_add_cpu(&block->bb_numrecs, index);
-	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
-	pp = xfs_btree_ptr_addr(cur, 1, block);
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-
-	for (i = 0; i < numrecs; i++) {
-		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
-		if (error)
-			return error;
-	}
-
-	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+	error = xfs_btree_demote_node_child(cur, cblock, level, numrecs);
+	if (error)
+		return error;
 
 	error = xfs_btree_free_block(cur, cbp);
 	if (error)
 		return error;
 
-	cur->bc_levels[level - 1].bp = NULL;
-	be16_add_cpu(&block->bb_level, -1);
 	xfs_trans_log_inode(cur->bc_tp, ip,
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
-	cur->bc_nlevels--;
 out0:
 	return 0;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/14] xfs: support storing records in the inode core root
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-27 13:04   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
@ 2023-12-27 13:04   ` Darrick J. Wong
  2023-12-27 13:04   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:04 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add the necessary flags and code so that we can support storing leaf
records in the inode root block of a btree.  This hasn't been necessary
before, but the realtime rmapbt will need to be able to do this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c         |  150 ++++++++++++++++++++++++++++++++++++++++----
 libxfs/xfs_btree.h         |    1 
 libxfs/xfs_btree_staging.c |    4 +
 3 files changed, 141 insertions(+), 14 deletions(-)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index 0f0198ae0cd..df13656ffe6 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -264,6 +264,11 @@ xfs_btree_check_block(
 	int			level,	/* level of the btree block */
 	struct xfs_buf		*bp)	/* buffer containing block, if any */
 {
+	/* Don't check the inode-core root. */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1)
+		return 0;
+
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 		return xfs_btree_check_lblock(cur, block, level, bp);
 	else
@@ -1544,12 +1549,16 @@ xfs_btree_log_recs(
 	int			first,
 	int			last)
 {
+	if (!bp) {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+				xfs_ilog_fbroot(cur->bc_ino.whichfork));
+		return;
+	}
 
 	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 	xfs_trans_log_buf(cur->bc_tp, bp,
 			  xfs_btree_rec_offset(cur, first),
 			  xfs_btree_rec_offset(cur, last + 1) - 1);
-
 }
 
 /*
@@ -3079,6 +3088,64 @@ xfs_btree_iroot_realloc(
 			cur->bc_ops->iroot_ops, rec_diff);
 }
 
+/*
+ * Move the records from a root leaf block to a separate block.
+ *
+ * Trickery here: The amount of memory that we need per record for the incore
+ * root block changes when we convert a leaf block to an internal block.
+ * Therefore, we copy leaf records into the new btree block (cblock) before
+ * freeing the incore root block and changing the tree height.
+ *
+ * Once we've changed the tree height, we allocate a new incore root block
+ * (which will now be an internal root block) and populate it with a pointer to
+ * cblock and the relevant keys.
+ */
+STATIC void
+xfs_btree_promote_leaf_iroot(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	struct xfs_buf		*cbp,
+	union xfs_btree_ptr	*cptr,
+	struct xfs_btree_block	*cblock)
+{
+	union xfs_btree_rec	*rp;
+	union xfs_btree_rec	*crp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*pp;
+	size_t			size;
+	int			numrecs = xfs_btree_get_numrecs(block);
+
+	/* Copy the records from the leaf root into the new child block. */
+	rp = xfs_btree_rec_addr(cur, 1, block);
+	crp = xfs_btree_rec_addr(cur, 1, cblock);
+	xfs_btree_copy_recs(cur, crp, rp, numrecs);
+
+	/* Zap the old root and change the tree height. */
+	xfs_iroot_free(cur->bc_ino.ip, cur->bc_ino.whichfork);
+	cur->bc_nlevels++;
+	cur->bc_levels[1].ptr = 1;
+
+	/*
+	 * Allocate a new internal root block buffer and reinitialize it to
+	 * point to a single new child.
+	 */
+	size = cur->bc_ops->iroot_ops->size(cur->bc_mp, cur->bc_nlevels - 1, 1);
+	xfs_iroot_alloc(cur->bc_ino.ip, cur->bc_ino.whichfork, size);
+	block = xfs_btree_get_iroot(cur);
+	xfs_btree_init_block(cur->bc_mp, block, cur->bc_ops,
+			cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino);
+
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	kp = xfs_btree_key_addr(cur, 1, block);
+	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+	xfs_btree_get_keys(cur, cblock, kp);
+
+	/* Attach the new block to the cursor and log it. */
+	xfs_btree_setbuf(cur, 0, cbp);
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, cbp, 1, numrecs);
+}
+
 /*
  * Move the keys and pointers from a root block to a separate block.
  *
@@ -3163,7 +3230,7 @@ xfs_btree_new_iroot(
 	struct xfs_buf		*cbp;		/* buffer for cblock */
 	struct xfs_btree_block	*block;		/* btree block */
 	struct xfs_btree_block	*cblock;	/* child btree block */
-	union xfs_btree_ptr	*pp;
+	union xfs_btree_ptr	aptr;
 	union xfs_btree_ptr	nptr;		/* new block addr */
 	int			level;		/* btree level */
 	int			error;		/* error return code */
@@ -3175,10 +3242,15 @@ xfs_btree_new_iroot(
 	level = cur->bc_nlevels - 1;
 
 	block = xfs_btree_get_iroot(cur);
-	pp = xfs_btree_ptr_addr(cur, 1, block);
+	ASSERT(level > 0 || (cur->bc_flags & XFS_BTREE_IROOT_RECORDS));
+	if (level > 0)
+		aptr = *xfs_btree_ptr_addr(cur, 1, block);
+	else
+		aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp,
+				cur->bc_ino.ip->i_ino));
 
 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
+	error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat);
 	if (error)
 		goto error0;
 	if (*stat == 0)
@@ -3204,10 +3276,14 @@ xfs_btree_new_iroot(
 			cblock->bb_u.s.bb_blkno = bno;
 	}
 
-	error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr,
-			cblock);
-	if (error)
-		goto error0;
+	if (level > 0) {
+		error = xfs_btree_promote_node_iroot(cur, block, level, cbp,
+				&nptr, cblock);
+		if (error)
+			goto error0;
+	} else {
+		xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock);
+	}
 
 	*logflags |=
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
@@ -3704,6 +3780,45 @@ xfs_btree_insert(
 	return error;
 }
 
+/*
+ * Move the records from a child leaf block to the root block.
+ *
+ * Trickery here: The amount of memory we need per record for the incore root
+ * block changes when we convert a leaf block to an internal block.  Therefore,
+ * we free the incore root block, change the tree height, allocate a new incore
+ * root, and copy the records from the doomed block into the new root.
+ */
+STATIC void
+xfs_btree_demote_leaf_child(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*cblock,
+	int			numrecs)
+{
+	union xfs_btree_rec	*rp;
+	union xfs_btree_rec	*crp;
+	struct xfs_btree_block	*block;
+	size_t			size;
+
+	/* Zap the old root and change the tree height. */
+	xfs_iroot_free(cur->bc_ino.ip, cur->bc_ino.whichfork);
+	cur->bc_levels[0].bp = NULL;
+	cur->bc_nlevels--;
+
+	/*
+	 * Allocate a new internal root block buffer and reinitialize it with
+	 * the leaf records in the child.
+	 */
+	size = cur->bc_ops->iroot_ops->size(cur->bc_mp, 0, numrecs);
+	xfs_iroot_alloc(cur->bc_ino.ip, cur->bc_ino.whichfork, size);
+	block = xfs_btree_get_iroot(cur);
+	xfs_btree_init_block(cur->bc_mp, block, cur->bc_ops, 0, numrecs,
+			cur->bc_ino.ip->i_ino);
+
+	rp = xfs_btree_rec_addr(cur, 1, block);
+	crp = xfs_btree_rec_addr(cur, 1, cblock);
+	xfs_btree_copy_recs(cur, rp, crp, numrecs);
+}
+
 /*
  * Move the keyptrs from a child node block to the root block.
  *
@@ -3785,14 +3900,19 @@ xfs_btree_kill_iroot(
 #endif
 
 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-	ASSERT(cur->bc_nlevels > 1);
+	ASSERT((cur->bc_flags & XFS_BTREE_IROOT_RECORDS) ||
+	       cur->bc_nlevels > 1);
 
 	/*
 	 * Don't deal with the root block needs to be a leaf case.
 	 * We're just going to turn the thing back into extents anyway.
 	 */
 	level = cur->bc_nlevels - 1;
-	if (level == 1)
+	if (level == 1 && !(cur->bc_flags & XFS_BTREE_IROOT_RECORDS))
+		goto out0;
+
+	/* If we're already a leaf, jump out. */
+	if (level == 0)
 		goto out0;
 
 	/*
@@ -3822,9 +3942,13 @@ xfs_btree_kill_iroot(
 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 #endif
 
-	error = xfs_btree_demote_node_child(cur, cblock, level, numrecs);
-	if (error)
-		return error;
+	if (level > 1) {
+		error = xfs_btree_demote_node_child(cur, cblock, level,
+				numrecs);
+		if (error)
+			return error;
+	} else
+		xfs_btree_demote_leaf_child(cur, cblock, numrecs);
 
 	error = xfs_btree_free_block(cur, cbp);
 	if (error)
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index 7872fc1739b..bb6c2feecea 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -337,6 +337,7 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
  * is dynamically allocated and must be freed when the cursor is deleted.
  */
 #define XFS_BTREE_STAGING		(1<<5)
+#define XFS_BTREE_IROOT_RECORDS		(1<<6)	/* iroot can store records */
 
 /* btree stored in memory; not compatible with ROOT_IN_INODE */
 #ifdef CONFIG_XFS_BTREE_IN_XFILE
diff --git a/libxfs/xfs_btree_staging.c b/libxfs/xfs_btree_staging.c
index ec496915433..8b2e41dacff 100644
--- a/libxfs/xfs_btree_staging.c
+++ b/libxfs/xfs_btree_staging.c
@@ -710,7 +710,9 @@ xfs_btree_bload_compute_geometry(
 			 *
 			 * Note that bmap btrees forbid records in the root.
 			 */
-			if (level != 0 && nr_this_level <= avg_per_block) {
+			if ((level != 0 ||
+			     (cur->bc_flags & XFS_BTREE_IROOT_RECORDS)) &&
+			    nr_this_level <= avg_per_block) {
 				nr_blocks++;
 				break;
 			}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-27 13:04   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
@ 2023-12-27 13:04   ` Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:04 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec
would erroneously try to update the parent's key for a block that had
been split if we decided to insert the new record into the new block.
The solution was to detect this situation and update the in-core key
value that we pass up to the caller so that the caller will (eventually)
add the new block to the parent level of the tree with the correct key.

However, I missed a subtlety about the way inode-rooted btrees work.  If
the full block was a maximally sized inode root block, we'll solve that
fullness by moving the root block's records to a new block, resizing the
root block, and updating the root to point to the new block.  We don't
pass a pointer to the new block to the caller because that work has
already been done.  The new record will /always/ land in the new block,
so in this case we need to use xfs_btree_update_keys to update the keys.

This bug can theoretically manifest itself in the very rare case that we
split a bmbt root block and the new record lands in the very first slot
of the new block, though I've never managed to trigger it in practice.
However, it is very easy to reproduce by running generic/522 with the
realtime rmapbt patchset if rtinherit=1.

Fixes: 2c813ad66a72 ("xfs: support btrees with overlapping intervals for keys")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c |   29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index df13656ffe6..165ce251376 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -3653,14 +3653,31 @@ xfs_btree_insrec(
 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
 
 	/*
-	 * If we just inserted into a new tree block, we have to
-	 * recalculate nkey here because nkey is out of date.
+	 * Update btree keys to reflect the newly added record or keyptr.
+	 * There are three cases here to be aware of.  Normally, all we have to
+	 * do is walk towards the root, updating keys as necessary.
 	 *
-	 * Otherwise we're just updating an existing block (having shoved
-	 * some records into the new tree block), so use the regular key
-	 * update mechanism.
+	 * If the caller had us target a full block for the insertion, we dealt
+	 * with that by calling the _make_block_unfull function.  If the
+	 * "make unfull" function splits the block, it'll hand us back the key
+	 * and pointer of the new block.  We haven't yet added the new block to
+	 * the next level up, so if we decide to add the new record to the new
+	 * block (bp->b_bn != old_bn), we have to update the caller's pointer
+	 * so that the caller adds the new block with the correct key.
+	 *
+	 * However, there is a third possibility-- if the selected block is the
+	 * root block of an inode-rooted btree and cannot be expanded further,
+	 * the "make unfull" function moves the root block contents to a new
+	 * block and updates the root block to point to the new block.  In this
+	 * case, no block pointer is passed back because the block has already
+	 * been added to the btree.  In this case, we need to use the regular
+	 * key update function, just like the first case.  This is critical for
+	 * overlapping btrees, because the high key must be updated to reflect
+	 * the entire tree, not just the subtree accessible through the first
+	 * child of the root (which is now two levels down from the root).
 	 */
-	if (bp && xfs_buf_daddr(bp) != old_bn) {
+	if (!xfs_btree_ptr_is_null(cur, &nptr) &&
+	    bp && xfs_buf_daddr(bp) != old_bn) {
 		xfs_btree_get_keys(cur, block, lkey);
 	} else if (xfs_btree_needs_key_update(cur, optr)) {
 		error = xfs_btree_update_keys(cur, level);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature
  2023-12-31 19:53 ` [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata Darrick J. Wong
@ 2023-12-27 13:05   ` Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:05 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

It's not possible to fail at increasing fdblocks, so get rid of all the
error returns here.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_trace.h  |    1 -
 libxfs/xfs_ag.c      |    4 +---
 libxfs/xfs_ag_resv.c |   22 +++++-----------------
 libxfs/xfs_ag_resv.h |    2 +-
 4 files changed, 7 insertions(+), 22 deletions(-)


diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index b3240213364..08ec51fc799 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -269,7 +269,6 @@
 #define trace_xfs_ag_resv_critical(...)		((void) 0)
 #define trace_xfs_ag_resv_needed(...)		((void) 0)
 #define trace_xfs_ag_resv_free(...)		((void) 0)
-#define trace_xfs_ag_resv_free_error(...)	((void) 0)
 #define trace_xfs_ag_resv_init(...)		((void) 0)
 #define trace_xfs_ag_resv_init_error(...)	((void) 0)
 #define trace_xfs_ag_resv_alloc_extent(...)	((void) 0)
diff --git a/libxfs/xfs_ag.c b/libxfs/xfs_ag.c
index ddd5584f23e..f22d58ad040 100644
--- a/libxfs/xfs_ag.c
+++ b/libxfs/xfs_ag.c
@@ -942,9 +942,7 @@ xfs_ag_shrink_space(
 	 * Disable perag reservations so it doesn't cause the allocation request
 	 * to fail. We'll reestablish reservation before we return.
 	 */
-	error = xfs_ag_resv_free(pag);
-	if (error)
-		return error;
+	xfs_ag_resv_free(pag);
 
 	/* internal log shouldn't also show up in the free space btrees */
 	error = xfs_alloc_vextent_exact_bno(&args,
diff --git a/libxfs/xfs_ag_resv.c b/libxfs/xfs_ag_resv.c
index 3a80b1613e1..542740bb850 100644
--- a/libxfs/xfs_ag_resv.c
+++ b/libxfs/xfs_ag_resv.c
@@ -125,14 +125,13 @@ xfs_ag_resv_needed(
 }
 
 /* Clean out a reservation */
-static int
+static void
 __xfs_ag_resv_free(
 	struct xfs_perag		*pag,
 	enum xfs_ag_resv_type		type)
 {
 	struct xfs_ag_resv		*resv;
 	xfs_extlen_t			oldresv;
-	int				error;
 
 	trace_xfs_ag_resv_free(pag, type, 0);
 
@@ -148,30 +147,19 @@ __xfs_ag_resv_free(
 		oldresv = resv->ar_orig_reserved;
 	else
 		oldresv = resv->ar_reserved;
-	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
 	resv->ar_reserved = 0;
 	resv->ar_asked = 0;
 	resv->ar_orig_reserved = 0;
-
-	if (error)
-		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
-				error, _RET_IP_);
-	return error;
 }
 
 /* Free a per-AG reservation. */
-int
+void
 xfs_ag_resv_free(
 	struct xfs_perag		*pag)
 {
-	int				error;
-	int				err2;
-
-	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
-	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
-	if (err2 && !error)
-		error = err2;
-	return error;
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 }
 
 static int
diff --git a/libxfs/xfs_ag_resv.h b/libxfs/xfs_ag_resv.h
index b74b210008e..ff20ed93de7 100644
--- a/libxfs/xfs_ag_resv.h
+++ b/libxfs/xfs_ag_resv.h
@@ -6,7 +6,7 @@
 #ifndef __XFS_AG_RESV_H__
 #define	__XFS_AG_RESV_H__
 
-int xfs_ag_resv_free(struct xfs_perag *pag);
+void xfs_ag_resv_free(struct xfs_perag *pag);
 int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
 
 bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device
  2023-12-31 19:53 ` [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
@ 2023-12-27 13:05   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:05 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new space reservation scheme so that btree metadata for the
realtime volume can reserve space in the data device to avoid space
underruns.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h   |    5 +
 include/xfs_mount.h   |    1 
 include/xfs_trace.h   |    7 ++
 io/inject.c           |    1 
 libxfs/init.c         |   11 +++
 libxfs/libxfs_priv.h  |   11 +++
 libxfs/xfs_ag_resv.c  |    3 +
 libxfs/xfs_errortag.h |    4 +
 libxfs/xfs_imeta.c    |  190 +++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h    |   11 +++
 libxfs/xfs_types.h    |    7 ++
 11 files changed, 248 insertions(+), 3 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 2675abdffcd..ec73fe192fd 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -216,7 +216,10 @@ typedef struct xfs_inode {
 	struct xfs_ifork	i_df;		/* data fork */
 	struct xfs_ifork	i_af;		/* attribute fork */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
-	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
+	uint64_t		i_delayed_blks;	/* count of delay alloc blks */
+	/* Space that has been set aside to root a btree in this file. */
+	uint64_t		i_meta_resv_asked;
+
 	xfs_fsize_t		i_disk_size;	/* number of bytes in file */
 	xfs_rfsblock_t		i_nblocks;	/* # of direct & btree blocks */
 	prid_t			i_projid;	/* owner's project id */
diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index a2fdd7c2f14..51a02e69776 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -99,6 +99,7 @@ typedef struct xfs_mount {
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
 	uint			m_refc_maxlevels; /* max refc btree levels */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
+	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index 08ec51fc799..5010b35b1f6 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -389,4 +389,11 @@
 #define trace_xfs_iunlink_remove(...)		((void) 0)
 #define trace_xfs_iunlink_map_prev_fallback(...)	((void) 0)
 
+#define trace_xfs_imeta_resv_alloc_extent(...)	((void) 0)
+#define trace_xfs_imeta_resv_critical(...)	((void) 0)
+#define trace_xfs_imeta_resv_free(...)		((void) 0)
+#define trace_xfs_imeta_resv_free_extent(...)	((void) 0)
+#define trace_xfs_imeta_resv_init(...)		((void) 0)
+#define trace_xfs_imeta_resv_init_error(...)	((void) 0)
+
 #endif /* __TRACE_H__ */
diff --git a/io/inject.c b/io/inject.c
index 4b0cd76005c..644baa42b64 100644
--- a/io/inject.c
+++ b/io/inject.c
@@ -64,6 +64,7 @@ error_tag(char *name)
 		{ XFS_ERRTAG_WB_DELAY_MS,		"wb_delay_ms" },
 		{ XFS_ERRTAG_WRITE_DELAY_MS,		"write_delay_ms" },
 		{ XFS_ERRTAG_SWAPEXT_FINISH_ONE,	"swapext_finish_one" },
+		{ XFS_ERRTAG_IMETA_RESV_CRITICAL,	"imeta_resv_critical" },
 		{ XFS_ERRTAG_MAX,			NULL }
 	};
 	int	count;
diff --git a/libxfs/init.c b/libxfs/init.c
index 0332a4eeb21..2663485a80d 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -651,6 +651,15 @@ xfs_agbtree_compute_maxlevels(
 	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
 }
 
+/* Compute maximum possible height for realtime btree types for this fs. */
+static inline void
+xfs_rtbtree_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	/* This will be filled in later. */
+	mp->m_rtbtree_maxlevels = 0;
+}
+
 /* Compute maximum possible height of all btrees. */
 void
 libxfs_compute_all_maxlevels(
@@ -667,7 +676,7 @@ libxfs_compute_all_maxlevels(
 	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
-
+	xfs_rtbtree_compute_maxlevels(mp);
 }
 
 /* Mount the metadata files under the metadata directory tree. */
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 120a41e20a7..bbe7dd63443 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -221,6 +221,17 @@ uint32_t get_random_u32(void);
 #define get_random_u32()	(0)
 #endif
 
+static inline int
+__percpu_counter_compare(uint64_t *count, int64_t rhs, int32_t batch)
+{
+	if (*count > rhs)
+		return 1;
+	else if (*count < rhs)
+		return -1;
+	return 0;
+}
+
+
 #define PAGE_SIZE		getpagesize()
 
 #define inode_peek_iversion(inode)	(inode)->i_version
diff --git a/libxfs/xfs_ag_resv.c b/libxfs/xfs_ag_resv.c
index 542740bb850..5963ff5602b 100644
--- a/libxfs/xfs_ag_resv.c
+++ b/libxfs/xfs_ag_resv.c
@@ -112,6 +112,7 @@ xfs_ag_resv_needed(
 	case XFS_AG_RESV_RMAPBT:
 		len -= xfs_perag_resv(pag, type)->ar_reserved;
 		break;
+	case XFS_AG_RESV_IMETA:
 	case XFS_AG_RESV_NONE:
 		/* empty */
 		break;
@@ -346,6 +347,7 @@ xfs_ag_resv_alloc_extent(
 
 	switch (type) {
 	case XFS_AG_RESV_AGFL:
+	case XFS_AG_RESV_IMETA:
 		return;
 	case XFS_AG_RESV_METADATA:
 	case XFS_AG_RESV_RMAPBT:
@@ -388,6 +390,7 @@ xfs_ag_resv_free_extent(
 
 	switch (type) {
 	case XFS_AG_RESV_AGFL:
+	case XFS_AG_RESV_IMETA:
 		return;
 	case XFS_AG_RESV_METADATA:
 	case XFS_AG_RESV_RMAPBT:
diff --git a/libxfs/xfs_errortag.h b/libxfs/xfs_errortag.h
index 263d62a8d70..f359df69d6b 100644
--- a/libxfs/xfs_errortag.h
+++ b/libxfs/xfs_errortag.h
@@ -64,7 +64,8 @@
 #define XFS_ERRTAG_WB_DELAY_MS				42
 #define XFS_ERRTAG_WRITE_DELAY_MS			43
 #define XFS_ERRTAG_SWAPEXT_FINISH_ONE			44
-#define XFS_ERRTAG_MAX					45
+#define XFS_ERRTAG_IMETA_RESV_CRITICAL			45
+#define XFS_ERRTAG_MAX					46
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@
 #define XFS_RANDOM_WB_DELAY_MS				3000
 #define XFS_RANDOM_WRITE_DELAY_MS			3000
 #define XFS_RANDOM_SWAPEXT_FINISH_ONE			1
+#define XFS_RANDOM_IMETA_RESV_CRITICAL			4
 
 #endif /* __XFS_ERRORTAG_H_ */
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index 6ada36d5559..e2b14624381 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -26,6 +26,9 @@
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
 
 /*
  * Metadata File Management
@@ -1074,3 +1077,190 @@ xfs_imeta_free_path(
 	kfree(path->im_path);
 	kfree(path);
 }
+
+/*
+ * Is the amount of space that could be allocated towards a given metadata
+ * file at or beneath a certain threshold?
+ */
+static inline bool
+xfs_imeta_resv_can_cover(
+	struct xfs_inode	*ip,
+	int64_t			rhs)
+{
+	/*
+	 * The amount of space that can be allocated to this metadata file is
+	 * the remaining reservation for the particular metadata file + the
+	 * global free block count.  Take care of the first case to avoid
+	 * touching the per-cpu counter.
+	 */
+	if (ip->i_delayed_blks >= rhs)
+		return true;
+
+	/*
+	 * There aren't enough blocks left in the inode's reservation, but it
+	 * isn't critical unless there also isn't enough free space.
+	 */
+	return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+			rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks?  For now we'll define that
+ * as the number of blocks we can get our hands on being less than 10% of what
+ * we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_imeta_resv_critical(
+	struct xfs_inode	*ip)
+{
+	uint64_t		asked_low_water;
+
+	if (!ip)
+		return false;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	trace_xfs_imeta_resv_critical(ip, 0);
+
+	if (!xfs_imeta_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+		return true;
+
+	asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+	if (!xfs_imeta_resv_can_cover(ip, asked_low_water))
+		return true;
+
+	return XFS_TEST_ERROR(false, ip->i_mount,
+			XFS_ERRTAG_IMETA_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_imeta_resv_alloc_extent(
+	struct xfs_inode	*ip,
+	struct xfs_alloc_arg	*args)
+{
+	int64_t			len = args->len;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	ASSERT(args->resv == XFS_AG_RESV_IMETA);
+
+	trace_xfs_imeta_resv_alloc_extent(ip, args->len);
+
+	/*
+	 * Allocate the blocks from the metadata inode's block reservation
+	 * and update the ondisk sb counter.
+	 */
+	if (ip->i_delayed_blks > 0) {
+		int64_t		from_resv;
+
+		from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+		ip->i_delayed_blks -= from_resv;
+		xfs_mod_delalloc(ip->i_mount, -from_resv);
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+				-from_resv);
+		len -= from_resv;
+	}
+
+	/*
+	 * Any allocation in excess of the reservation requires in-core and
+	 * on-disk fdblocks updates.
+	 */
+	if (len)
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, -len);
+
+	ip->i_nblocks += args->len;
+	xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_imeta_resv_free_extent(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_filblks_t		len)
+{
+	int64_t			to_resv;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	trace_xfs_imeta_resv_free_extent(ip, len);
+
+	ip->i_nblocks -= len;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	/*
+	 * Add the freed blocks back into the inode's delalloc reservation
+	 * until it reaches the maximum size.  Update the ondisk fdblocks only.
+	 */
+	to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+	if (to_resv > 0) {
+		to_resv = min_t(int64_t, to_resv, len);
+		ip->i_delayed_blks += to_resv;
+		xfs_mod_delalloc(ip->i_mount, to_resv);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+		len -= to_resv;
+	}
+
+	/*
+	 * Everything else goes back to the filesystem, so update the in-core
+	 * and on-disk counters.
+	 */
+	if (len)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Release a metadata file's space reservation. */
+void
+xfs_imeta_resv_free_inode(
+	struct xfs_inode	*ip)
+{
+	if (!ip)
+		return;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	trace_xfs_imeta_resv_free(ip, 0);
+
+	xfs_mod_delalloc(ip->i_mount, -ip->i_delayed_blks);
+	xfs_mod_fdblocks(ip->i_mount, ip->i_delayed_blks, true);
+	ip->i_delayed_blks = 0;
+	ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation. */
+int
+xfs_imeta_resv_init_inode(
+	struct xfs_inode	*ip,
+	xfs_filblks_t		ask)
+{
+	xfs_filblks_t		hidden_space;
+	xfs_filblks_t		used;
+	int			error;
+
+	if (!ip || ip->i_meta_resv_asked > 0)
+		return 0;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+
+	/*
+	 * Space taken by all other metadata btrees are accounted on-disk as
+	 * used space.  We therefore only hide the space that is reserved but
+	 * not used by the trees.
+	 */
+	used = ip->i_nblocks;
+	if (used > ask)
+		ask = used;
+	hidden_space = ask - used;
+
+	error = xfs_mod_fdblocks(ip->i_mount, -(int64_t)hidden_space, true);
+	if (error) {
+		trace_xfs_imeta_resv_init_error(ip, error, _RET_IP_);
+		return error;
+	}
+
+	xfs_mod_delalloc(ip->i_mount, hidden_space);
+	ip->i_delayed_blks = hidden_space;
+	ip->i_meta_resv_asked = ask;
+
+	trace_xfs_imeta_resv_init(ip, ask);
+	return 0;
+}
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index 3b5953efc01..f6dda8e5af0 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -102,6 +102,17 @@ unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
 
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_imeta_resv_critical(struct xfs_inode *ip);
+void xfs_imeta_resv_alloc_extent(struct xfs_inode *ip,
+		struct xfs_alloc_arg *args);
+void xfs_imeta_resv_free_extent(struct xfs_inode *ip, struct xfs_trans *tp,
+		xfs_filblks_t len);
+void xfs_imeta_resv_free_inode(struct xfs_inode *ip);
+int xfs_imeta_resv_init_inode(struct xfs_inode *ip, xfs_filblks_t ask);
+
 /* Must be implemented by the libxfs client */
 int xfs_imeta_iget(struct xfs_trans *tp, xfs_ino_t ino, unsigned char ftype,
 		struct xfs_inode **ipp);
diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h
index 195471c4385..ad2ce83874f 100644
--- a/libxfs/xfs_types.h
+++ b/libxfs/xfs_types.h
@@ -221,6 +221,13 @@ enum xfs_ag_resv_type {
 	 * altering fdblocks.  If you think you need this you're wrong.
 	 */
 	XFS_AG_RESV_IGNORE,
+
+	/*
+	 * This allocation activity is being done on behalf of a metadata file.
+	 * These files maintain their own permanent space reservations and are
+	 * required to adjust fdblocks using the xfs_imeta_resv_* helpers.
+	 */
+	XFS_AG_RESV_IMETA,
 };
 
 /* Results of scanning a btree keyspace to check occupancy. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/8] xfs: clean up extent free log intent item tracepoint callsites
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
@ 2023-12-27 13:05   ` Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 2/8] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:05 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore EFI structure to the tracepoints instead of open-coding
the argument passing.  This cleans up the call sites a bit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_trace.h |    5 ++---
 libxfs/xfs_alloc.c  |    7 +++----
 2 files changed, 5 insertions(+), 7 deletions(-)


diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index 5010b35b1f6..a6ae0ca13b6 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -13,8 +13,8 @@
 #define trace_xfbtree_trans_cancel_buf(...)	((void) 0)
 #define trace_xfbtree_trans_commit_buf(...)	((void) 0)
 
+#define trace_xfs_agfl_free_defer(...)		((void) 0)
 #define trace_xfs_agfl_reset(a,b,c,d)		((void) 0)
-#define trace_xfs_agfl_free_defer(a,b,c,d,e)	((void) 0)
 #define trace_xfs_alloc_cur_check(a,b,c,d,e,f)	((void) 0)
 #define trace_xfs_alloc_cur(a)			((void) 0)
 #define trace_xfs_alloc_cur_left(a)		((void) 0)
@@ -242,8 +242,7 @@
 #define trace_xfs_defer_item_pause(...)		((void) 0)
 #define trace_xfs_defer_item_unpause(...)	((void) 0)
 
-#define trace_xfs_bmap_free_defer(...)		((void) 0)
-#define trace_xfs_bmap_free_deferred(...)	((void) 0)
+#define trace_xfs_extent_free_defer(...)	((void) 0)
 
 #define trace_xfs_rmap_map(...)			((void) 0)
 #define trace_xfs_rmap_map_error(...)		((void) 0)
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 3d7686eadab..08cdfe7e3d3 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2569,7 +2569,7 @@ xfs_defer_agfl_block(
 	xefi->xefi_owner = oinfo->oi_owner;
 	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
 
-	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+	trace_xfs_agfl_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
@@ -2631,9 +2631,8 @@ xfs_defer_extent_free(
 	} else {
 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
 	}
-	trace_xfs_bmap_free_defer(mp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+
+	trace_xfs_extent_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
 	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/8] xfs: convert "skip_discard" to a proper flags bitset
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 1/8] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-27 13:05   ` Darrick J. Wong
  2023-12-27 13:06   ` [PATCH 3/8] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:05 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the boolean to skip discard on free into a proper flags field so
that we can add more flags in the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_ag.c             |    2 +-
 libxfs/xfs_alloc.c          |   13 +++++++------
 libxfs/xfs_alloc.h          |    9 +++++++--
 libxfs/xfs_bmap.c           |   12 ++++++++----
 libxfs/xfs_bmap_btree.c     |    2 +-
 libxfs/xfs_ialloc.c         |    5 ++---
 libxfs/xfs_ialloc_btree.c   |    2 +-
 libxfs/xfs_refcount.c       |    6 +++---
 libxfs/xfs_refcount_btree.c |    2 +-
 repair/bulkload.c           |    3 ++-
 10 files changed, 33 insertions(+), 23 deletions(-)


diff --git a/libxfs/xfs_ag.c b/libxfs/xfs_ag.c
index f22d58ad040..ac6b0090825 100644
--- a/libxfs/xfs_ag.c
+++ b/libxfs/xfs_ag.c
@@ -978,7 +978,7 @@ xfs_ag_shrink_space(
 			goto resv_err;
 
 		err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
-				XFS_AG_RESV_NONE, true);
+				XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD);
 		if (err2)
 			goto resv_err;
 
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 08cdfe7e3d3..442a045378c 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2587,7 +2587,7 @@ xfs_defer_extent_free(
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
-	bool				skip_discard,
+	unsigned int			free_flags,
 	struct xfs_defer_pending	**dfpp)
 {
 	struct xfs_extent_free_item	*xefi;
@@ -2607,6 +2607,7 @@ xfs_defer_extent_free(
 	ASSERT(len < mp->m_sb.sb_agblocks);
 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
 #endif
+	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
 	ASSERT(xfs_extfree_item_cache != NULL);
 	ASSERT(type != XFS_AG_RESV_AGFL);
 
@@ -2618,7 +2619,7 @@ xfs_defer_extent_free(
 	xefi->xefi_startblock = bno;
 	xefi->xefi_blockcount = (xfs_extlen_t)len;
 	xefi->xefi_agresv = type;
-	if (skip_discard)
+	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
 	if (oinfo) {
 		ASSERT(oinfo->oi_offset == 0);
@@ -2646,11 +2647,11 @@ xfs_free_extent_later(
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
-	bool				skip_discard)
+	unsigned int			free_flags)
 {
 	struct xfs_defer_pending	*dontcare = NULL;
 
-	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+	return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
 			&dontcare);
 }
 
@@ -2675,13 +2676,13 @@ xfs_free_extent_later(
 int
 xfs_alloc_schedule_autoreap(
 	const struct xfs_alloc_arg	*args,
-	bool				skip_discard,
+	unsigned int			free_flags,
 	struct xfs_alloc_autoreap	*aarp)
 {
 	int				error;
 
 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
-			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
+			&args->oinfo, args->resv, free_flags, &aarp->dfp);
 	if (error)
 		return error;
 
diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h
index 0b956f8b9d5..2da543fb90e 100644
--- a/libxfs/xfs_alloc.h
+++ b/libxfs/xfs_alloc.h
@@ -233,7 +233,12 @@ xfs_buf_to_agfl_bno(
 
 int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 		xfs_filblks_t len, const struct xfs_owner_info *oinfo,
-		enum xfs_ag_resv_type type, bool skip_discard);
+		enum xfs_ag_resv_type type, unsigned int free_flags);
+
+/* Don't issue a discard for the blocks freed. */
+#define XFS_FREE_EXTENT_SKIP_DISCARD	(1U << 0)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD)
 
 /*
  * List of extents to be free "later".
@@ -262,7 +267,7 @@ struct xfs_alloc_autoreap {
 };
 
 int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
-		bool skip_discard, struct xfs_alloc_autoreap *aarp);
+		unsigned int free_flags, struct xfs_alloc_autoreap *aarp);
 void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
 		struct xfs_alloc_autoreap *aarp);
 void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 7fefbb7d21c..323f60b1128 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -582,7 +582,7 @@ xfs_bmap_btree_to_extents(
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
 	error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
-			XFS_AG_RESV_NONE, false);
+			XFS_AG_RESV_NONE, 0);
 	if (error)
 		return error;
 
@@ -5260,11 +5260,15 @@ xfs_bmap_del_extent_real(
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
+			unsigned int	efi_flags = 0;
+
+			if ((bflags & XFS_BMAPI_NODISCARD) ||
+			    del->br_state == XFS_EXT_UNWRITTEN)
+				efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+
 			error = xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,
-					XFS_AG_RESV_NONE,
-					((bflags & XFS_BMAPI_NODISCARD) ||
-					del->br_state == XFS_EXT_UNWRITTEN));
+					XFS_AG_RESV_NONE, efi_flags);
 			if (error)
 				return error;
 		}
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index 8658e7c390a..f98cc408540 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -269,7 +269,7 @@ xfs_bmbt_free_block(
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
 	error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
-			XFS_AG_RESV_NONE, false);
+			XFS_AG_RESV_NONE, 0);
 	if (error)
 		return error;
 
diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c
index 8aae4b79c85..935f8127c0e 100644
--- a/libxfs/xfs_ialloc.c
+++ b/libxfs/xfs_ialloc.c
@@ -1960,7 +1960,7 @@ xfs_difree_inode_chunk(
 		return xfs_free_extent_later(tp,
 				XFS_AGB_TO_FSB(mp, agno, sagbno),
 				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
-				XFS_AG_RESV_NONE, false);
+				XFS_AG_RESV_NONE, 0);
 	}
 
 	/* holemask is only 16-bits (fits in an unsigned long) */
@@ -2006,8 +2006,7 @@ xfs_difree_inode_chunk(
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
 		error = xfs_free_extent_later(tp,
 				XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
-				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
-				false);
+				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
 		if (error)
 			return error;
 
diff --git a/libxfs/xfs_ialloc_btree.c b/libxfs/xfs_ialloc_btree.c
index 80d28d3fea5..a5f2ccec2e9 100644
--- a/libxfs/xfs_ialloc_btree.c
+++ b/libxfs/xfs_ialloc_btree.c
@@ -160,7 +160,7 @@ __xfs_inobt_free_block(
 	xfs_inobt_mod_blockcount(cur, -1);
 	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
 	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
-			&XFS_RMAP_OINFO_INOBT, resv, false);
+			&XFS_RMAP_OINFO_INOBT, resv, 0);
 }
 
 STATIC int
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index fe63b9eec8a..6c6634675c2 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1172,7 +1172,7 @@ xfs_refcount_adjust_extents(
 						tmp.rc_startblock);
 				error = xfs_free_extent_later(cur->bc_tp, fsbno,
 						  tmp.rc_blockcount, NULL,
-						  XFS_AG_RESV_NONE, false);
+						  XFS_AG_RESV_NONE, 0);
 				if (error)
 					goto out_error;
 			}
@@ -1236,7 +1236,7 @@ xfs_refcount_adjust_extents(
 					ext.rc_startblock);
 			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					ext.rc_blockcount, NULL,
-					XFS_AG_RESV_NONE, false);
+					XFS_AG_RESV_NONE, 0);
 			if (error)
 				goto out_error;
 		}
@@ -2021,7 +2021,7 @@ xfs_refcount_recover_cow_leftovers(
 		/* Free the block. */
 		error = xfs_free_extent_later(tp, fsb,
 				rr->rr_rrec.rc_blockcount, NULL,
-				XFS_AG_RESV_NONE, false);
+				XFS_AG_RESV_NONE, 0);
 		if (error)
 			goto out_trans;
 
diff --git a/libxfs/xfs_refcount_btree.c b/libxfs/xfs_refcount_btree.c
index 1fbd250c1a8..c57e89f4be8 100644
--- a/libxfs/xfs_refcount_btree.c
+++ b/libxfs/xfs_refcount_btree.c
@@ -107,7 +107,7 @@ xfs_refcountbt_free_block(
 	be32_add_cpu(&agf->agf_refcount_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
 	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
-			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
+			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0);
 }
 
 STATIC int
diff --git a/repair/bulkload.c b/repair/bulkload.c
index a97839f549d..e9c52afd23c 100644
--- a/repair/bulkload.c
+++ b/repair/bulkload.c
@@ -196,7 +196,8 @@ bulkload_free_extent(
 	 */
 	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
 	error = -libxfs_free_extent_later(sc->tp, fsbno, free_aglen,
-			&bkl->oinfo, XFS_AG_RESV_NONE, true);
+			&bkl->oinfo, XFS_AG_RESV_NONE,
+			XFS_FREE_EXTENT_SKIP_DISCARD);
 	if (error)
 		return error;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/8] xfs: pass the fsbno to xfs_perag_intent_get
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 1/8] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 2/8] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
@ 2023-12-27 13:06   ` Darrick J. Wong
  2023-12-27 13:06   ` [PATCH 4/8] xfs: add a xefi_entry helper Darrick J. Wong
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:06 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

All callers of xfs_perag_intent_get have a fsbno and need boilerplate
code to turn that into an agno.  Just pass the fsbno to
xfs_perag_intent_get and look up the agno there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h |    3 ++-
 libxfs/defer_item.c |   21 ++++-----------------
 2 files changed, 6 insertions(+), 18 deletions(-)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 51a02e69776..1284e848835 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -317,7 +317,8 @@ struct xfs_defer_drain { /* empty */ };
 #define xfs_defer_drain_init(dr)		((void)0)
 #define xfs_defer_drain_free(dr)		((void)0)
 
-#define xfs_perag_intent_get(mp, agno)		xfs_perag_get((mp), (agno))
+#define xfs_perag_intent_get(mp, agno) \
+	xfs_perag_get((mp), XFS_FSB_TO_AGNO((mp), (agno)))
 #define xfs_perag_intent_put(pag)		xfs_perag_put(pag)
 
 static inline void xfs_perag_intent_hold(struct xfs_perag *pag) {}
diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index fbd035d9b94..2958dcb85e5 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -79,10 +79,7 @@ xfs_extent_free_get_group(
 	struct xfs_mount		*mp,
 	struct xfs_extent_free_item	*xefi)
 {
-	xfs_agnumber_t			agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
-	xefi->xefi_pag = xfs_perag_intent_get(mp, agno);
+	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
 }
 
 /* Release an active AG ref after some freeing work. */
@@ -256,10 +253,7 @@ xfs_rmap_update_get_group(
 	struct xfs_mount	*mp,
 	struct xfs_rmap_intent	*ri)
 {
-	xfs_agnumber_t	agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
-	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
 }
 
 /* Release an active AG ref after finishing rmapping work. */
@@ -369,10 +363,7 @@ xfs_refcount_update_get_group(
 	struct xfs_mount		*mp,
 	struct xfs_refcount_intent	*ri)
 {
-	xfs_agnumber_t			agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock);
-	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
 }
 
 /* Release an active AG ref after finishing refcounting work. */
@@ -490,8 +481,6 @@ xfs_bmap_update_get_group(
 	struct xfs_mount	*mp,
 	struct xfs_bmap_intent	*bi)
 {
-	xfs_agnumber_t		agno;
-
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
 		if (xfs_has_rtgroups(mp)) {
 			xfs_rgnumber_t	rgno;
@@ -505,8 +494,6 @@ xfs_bmap_update_get_group(
 		return;
 	}
 
-	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
-
 	/*
 	 * Bump the intent count on behalf of the deferred rmap and refcount
 	 * intent items that that we can queue when we finish this bmap work.
@@ -514,7 +501,7 @@ xfs_bmap_update_get_group(
 	 * intent drops the intent count, ensuring that the intent count
 	 * remains nonzero across the transaction roll.
 	 */
-	bi->bi_pag = xfs_perag_intent_get(mp, agno);
+	bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock);
 }
 
 /* Add this deferred BUI to the transaction. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/8] xfs: add a xefi_entry helper
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:06   ` [PATCH 3/8] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
@ 2023-12-27 13:06   ` Darrick J. Wong
  2023-12-27 13:06   ` [PATCH 5/8] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:06 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a helper to translate from the item list head to the
xfs_extent_free_item structure and use it so shorten assignments and
avoid the need for extra local variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 2958dcb85e5..5fa54962267 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -32,6 +32,11 @@
 
 /* Extent Freeing */
 
+static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_extent_free_item, xefi_list);
+}
+
 /* Sort bmap items by AG. */
 static int
 xfs_extent_free_diff_items(
@@ -39,11 +44,8 @@ xfs_extent_free_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	const struct xfs_extent_free_item *ra;
-	const struct xfs_extent_free_item *rb;
-
-	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
-	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+	struct xfs_extent_free_item	*ra = xefi_entry(a);
+	struct xfs_extent_free_item	*rb = xefi_entry(b);
 
 	return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
 }
@@ -99,12 +101,10 @@ xfs_extent_free_finish_item(
 	struct xfs_btree_cur		**state)
 {
 	struct xfs_owner_info		oinfo = { };
-	struct xfs_extent_free_item	*xefi;
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 	xfs_agblock_t			agbno;
 	int				error = 0;
 
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
-
 	oinfo.oi_owner = xefi->xefi_owner;
 	if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
 		oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
@@ -143,9 +143,7 @@ STATIC void
 xfs_extent_free_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_extent_free_item	*xefi;
-
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
 	xfs_extent_free_put_group(xefi);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
@@ -173,13 +171,11 @@ xfs_agfl_free_finish_item(
 {
 	struct xfs_owner_info		oinfo = { };
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_extent_free_item	*xefi;
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 	struct xfs_buf			*agbp;
 	int				error;
 	xfs_agblock_t			agbno;
 
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
-
 	ASSERT(xefi->xefi_blockcount == 1);
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
 	oinfo.oi_owner = xefi->xefi_owner;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/8] xfs: reuse xfs_extent_free_cancel_item
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:06   ` [PATCH 4/8] xfs: add a xefi_entry helper Darrick J. Wong
@ 2023-12-27 13:06   ` Darrick J. Wong
  2023-12-27 13:06   ` [PATCH 6/8] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:06 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reuse xfs_extent_free_cancel_item to put the AG/RTG and free the item in
a few places that currently open code the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 5fa54962267..b159f22c1c0 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -92,6 +92,17 @@ xfs_extent_free_put_group(
 	xfs_perag_intent_put(xefi->xefi_pag);
 }
 
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+
+	xfs_extent_free_put_group(xefi);
+	kmem_cache_free(xfs_extfree_item_cache, xefi);
+}
+
 /* Process a free extent. */
 STATIC int
 xfs_extent_free_finish_item(
@@ -123,11 +134,8 @@ xfs_extent_free_finish_item(
 	 * Don't free the XEFI if we need a new transaction to complete
 	 * processing of it.
 	 */
-	if (error == -EAGAIN)
-		return error;
-
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	if (error != -EAGAIN)
+		xfs_extent_free_cancel_item(item);
 	return error;
 }
 
@@ -138,17 +146,6 @@ xfs_extent_free_abort_intent(
 {
 }
 
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_extent_free_item	*xefi = xefi_entry(item);
-
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
-}
-
 const struct xfs_defer_op_type xfs_extent_free_defer_type = {
 	.name		= "extent_free",
 	.create_intent	= xfs_extent_free_create_intent,
@@ -185,8 +182,7 @@ xfs_agfl_free_finish_item(
 		error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
 				agbno, agbp, &oinfo);
 
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	xfs_extent_free_cancel_item(item);
 	return error;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/8] xfs: remove duplicate asserts in xfs_defer_extent_free
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:06   ` [PATCH 5/8] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
@ 2023-12-27 13:06   ` Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 7/8] xfs: remove xfs_defer_agfl_block Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 8/8] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:06 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

The bno/len verification is already done by the calls to
xfs_verify_rtbext / xfs_verify_fsbext, and reporting a corruption error
seem like the better handling than tripping an assert anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_alloc.c |   13 -------------
 1 file changed, 13 deletions(-)


diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 442a045378c..160563b1b26 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2592,23 +2592,10 @@ xfs_defer_extent_free(
 {
 	struct xfs_extent_free_item	*xefi;
 	struct xfs_mount		*mp = tp->t_mountp;
-#ifdef DEBUG
-	xfs_agnumber_t			agno;
-	xfs_agblock_t			agbno;
 
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(len > 0);
 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 	ASSERT(!isnullstartblock(bno));
-	agno = XFS_FSB_TO_AGNO(mp, bno);
-	agbno = XFS_FSB_TO_AGBNO(mp, bno);
-	ASSERT(agno < mp->m_sb.sb_agcount);
-	ASSERT(agbno < mp->m_sb.sb_agblocks);
-	ASSERT(len < mp->m_sb.sb_agblocks);
-	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
 	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
-	ASSERT(xfs_extfree_item_cache != NULL);
 	ASSERT(type != XFS_AG_RESV_AGFL);
 
 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/8] xfs: remove xfs_defer_agfl_block
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:06   ` [PATCH 6/8] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
@ 2023-12-27 13:07   ` Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 8/8] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:07 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

xfs_free_extent_later can handle the extra AGFL special casing with
very little extra logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_alloc.c |   67 +++++++++++++++++-----------------------------------
 1 file changed, 22 insertions(+), 45 deletions(-)


diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 160563b1b26..2cbdbd4c416 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2534,48 +2534,6 @@ xfs_agfl_reset(
 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
 }
 
-/*
- * Defer an AGFL block free. This is effectively equivalent to
- * xfs_free_extent_later() with some special handling particular to AGFL blocks.
- *
- * Deferring AGFL frees helps prevent log reservation overruns due to too many
- * allocation operations in a transaction. AGFL frees are prone to this problem
- * because for one they are always freed one at a time. Further, an immediate
- * AGFL block free can cause a btree join and require another block free before
- * the real allocation can proceed. Deferring the free disconnects freeing up
- * the AGFL slot from freeing the block.
- */
-static int
-xfs_defer_agfl_block(
-	struct xfs_trans		*tp,
-	xfs_agnumber_t			agno,
-	xfs_agblock_t			agbno,
-	struct xfs_owner_info		*oinfo)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_extent_free_item	*xefi;
-	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
-
-	ASSERT(xfs_extfree_item_cache != NULL);
-	ASSERT(oinfo != NULL);
-
-	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
-		return -EFSCORRUPTED;
-
-	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
-			       GFP_KERNEL | __GFP_NOFAIL);
-	xefi->xefi_startblock = fsbno;
-	xefi->xefi_blockcount = 1;
-	xefi->xefi_owner = oinfo->oi_owner;
-	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
-
-	trace_xfs_agfl_free_defer(mp, xefi);
-
-	xfs_extent_free_get_group(mp, xefi);
-	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
-	return 0;
-}
-
 /*
  * Add the extent to the list of extents to be free at transaction end.
  * The list is maintained sorted (by block number).
@@ -2623,7 +2581,13 @@ xfs_defer_extent_free(
 	trace_xfs_extent_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
-	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
+
+	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_agfl_free_defer_type);
+	else
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_extent_free_defer_type);
 	return 0;
 }
 
@@ -2881,8 +2845,21 @@ xfs_alloc_fix_freelist(
 		if (error)
 			goto out_agbp_relse;
 
-		/* defer agfl frees */
-		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		/*
+		 * Defer the AGFL block free.
+		 *
+		 * This helps to prevent log reservation overruns due to too
+		 * many allocation operations in a transaction. AGFL frees are
+		 * prone to this problem because for one they are always freed
+		 * one at a time.  Further, an immediate AGFL block free can
+		 * cause a btree join and require another block free before the
+		 * real allocation can proceed.
+		 * Deferring the free disconnects freeing up the AGFL slot from
+		 * freeing the block.
+		 */
+		error = xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, args->agno, bno), 1,
+				&targs.oinfo, XFS_AG_RESV_AGFL, 0);
 		if (error)
 			goto out_agbp_relse;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/8] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:07   ` [PATCH 7/8] xfs: remove xfs_defer_agfl_block Darrick J. Wong
@ 2023-12-27 13:07   ` Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:07 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_extent_free_item deferred work
data to a transaction live with the EFI log item code.  This means that
the allocator code no longer has to know about the inner workings of the
EFI log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   28 +++++++++++++++-------------
 libxfs/defer_item.h |    6 ++++++
 libxfs/xfs_alloc.c  |   12 ++----------
 libxfs/xfs_alloc.h  |    3 ---
 4 files changed, 23 insertions(+), 26 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index b159f22c1c0..9b9bce17f4e 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -27,6 +27,7 @@
 #include "defer_item.h"
 #include "xfs_ag.h"
 #include "xfs_swapext.h"
+#include "defer_item.h"
 
 /* Dummy defer item ops, since we don't do logging. */
 
@@ -75,21 +76,22 @@ xfs_extent_free_create_done(
 	return NULL;
 }
 
-/* Take an active ref to the AG containing the space we're freeing. */
+/* Add this deferred EFI to the transaction. */
 void
-xfs_extent_free_get_group(
-	struct xfs_mount		*mp,
-	struct xfs_extent_free_item	*xefi)
+xfs_extent_free_defer_add(
+	struct xfs_trans		*tp,
+	struct xfs_extent_free_item	*xefi,
+	struct xfs_defer_pending	**dfpp)
 {
+	struct xfs_mount		*mp = tp->t_mountp;
+
 	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
-}
-
-/* Release an active AG ref after some freeing work. */
-static inline void
-xfs_extent_free_put_group(
-	struct xfs_extent_free_item	*xefi)
-{
-	xfs_perag_intent_put(xefi->xefi_pag);
+	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_agfl_free_defer_type);
+	else
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_extent_free_defer_type);
 }
 
 /* Cancel a free extent. */
@@ -99,7 +101,7 @@ xfs_extent_free_cancel_item(
 {
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
-	xfs_extent_free_put_group(xefi);
+	xfs_perag_intent_put(xefi->xefi_pag);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 }
 
diff --git a/libxfs/defer_item.h b/libxfs/defer_item.h
index a3ef9e079d0..79e957eb8ff 100644
--- a/libxfs/defer_item.h
+++ b/libxfs/defer_item.h
@@ -14,4 +14,10 @@ struct xfs_swapext_intent;
 
 void xfs_swapext_defer_add(struct xfs_trans *tp, struct xfs_swapext_intent *sxi);
 
+struct xfs_extent_free_item;
+
+void xfs_extent_free_defer_add(struct xfs_trans *tp,
+		struct xfs_extent_free_item *xefi,
+		struct xfs_defer_pending **dfpp);
+
 #endif /* __LIBXFS_DEFER_ITEM_H_ */
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 2cbdbd4c416..36af2c087b0 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -23,6 +23,7 @@
 #include "xfs_ag_resv.h"
 #include "xfs_bmap.h"
 #include "xfs_health.h"
+#include "defer_item.h"
 
 struct kmem_cache	*xfs_extfree_item_cache;
 
@@ -2578,16 +2579,7 @@ xfs_defer_extent_free(
 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
 	}
 
-	trace_xfs_extent_free_defer(mp, xefi);
-
-	xfs_extent_free_get_group(mp, xefi);
-
-	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
-		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
-				&xfs_agfl_free_defer_type);
-	else
-		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
-				&xfs_extent_free_defer_type);
+	xfs_extent_free_defer_add(tp, xefi, dfpp);
 	return 0;
 }
 
diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h
index 2da543fb90e..0ed71a31fe7 100644
--- a/libxfs/xfs_alloc.h
+++ b/libxfs/xfs_alloc.h
@@ -254,9 +254,6 @@ struct xfs_extent_free_item {
 	enum xfs_ag_resv_type	xefi_agresv;
 };
 
-void xfs_extent_free_get_group(struct xfs_mount *mp,
-		struct xfs_extent_free_item *xefi);
-
 #define XFS_EFI_SKIP_DISCARD	(1U << 0) /* don't issue discard */
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/3] xfs: support logging EFIs for realtime extents
  2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
@ 2023-12-27 13:07   ` Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 2/3] xfs: support error injection when freeing rt extents Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 3/3] xfs_logprint: report realtime EFIs Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:07 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the EFI mechanism how to free realtime extents.  We're going to
need this to enforce proper ordering of operations when we enable
realtime rmap.

Declare a new log intent item type (XFS_LI_EFI_RT) and a separate defer
ops for rt extents.  This keeps the ondisk artifacts and processing code
completely separate between the rt and non-rt cases.  Hopefully this
will make it easier to debug filesystem problems.

Previous versions of this patch accomplished this by setting the high
bit in each rt EFI extent.  This was found to be less transparent by
reviewers.

[Contains a bug fix and cleanups from hch]

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c     |   75 +++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_alloc.c      |   16 ++++++++--
 libxfs/xfs_alloc.h      |   17 +++++++++--
 libxfs/xfs_defer.c      |    6 ++++
 libxfs/xfs_defer.h      |    1 +
 libxfs/xfs_log_format.h |    6 +++-
 6 files changed, 115 insertions(+), 6 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 9b9bce17f4e..82b70575bc5 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -85,6 +85,17 @@ xfs_extent_free_defer_add(
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 
+	if (xfs_efi_is_realtime(xefi)) {
+		xfs_rgnumber_t		rgno;
+
+		rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
+		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_rtextent_free_defer_type);
+		return;
+	}
+
 	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
 	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
 		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
@@ -157,6 +168,70 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
 	.cancel_item	= xfs_extent_free_cancel_item,
 };
 
+/* Sort bmap items by rtgroup. */
+static int
+xfs_rtextent_free_diff_items(
+	void				*priv,
+	const struct list_head		*a,
+	const struct list_head		*b)
+{
+	struct xfs_extent_free_item	*ra = xefi_entry(a);
+	struct xfs_extent_free_item	*rb = xefi_entry(b);
+
+	return ra->xefi_rtg->rtg_rgno - rb->xefi_rtg->rtg_rgno;
+}
+
+static struct xfs_log_item *
+xfs_rtextent_free_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	if (sort)
+		list_sort(mp, items, xfs_rtextent_free_diff_items);
+	return NULL;
+}
+
+/* Cancel a free extent. */
+STATIC void
+xfs_rtextent_free_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+
+	xfs_rtgroup_put(xefi->xefi_rtg);
+	kmem_cache_free(xfs_extfree_item_cache, xefi);
+}
+
+STATIC int
+xfs_rtextent_free_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+	int				error;
+
+	error = xfs_rtfree_blocks(tp, xefi->xefi_startblock,
+			xefi->xefi_blockcount);
+	if (error != -EAGAIN)
+		xfs_rtextent_free_cancel_item(item);
+	return error;
+}
+
+const struct xfs_defer_op_type xfs_rtextent_free_defer_type = {
+	.name		= "rtextent_free",
+	.create_intent	= xfs_rtextent_free_create_intent,
+	.abort_intent	= xfs_extent_free_abort_intent,
+	.create_done	= xfs_extent_free_create_done,
+	.finish_item	= xfs_rtextent_free_finish_item,
+	.cancel_item	= xfs_rtextent_free_cancel_item,
+};
+
 /*
  * AGFL blocks are accounted differently in the reserve pools and are not
  * inserted into the busy extent list.
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 36af2c087b0..589e9ef3003 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2555,10 +2555,18 @@ xfs_defer_extent_free(
 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 	ASSERT(!isnullstartblock(bno));
 	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
-	ASSERT(type != XFS_AG_RESV_AGFL);
 
-	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
-		return -EFSCORRUPTED;
+	if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+		if (type != XFS_AG_RESV_NONE) {
+			ASSERT(type == XFS_AG_RESV_NONE);
+			return -EFSCORRUPTED;
+		}
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	} else {
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	}
 
 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
 			       GFP_KERNEL | __GFP_NOFAIL);
@@ -2567,6 +2575,8 @@ xfs_defer_extent_free(
 	xefi->xefi_agresv = type;
 	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+	if (free_flags & XFS_FREE_EXTENT_REALTIME)
+		xefi->xefi_flags |= XFS_EFI_REALTIME;
 	if (oinfo) {
 		ASSERT(oinfo->oi_offset == 0);
 
diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h
index 0ed71a31fe7..130026e981e 100644
--- a/libxfs/xfs_alloc.h
+++ b/libxfs/xfs_alloc.h
@@ -238,7 +238,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 /* Don't issue a discard for the blocks freed. */
 #define XFS_FREE_EXTENT_SKIP_DISCARD	(1U << 0)
 
-#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD)
+/* Free blocks on the realtime device. */
+#define XFS_FREE_EXTENT_REALTIME	(1U << 1)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD | \
+					 XFS_FREE_EXTENT_REALTIME)
 
 /*
  * List of extents to be free "later".
@@ -249,7 +253,10 @@ struct xfs_extent_free_item {
 	uint64_t		xefi_owner;
 	xfs_fsblock_t		xefi_startblock;/* starting fs block number */
 	xfs_extlen_t		xefi_blockcount;/* number of blocks in extent */
-	struct xfs_perag	*xefi_pag;
+	union {
+		struct xfs_perag	*xefi_pag;
+		struct xfs_rtgroup	*xefi_rtg;
+	};
 	unsigned int		xefi_flags;
 	enum xfs_ag_resv_type	xefi_agresv;
 };
@@ -258,6 +265,12 @@ struct xfs_extent_free_item {
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
 #define XFS_EFI_CANCELLED	(1U << 3) /* dont actually free the space */
+#define XFS_EFI_REALTIME	(1U << 4) /* freeing realtime extent */
+
+static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi)
+{
+	return xefi->xefi_flags & XFS_EFI_REALTIME;
+}
 
 struct xfs_alloc_autoreap {
 	struct xfs_defer_pending	*dfp;
diff --git a/libxfs/xfs_defer.c b/libxfs/xfs_defer.c
index 41e607d55f0..4a1139913b9 100644
--- a/libxfs/xfs_defer.c
+++ b/libxfs/xfs_defer.c
@@ -839,6 +839,12 @@ xfs_defer_add(
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 
+	if (!ops->finish_item) {
+		ASSERT(ops->finish_item != NULL);
+		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+		return NULL;
+	}
+
 	dfp = xfs_defer_find_last(tp, ops);
 	if (!dfp || !xfs_defer_can_append(dfp, ops))
 		dfp = xfs_defer_alloc(tp, ops);
diff --git a/libxfs/xfs_defer.h b/libxfs/xfs_defer.h
index c9a1fe3fe36..b4e1c386768 100644
--- a/libxfs/xfs_defer.h
+++ b/libxfs/xfs_defer.h
@@ -71,6 +71,7 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_attr_defer_type;
 extern const struct xfs_defer_op_type xfs_swapext_defer_type;
 
diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index bded03634e5..1f5fe4a588e 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -248,6 +248,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
 #define	XFS_LI_SXI		0x1248  /* extent swap intent */
 #define	XFS_LI_SXD		0x1249  /* extent swap done */
+#define	XFS_LI_EFI_RT		0x124a	/* realtime extent free intent */
+#define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -267,7 +269,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
 	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
 	{ XFS_LI_SXI,		"XFS_LI_SXI" }, \
-	{ XFS_LI_SXD,		"XFS_LI_SXD" }
+	{ XFS_LI_SXD,		"XFS_LI_SXD" }, \
+	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
+	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }
 
 /*
  * Inode Log Item Format definitions.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/3] xfs: support error injection when freeing rt extents
  2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 1/3] xfs: support logging EFIs for realtime extents Darrick J. Wong
@ 2023-12-27 13:07   ` Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 3/3] xfs_logprint: report realtime EFIs Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:07 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

A handful of fstests expect to be able to test what happens when extent
free intents fail to actually free the extent.  Now that we're
supporting EFIs for realtime extents, add to xfs_rtfree_extent the same
injection point that exists in the regular extent freeing code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtbitmap.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index 7d29a72be6b..a42e54b28b9 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,7 @@
 #include "xfs_trans.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_health.h"
+#include "xfs_errortag.h"
 
 /*
  * Realtime allocator bitmap functions shared with userspace.
@@ -1036,6 +1037,9 @@ xfs_rtfree_extent(
 	ASSERT(mp->m_rbmip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+		return -EIO;
+
 	error = xfs_rtcheck_alloc_range(&args, start, len);
 	if (error)
 		return error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/3] xfs_logprint: report realtime EFIs
  2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 1/3] xfs: support logging EFIs for realtime extents Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 2/3] xfs: support error injection when freeing rt extents Darrick J. Wong
@ 2023-12-27 13:08   ` Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:08 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Decode the EFI format just enough to report if an EFI targets the
realtime device or not.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 logprint/log_misc.c      |    2 ++
 logprint/log_print_all.c |    8 ++++++
 logprint/log_redo.c      |   57 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 53 insertions(+), 14 deletions(-)


diff --git a/logprint/log_misc.c b/logprint/log_misc.c
index 565e7b76284..9d63f376390 100644
--- a/logprint/log_misc.c
+++ b/logprint/log_misc.c
@@ -997,12 +997,14 @@ xlog_print_record(
 					&i, num_ops);
 			break;
 		    }
+		    case XFS_LI_EFI_RT:
 		    case XFS_LI_EFI: {
 			skip = xlog_print_trans_efi(&ptr,
 					be32_to_cpu(op_head->oh_len),
 					continued);
 			break;
 		    }
+		    case XFS_LI_EFD_RT:
 		    case XFS_LI_EFD: {
 			skip = xlog_print_trans_efd(&ptr,
 					be32_to_cpu(op_head->oh_len));
diff --git a/logprint/log_print_all.c b/logprint/log_print_all.c
index 6e528fcd097..d030efa9efb 100644
--- a/logprint/log_print_all.c
+++ b/logprint/log_print_all.c
@@ -410,9 +410,11 @@ xlog_recover_print_logitem(
 	case XFS_LI_INODE:
 		xlog_recover_print_inode(item);
 		break;
+	case XFS_LI_EFD_RT:
 	case XFS_LI_EFD:
 		xlog_recover_print_efd(item);
 		break;
+	case XFS_LI_EFI_RT:
 	case XFS_LI_EFI:
 		xlog_recover_print_efi(item);
 		break;
@@ -474,6 +476,12 @@ xlog_recover_print_item(
 	case XFS_LI_INODE:
 		printf("INO");
 		break;
+	case XFS_LI_EFD_RT:
+		printf("EFD_RT");
+		break;
+	case XFS_LI_EFI_RT:
+		printf("EFI_RT");
+		break;
 	case XFS_LI_EFD:
 		printf("EFD");
 		break;
diff --git a/logprint/log_redo.c b/logprint/log_redo.c
index 948924d5bcb..0cc3cd4ba28 100644
--- a/logprint/log_redo.c
+++ b/logprint/log_redo.c
@@ -67,6 +67,7 @@ xlog_print_trans_efi(
 	uint			src_len,
 	int			continued)
 {
+	const char		*item_name = "EFI?";
 	xfs_efi_log_format_t	*src_f, *f = NULL;
 	uint			dst_len;
 	xfs_extent_t		*ex;
@@ -103,8 +104,14 @@ xlog_print_trans_efi(
 		goto error;
 	}
 
-	printf(_("EFI:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
-		f->efi_size, f->efi_nextents, (unsigned long long)f->efi_id);
+	switch (f->efi_type) {
+	case XFS_LI_EFI:	item_name = "EFI"; break;
+	case XFS_LI_EFI_RT:	item_name = "EFI_RT"; break;
+	}
+
+	printf(_("%s:  #regs: %d	num_extents: %u  id: 0x%llx\n"),
+			item_name, f->efi_size, f->efi_nextents,
+			(unsigned long long)f->efi_id);
 
 	if (continued) {
 		printf(_("EFI free extent data skipped (CONTINUE set, no space)\n"));
@@ -113,7 +120,7 @@ xlog_print_trans_efi(
 
 	ex = f->efi_extents;
 	for (i=0; i < f->efi_nextents; i++) {
-		printf("(s: 0x%llx, l: %d) ",
+		printf("(s: 0x%llx, l: %u) ",
 			(unsigned long long)ex->ext_start, ex->ext_len);
 		if (i % 4 == 3) printf("\n");
 		ex++;
@@ -130,6 +137,7 @@ void
 xlog_recover_print_efi(
 	struct xlog_recover_item *item)
 {
+	const char		*item_name = "EFI?";
 	xfs_efi_log_format_t	*f, *src_f;
 	xfs_extent_t		*ex;
 	int			i;
@@ -155,12 +163,18 @@ xlog_recover_print_efi(
 		return;
 	}
 
-	printf(_("	EFI:  #regs:%d	num_extents:%d  id:0x%llx\n"),
-		   f->efi_size, f->efi_nextents, (unsigned long long)f->efi_id);
+	switch (f->efi_type) {
+	case XFS_LI_EFI:	item_name = "EFI"; break;
+	case XFS_LI_EFI_RT:	item_name = "EFI_RT"; break;
+	}
+
+	printf(_("	%s:  #regs:%d	num_extents:%u  id:0x%llx\n"),
+			item_name, f->efi_size, f->efi_nextents,
+			(unsigned long long)f->efi_id);
 	ex = f->efi_extents;
 	printf("	");
 	for (i=0; i< f->efi_nextents; i++) {
-		printf("(s: 0x%llx, l: %d) ",
+		printf("(s: 0x%llx, l: %u) ",
 			(unsigned long long)ex->ext_start, ex->ext_len);
 		if (i % 4 == 3)
 			printf("\n");
@@ -174,8 +188,10 @@ xlog_recover_print_efi(
 int
 xlog_print_trans_efd(char **ptr, uint len)
 {
-	xfs_efd_log_format_t *f;
-	xfs_efd_log_format_t lbuf;
+	const char		*item_name = "EFD?";
+	xfs_efd_log_format_t	*f;
+	xfs_efd_log_format_t	lbuf;
+
 	/* size without extents at end */
 	uint core_size = sizeof(xfs_efd_log_format_t);
 
@@ -185,11 +201,17 @@ xlog_print_trans_efd(char **ptr, uint len)
 	 */
 	memmove(&lbuf, *ptr, min(core_size, len));
 	f = &lbuf;
+
+	switch (f->efd_type) {
+	case XFS_LI_EFD:	item_name = "EFD"; break;
+	case XFS_LI_EFD_RT:	item_name = "EFD_RT"; break;
+	}
+
 	*ptr += len;
 	if (len >= core_size) {
-		printf(_("EFD:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
-			f->efd_size, f->efd_nextents,
-			(unsigned long long)f->efd_efi_id);
+		printf(_("%s:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
+				item_name, f->efd_size, f->efd_nextents,
+				(unsigned long long)f->efd_efi_id);
 
 		/* don't print extents as they are not used */
 
@@ -204,18 +226,25 @@ void
 xlog_recover_print_efd(
 	struct xlog_recover_item *item)
 {
+	const char		*item_name = "EFD?";
 	xfs_efd_log_format_t	*f;
 
 	f = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+
+	switch (f->efd_type) {
+	case XFS_LI_EFD:	item_name = "EFD"; break;
+	case XFS_LI_EFD_RT:	item_name = "EFD_RT"; break;
+	}
+
 	/*
 	 * An xfs_efd_log_format structure contains a variable length array
 	 * as the last field.
 	 * Each element is of size xfs_extent_32_t or xfs_extent_64_t.
 	 * However, the extents are never used and won't be printed.
 	 */
-	printf(_("	EFD:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
-		f->efd_size, f->efd_nextents,
-		(unsigned long long)f->efd_efi_id);
+	printf(_("	%s:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
+			item_name, f->efd_size, f->efd_nextents,
+			(unsigned long long)f->efd_efi_id);
 }
 
 /* Reverse Mapping Update Items */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/9] xfs: attach rtgroup objects to btree cursors
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
@ 2023-12-27 13:08   ` Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 2/9] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:08 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it so that we can attach realtime group objects to btree cursors.
This will be crucial for enabling rmap btrees in realtime groups.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c |    4 ++++
 libxfs/xfs_btree.h |    2 ++
 2 files changed, 6 insertions(+)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index 165ce251376..e0276ad655a 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -28,6 +28,7 @@
 #include "xfile.h"
 #include "xfbtree.h"
 #include "xfs_btree_mem.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Btree magic numbers.
@@ -473,6 +474,9 @@ xfs_btree_del_cursor(
 	       xfs_is_shutdown(cur->bc_mp) || error != 0);
 	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 		kmem_free(cur->bc_ops);
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    !(cur->bc_flags & XFS_BTREE_IN_XFILE) && cur->bc_ino.rtg)
+		xfs_rtgroup_put(cur->bc_ino.rtg);
 	if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
 	    !(cur->bc_flags & XFS_BTREE_IN_XFILE) && cur->bc_ag.pag)
 		xfs_perag_put(cur->bc_ag.pag);
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index bb6c2feecea..ce0bc5dfffe 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -12,6 +12,7 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_ifork;
 struct xfs_perag;
+struct xfs_rtgroup;
 
 /*
  * Generic key, ptr and record wrapper structures.
@@ -247,6 +248,7 @@ struct xfs_btree_cur_ag {
 /* Btree-in-inode cursor information */
 struct xfs_btree_cur_ino {
 	struct xfs_inode		*ip;
+	struct xfs_rtgroup		*rtg;	/* if realtime metadata */
 	struct xbtree_ifakeroot		*ifake;	/* for staging cursor */
 	int				allocated;
 	short				forksize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/9] xfs: give rmap btree cursor error tracepoints their own class
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 1/9] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
@ 2023-12-27 13:08   ` Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 3/9] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:08 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new tracepoint class for btree-related errors, then convert all
the rmap tracepoints to use it.  Also fix the one tracepoint that was
abusing the old class by making it a separate tracepoint.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |   33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 8df591840dc..5b2cac8302a 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -110,8 +110,7 @@ xfs_rmap_update(
 			xfs_rmap_irec_offset_pack(irec));
 	error = xfs_btree_update(cur, &rec);
 	if (error)
-		trace_xfs_rmap_update_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_update_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -154,8 +153,7 @@ xfs_rmap_insert(
 	}
 done:
 	if (error)
-		trace_xfs_rmap_insert_error(rcur->bc_mp,
-				rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_insert_error(rcur, error, _RET_IP_);
 	return error;
 }
 
@@ -193,8 +191,7 @@ xfs_rmap_delete(
 	}
 done:
 	if (error)
-		trace_xfs_rmap_delete_error(rcur->bc_mp,
-				rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_delete_error(rcur, error, _RET_IP_);
 	return error;
 }
 
@@ -815,8 +812,7 @@ xfs_rmap_unmap(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
-				error, _RET_IP_);
+		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1138,8 +1134,7 @@ xfs_rmap_map(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
-				error, _RET_IP_);
+		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1334,8 +1329,7 @@ xfs_rmap_convert(
 	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
 		state &= ~RMAP_RIGHT_CONTIG;
 
-	trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
-			_RET_IP_);
+	trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
 
 	/* reset the cursor back to PREV */
 	error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
@@ -1688,8 +1682,7 @@ xfs_rmap_convert(
 			unwritten, oinfo);
 done:
 	if (error)
-		trace_xfs_rmap_convert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1812,8 +1805,7 @@ xfs_rmap_convert_shared(
 	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
 		state &= ~RMAP_RIGHT_CONTIG;
 
-	trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
-			_RET_IP_);
+	trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
@@ -2115,8 +2107,7 @@ xfs_rmap_convert_shared(
 			unwritten, oinfo);
 done:
 	if (error)
-		trace_xfs_rmap_convert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -2315,8 +2306,7 @@ xfs_rmap_unmap_shared(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_unmap_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -2476,8 +2466,7 @@ xfs_rmap_map_shared(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_map_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
 	return error;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/9] xfs: prepare rmap btree tracepoints for widening
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 1/9] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 2/9] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
@ 2023-12-27 13:08   ` Darrick J. Wong
  2023-12-27 13:09   ` [PATCH 4/9] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:08 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the rmap btree tracepoints for use with realtime rmap btrees by
making them take the btree cursor object as a parameter.  This will save
us a lot of trouble later on.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |  184 +++++++++++++++++++++--------------------------------
 1 file changed, 73 insertions(+), 111 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 5b2cac8302a..3c4f705ce59 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -99,8 +99,7 @@ xfs_rmap_update(
 	union xfs_btree_rec	rec;
 	int			error;
 
-	trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			irec->rm_startblock, irec->rm_blockcount,
+	trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount,
 			irec->rm_owner, irec->rm_offset, irec->rm_flags);
 
 	rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
@@ -126,8 +125,7 @@ xfs_rmap_insert(
 	int			i;
 	int			error;
 
-	trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
-			len, owner, offset, flags);
+	trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags);
 
 	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
 	if (error)
@@ -169,8 +167,7 @@ xfs_rmap_delete(
 	int			i;
 	int			error;
 
-	trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
-			len, owner, offset, flags);
+	trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags);
 
 	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
 	if (error)
@@ -338,8 +335,7 @@ xfs_rmap_find_left_neighbor_helper(
 {
 	struct xfs_find_left_neighbor_info	*info = priv;
 
-	trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+	trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock,
 			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
 			rec->rm_flags);
 
@@ -389,8 +385,8 @@ xfs_rmap_find_left_neighbor(
 	info.high.rm_blockcount = 0;
 	info.irec = irec;
 
-	trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
+	trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset,
+			flags);
 
 	/*
 	 * Historically, we always used the range query to walk every reverse
@@ -421,8 +417,7 @@ xfs_rmap_find_left_neighbor(
 		return error;
 
 	*stat = 1;
-	trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+	trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock,
 			irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
 			irec->rm_flags);
 	return 0;
@@ -437,8 +432,7 @@ xfs_rmap_lookup_le_range_helper(
 {
 	struct xfs_find_left_neighbor_info	*info = priv;
 
-	trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+	trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock,
 			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
 			rec->rm_flags);
 
@@ -485,8 +479,7 @@ xfs_rmap_lookup_le_range(
 	*stat = 0;
 	info.irec = irec;
 
-	trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			bno, 0, owner, offset, flags);
+	trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags);
 
 	/*
 	 * Historically, we always used the range query to walk every reverse
@@ -517,8 +510,7 @@ xfs_rmap_lookup_le_range(
 		return error;
 
 	*stat = 1;
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+	trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock,
 			irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
 			irec->rm_flags);
 	return 0;
@@ -630,8 +622,7 @@ xfs_rmap_unmap(
 			(flags & XFS_RMAP_BMBT_BLOCK);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * We should always have a left record because there's a static record
@@ -647,10 +638,9 @@ xfs_rmap_unmap(
 		goto out_error;
 	}
 
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
-			ltrec.rm_blockcount, ltrec.rm_owner,
-			ltrec.rm_offset, ltrec.rm_flags);
+	trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
+			ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset,
+			ltrec.rm_flags);
 	ltoff = ltrec.rm_offset;
 
 	/*
@@ -717,10 +707,9 @@ xfs_rmap_unmap(
 
 	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
 		/* exact match, simply remove the record from rmap tree */
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				ltrec.rm_startblock, ltrec.rm_blockcount,
-				ltrec.rm_owner, ltrec.rm_offset,
-				ltrec.rm_flags);
+		trace_xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto out_error;
@@ -796,8 +785,7 @@ xfs_rmap_unmap(
 		else
 			cur->bc_rec.r.rm_offset = offset + len;
 		cur->bc_rec.r.rm_flags = flags;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
-				cur->bc_rec.r.rm_startblock,
+		trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock,
 				cur->bc_rec.r.rm_blockcount,
 				cur->bc_rec.r.rm_owner,
 				cur->bc_rec.r.rm_offset,
@@ -808,8 +796,7 @@ xfs_rmap_unmap(
 	}
 
 out_done:
-	trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
@@ -973,8 +960,7 @@ xfs_rmap_map(
 			(flags & XFS_RMAP_BMBT_BLOCK);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
 	ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
 
 	/*
@@ -987,8 +973,7 @@ xfs_rmap_map(
 	if (error)
 		goto out_error;
 	if (have_lt) {
-		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+		trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
 				ltrec.rm_blockcount, ltrec.rm_owner,
 				ltrec.rm_offset, ltrec.rm_flags);
 
@@ -1026,10 +1011,10 @@ xfs_rmap_map(
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
-			gtrec.rm_blockcount, gtrec.rm_owner,
-			gtrec.rm_offset, gtrec.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				gtrec.rm_startblock, gtrec.rm_blockcount,
+				gtrec.rm_owner, gtrec.rm_offset,
+				gtrec.rm_flags);
 		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
 			have_gt = 0;
 	}
@@ -1066,12 +1051,9 @@ xfs_rmap_map(
 			 * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
 			 */
 			ltrec.rm_blockcount += gtrec.rm_blockcount;
-			trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-					gtrec.rm_startblock,
-					gtrec.rm_blockcount,
-					gtrec.rm_owner,
-					gtrec.rm_offset,
-					gtrec.rm_flags);
+			trace_xfs_rmap_delete(cur, gtrec.rm_startblock,
+					gtrec.rm_blockcount, gtrec.rm_owner,
+					gtrec.rm_offset, gtrec.rm_flags);
 			error = xfs_btree_delete(cur, &i);
 			if (error)
 				goto out_error;
@@ -1118,8 +1100,7 @@ xfs_rmap_map(
 		cur->bc_rec.r.rm_owner = owner;
 		cur->bc_rec.r.rm_offset = offset;
 		cur->bc_rec.r.rm_flags = flags;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			owner, offset, flags);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto out_error;
@@ -1130,8 +1111,7 @@ xfs_rmap_map(
 		}
 	}
 
-	trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
@@ -1208,8 +1188,7 @@ xfs_rmap_convert(
 			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
 	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
 	new_endoff = offset + len;
-	trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * For the initial lookup, look for an exact match or the left-adjacent
@@ -1225,10 +1204,9 @@ xfs_rmap_convert(
 		goto done;
 	}
 
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
-			PREV.rm_blockcount, PREV.rm_owner,
-			PREV.rm_offset, PREV.rm_flags);
+	trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock,
+			PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset,
+			PREV.rm_flags);
 
 	ASSERT(PREV.rm_offset <= offset);
 	ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
@@ -1269,10 +1247,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
-				LEFT.rm_blockcount, LEFT.rm_owner,
-				LEFT.rm_offset, LEFT.rm_flags);
+		trace_xfs_rmap_find_left_neighbor_result(cur,
+				LEFT.rm_startblock, LEFT.rm_blockcount,
+				LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags);
 		if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
 		    LEFT.rm_offset + LEFT.rm_blockcount == offset &&
 		    xfs_rmap_is_mergeable(&LEFT, owner, newext))
@@ -1310,10 +1287,10 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
-				RIGHT.rm_blockcount, RIGHT.rm_owner,
-				RIGHT.rm_offset, RIGHT.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				RIGHT.rm_startblock, RIGHT.rm_blockcount,
+				RIGHT.rm_owner, RIGHT.rm_offset,
+				RIGHT.rm_flags);
 		if (bno + len == RIGHT.rm_startblock &&
 		    offset + len == RIGHT.rm_offset &&
 		    xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1360,10 +1337,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				RIGHT.rm_startblock, RIGHT.rm_blockcount,
-				RIGHT.rm_owner, RIGHT.rm_offset,
-				RIGHT.rm_flags);
+		trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1380,10 +1356,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				PREV.rm_startblock, PREV.rm_blockcount,
-				PREV.rm_owner, PREV.rm_offset,
-				PREV.rm_flags);
+		trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1412,10 +1387,9 @@ xfs_rmap_convert(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				PREV.rm_startblock, PREV.rm_blockcount,
-				PREV.rm_owner, PREV.rm_offset,
-				PREV.rm_flags);
+		trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1452,10 +1426,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				RIGHT.rm_startblock, RIGHT.rm_blockcount,
-				RIGHT.rm_owner, RIGHT.rm_offset,
-				RIGHT.rm_flags);
+		trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1533,8 +1506,7 @@ xfs_rmap_convert(
 		NEW.rm_blockcount = len;
 		NEW.rm_flags = newext;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
-				len, owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1592,8 +1564,7 @@ xfs_rmap_convert(
 		NEW.rm_blockcount = len;
 		NEW.rm_flags = newext;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
-				len, owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1624,9 +1595,8 @@ xfs_rmap_convert(
 		NEW = PREV;
 		NEW.rm_blockcount = offset - PREV.rm_offset;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
-				NEW.rm_startblock, NEW.rm_blockcount,
-				NEW.rm_owner, NEW.rm_offset,
+		trace_xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
 				NEW.rm_flags);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
@@ -1653,8 +1623,7 @@ xfs_rmap_convert(
 		/* new middle extent - newext */
 		cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
 		cur->bc_rec.r.rm_flags |= newext;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-				owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1678,8 +1647,7 @@ xfs_rmap_convert(
 		ASSERT(0);
 	}
 
-	trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
 done:
 	if (error)
 		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
@@ -1718,8 +1686,7 @@ xfs_rmap_convert_shared(
 			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
 	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
 	new_endoff = offset + len;
-	trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * For the initial lookup, look for and exact match or the left-adjacent
@@ -1788,10 +1755,10 @@ xfs_rmap_convert_shared(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
-				RIGHT.rm_blockcount, RIGHT.rm_owner,
-				RIGHT.rm_offset, RIGHT.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				RIGHT.rm_startblock, RIGHT.rm_blockcount,
+				RIGHT.rm_owner, RIGHT.rm_offset,
+				RIGHT.rm_flags);
 		if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
 			state |= RMAP_RIGHT_CONTIG;
 	}
@@ -2103,8 +2070,7 @@ xfs_rmap_convert_shared(
 		ASSERT(0);
 	}
 
-	trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
 done:
 	if (error)
 		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
@@ -2145,8 +2111,7 @@ xfs_rmap_unmap_shared(
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * We should always have a left record because there's a static record
@@ -2302,8 +2267,7 @@ xfs_rmap_unmap_shared(
 			goto out_error;
 	}
 
-	trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
@@ -2341,8 +2305,7 @@ xfs_rmap_map_shared(
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
 
 	/* Is there a left record that abuts our range? */
 	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
@@ -2367,10 +2330,10 @@ xfs_rmap_map_shared(
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
-			gtrec.rm_blockcount, gtrec.rm_owner,
-			gtrec.rm_offset, gtrec.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				gtrec.rm_startblock, gtrec.rm_blockcount,
+				gtrec.rm_owner, gtrec.rm_offset,
+				gtrec.rm_flags);
 
 		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
 			have_gt = 0;
@@ -2462,8 +2425,7 @@ xfs_rmap_map_shared(
 			goto out_error;
 	}
 
-	trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_map_error(cur, error, _RET_IP_);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/9] xfs: clean up rmap log intent item tracepoint callsites
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:08   ` [PATCH 3/9] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
@ 2023-12-27 13:09   ` Darrick J. Wong
  2023-12-27 13:09   ` [PATCH 5/9] xfs: add a ri_entry helper Darrick J. Wong
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:09 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore rmap structure to the tracepoints instead of open-coding
the argument passing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |   22 +++++-----------------
 libxfs/xfs_rmap.h |   10 ++++++++++
 2 files changed, 15 insertions(+), 17 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 3c4f705ce59..3c00b05d8d0 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -2575,20 +2575,15 @@ xfs_rmap_finish_one(
 	struct xfs_rmap_intent		*ri,
 	struct xfs_btree_cur		**pcur)
 {
+	struct xfs_owner_info		oinfo;
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_btree_cur		*rcur;
 	struct xfs_buf			*agbp = NULL;
-	int				error = 0;
-	struct xfs_owner_info		oinfo;
 	xfs_agblock_t			bno;
 	bool				unwritten;
+	int				error = 0;
 
-	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
-
-	trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno,
-			ri->ri_owner, ri->ri_whichfork,
-			ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
-			ri->ri_bmap.br_state);
+	trace_xfs_rmap_deferred(mp, ri);
 
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
 		return -EIO;
@@ -2663,15 +2658,6 @@ __xfs_rmap_add(
 {
 	struct xfs_rmap_intent		*ri;
 
-	trace_xfs_rmap_defer(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
-			type,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
-			owner, whichfork,
-			bmap->br_startoff,
-			bmap->br_blockcount,
-			bmap->br_state);
-
 	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
@@ -2679,6 +2665,8 @@ __xfs_rmap_add(
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
 
+	trace_xfs_rmap_defer(tp->t_mountp, ri);
+
 	xfs_rmap_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 }
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index 3a153b4801b..f16b07d851d 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -157,6 +157,16 @@ enum xfs_rmap_intent_type {
 	XFS_RMAP_FREE,
 };
 
+#define XFS_RMAP_INTENT_STRINGS \
+	{ XFS_RMAP_MAP,			"map" }, \
+	{ XFS_RMAP_MAP_SHARED,		"map_shared" }, \
+	{ XFS_RMAP_UNMAP,		"unmap" }, \
+	{ XFS_RMAP_UNMAP_SHARED,	"unmap_shared" }, \
+	{ XFS_RMAP_CONVERT,		"cvt" }, \
+	{ XFS_RMAP_CONVERT_SHARED,	"cvt_shared" }, \
+	{ XFS_RMAP_ALLOC,		"alloc" }, \
+	{ XFS_RMAP_FREE,		"free" }
+
 struct xfs_rmap_intent {
 	struct list_head			ri_list;
 	enum xfs_rmap_intent_type		ri_type;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/9] xfs: add a ri_entry helper
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:09   ` [PATCH 4/9] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-27 13:09   ` Darrick J. Wong
  2023-12-27 13:09   ` [PATCH 6/9] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:09 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a helper to translate from the item list head to the
rmap_intent_item structure and use it so shorten assignments and avoid
the need for extra local variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 82b70575bc5..8fc27e9efd4 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -275,6 +275,11 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
 
 /* Reverse Mapping */
 
+static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_rmap_intent, ri_list);
+}
+
 /* Sort rmap intents by AG. */
 static int
 xfs_rmap_update_diff_items(
@@ -282,11 +287,8 @@ xfs_rmap_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	const struct xfs_rmap_intent	*ra;
-	const struct xfs_rmap_intent	*rb;
-
-	ra = container_of(a, struct xfs_rmap_intent, ri_list);
-	rb = container_of(b, struct xfs_rmap_intent, ri_list);
+	struct xfs_rmap_intent		*ra = ri_entry(a);
+	struct xfs_rmap_intent		*rb = ri_entry(b);
 
 	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
@@ -341,11 +343,9 @@ xfs_rmap_update_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_rmap_intent		*ri;
+	struct xfs_rmap_intent		*ri = ri_entry(item);
 	int				error;
 
-	ri = container_of(item, struct xfs_rmap_intent, ri_list);
-
 	error = xfs_rmap_finish_one(tp, ri, state);
 
 	xfs_rmap_update_put_group(ri);
@@ -365,9 +365,7 @@ STATIC void
 xfs_rmap_update_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_rmap_intent		*ri;
-
-	ri = container_of(item, struct xfs_rmap_intent, ri_list);
+	struct xfs_rmap_intent		*ri = ri_entry(item);
 
 	xfs_rmap_update_put_group(ri);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/9] xfs: reuse xfs_rmap_update_cancel_item
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:09   ` [PATCH 5/9] xfs: add a ri_entry helper Darrick J. Wong
@ 2023-12-27 13:09   ` Darrick J. Wong
  2023-12-27 13:09   ` [PATCH 7/9] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:09 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reuse xfs_rmap_update_cancel_item to put the AG/RTG and free the item in
a few places that currently open code the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 8fc27e9efd4..e7277b54532 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -335,6 +335,17 @@ xfs_rmap_update_put_group(
 	xfs_perag_intent_put(ri->ri_pag);
 }
 
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+
+	xfs_rmap_update_put_group(ri);
+	kmem_cache_free(xfs_rmap_intent_cache, ri);
+}
+
 /* Process a deferred rmap update. */
 STATIC int
 xfs_rmap_update_finish_item(
@@ -348,8 +359,7 @@ xfs_rmap_update_finish_item(
 
 	error = xfs_rmap_finish_one(tp, ri, state);
 
-	xfs_rmap_update_put_group(ri);
-	kmem_cache_free(xfs_rmap_intent_cache, ri);
+	xfs_rmap_update_cancel_item(item);
 	return error;
 }
 
@@ -360,17 +370,6 @@ xfs_rmap_update_abort_intent(
 {
 }
 
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_rmap_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_rmap_intent		*ri = ri_entry(item);
-
-	xfs_rmap_update_put_group(ri);
-	kmem_cache_free(xfs_rmap_intent_cache, ri);
-}
-
 const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.name		= "rmap",
 	.create_intent	= xfs_rmap_update_create_intent,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/9] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:09   ` [PATCH 6/9] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
@ 2023-12-27 13:09   ` Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 9/9] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:09 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

In xfs_rmap_finish_one we known the cursor is non-zero when calling
xfs_rmap_finish_one_cleanup and we pass a 0 error variable.  This means
xfs_rmap_finish_one_cleanup is just doing a xfs_btree_del_cursor.

Open code that and move xfs_rmap_finish_one_cleanup to
fs/xfs/xfs_rmap_item.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: minor porting changes]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |   19 +------------------
 libxfs/xfs_rmap.h |    2 --
 2 files changed, 1 insertion(+), 20 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 3c00b05d8d0..a1a9f4927bd 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -2513,23 +2513,6 @@ xfs_rmap_query_all(
 	return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
 }
 
-/* Clean up after calling xfs_rmap_finish_one. */
-void
-xfs_rmap_finish_one_cleanup(
-	struct xfs_trans	*tp,
-	struct xfs_btree_cur	*rcur,
-	int			error)
-{
-	struct xfs_buf		*agbp;
-
-	if (rcur == NULL)
-		return;
-	agbp = rcur->bc_ag.agbp;
-	xfs_btree_del_cursor(rcur, error);
-	if (error)
-		xfs_trans_brelse(tp, agbp);
-}
-
 /* Commit an rmap operation into the ondisk tree. */
 int
 __xfs_rmap_finish_intent(
@@ -2594,7 +2577,7 @@ xfs_rmap_finish_one(
 	 */
 	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
-		xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
 	}
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index f16b07d851d..2513ee36aa2 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -192,8 +192,6 @@ void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
 
-void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
-		struct xfs_btree_cur *rcur, int error);
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
 int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/9] xfs: simplify usage of the rcur local variable in xfs_rmap_finish_one
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:09   ` [PATCH 7/9] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
@ 2023-12-27 13:10   ` Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 9/9] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:10 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Only update rcur when we know the final *pcur value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[djwong: don't leave the caller with a dangling ref]
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   18 ++++++++++++++++++
 libxfs/xfs_rmap.c   |    6 ++----
 2 files changed, 20 insertions(+), 4 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index e7277b54532..d3df56f0a2b 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -28,6 +28,7 @@
 #include "xfs_ag.h"
 #include "xfs_swapext.h"
 #include "defer_item.h"
+#include "xfs_btree.h"
 
 /* Dummy defer item ops, since we don't do logging. */
 
@@ -370,6 +371,23 @@ xfs_rmap_update_abort_intent(
 {
 }
 
+/* Clean up after calling xfs_rmap_finish_one. */
+STATIC void
+xfs_rmap_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp = NULL;
+
+	if (rcur == NULL)
+		return;
+	agbp = rcur->bc_ag.agbp;
+	xfs_btree_del_cursor(rcur, error);
+	if (error && agbp)
+		xfs_trans_brelse(tp, agbp);
+}
+
 const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.name		= "rmap",
 	.create_intent	= xfs_rmap_update_create_intent,
diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index a1a9f4927bd..183e840b7f1 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -2560,7 +2560,7 @@ xfs_rmap_finish_one(
 {
 	struct xfs_owner_info		oinfo;
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_btree_cur		*rcur;
+	struct xfs_btree_cur		*rcur = *pcur;
 	struct xfs_buf			*agbp = NULL;
 	xfs_agblock_t			bno;
 	bool				unwritten;
@@ -2575,7 +2575,6 @@ xfs_rmap_finish_one(
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
-	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
@@ -2597,9 +2596,8 @@ xfs_rmap_finish_one(
 			return -EFSCORRUPTED;
 		}
 
-		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+		*pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
 	}
-	*pcur = rcur;
 
 	xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
 			ri->ri_bmap.br_startoff);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 9/9] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:10   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
@ 2023-12-27 13:10   ` Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:10 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_rmap_update_item deferred work
data to a transaction live with the RUI log item code.  This means that
the rmap code no longer has to know about the inner workings of the RUI
log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   21 +++++++++------------
 libxfs/defer_item.h |    4 ++++
 libxfs/xfs_rmap.c   |    6 ++----
 libxfs/xfs_rmap.h   |    3 ---
 4 files changed, 15 insertions(+), 19 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index d3df56f0a2b..5399a20f186 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -319,21 +319,18 @@ xfs_rmap_update_create_done(
 	return NULL;
 }
 
-/* Take an active ref to the AG containing the space we're rmapping. */
+/* Add this deferred RUI to the transaction. */
 void
-xfs_rmap_update_get_group(
-	struct xfs_mount	*mp,
+xfs_rmap_defer_add(
+	struct xfs_trans	*tp,
 	struct xfs_rmap_intent	*ri)
 {
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	trace_xfs_rmap_defer(mp, ri);
+
 	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
-}
-
-/* Release an active AG ref after finishing rmapping work. */
-static inline void
-xfs_rmap_update_put_group(
-	struct xfs_rmap_intent	*ri)
-{
-	xfs_perag_intent_put(ri->ri_pag);
+	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 }
 
 /* Cancel a deferred rmap update. */
@@ -343,7 +340,7 @@ xfs_rmap_update_cancel_item(
 {
 	struct xfs_rmap_intent		*ri = ri_entry(item);
 
-	xfs_rmap_update_put_group(ri);
+	xfs_perag_intent_put(ri->ri_pag);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 }
 
diff --git a/libxfs/defer_item.h b/libxfs/defer_item.h
index 79e957eb8ff..3ef31ad0aec 100644
--- a/libxfs/defer_item.h
+++ b/libxfs/defer_item.h
@@ -20,4 +20,8 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
 		struct xfs_extent_free_item *xefi,
 		struct xfs_defer_pending **dfpp);
 
+struct xfs_rmap_intent;
+
+void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+
 #endif /* __LIBXFS_DEFER_ITEM_H_ */
diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 183e840b7f1..24daf0ffb66 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -23,6 +23,7 @@
 #include "xfs_inode.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "defer_item.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -2646,10 +2647,7 @@ __xfs_rmap_add(
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
 
-	trace_xfs_rmap_defer(tp->t_mountp, ri);
-
-	xfs_rmap_update_get_group(tp->t_mountp, ri);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	xfs_rmap_defer_add(tp, ri);
 }
 
 /* Map an extent into a file. */
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index 2513ee36aa2..e6240efd6fe 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -176,9 +176,6 @@ struct xfs_rmap_intent {
 	struct xfs_perag			*ri_pag;
 };
 
-void xfs_rmap_update_get_group(struct xfs_mount *mp,
-		struct xfs_rmap_intent *ri);
-
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
 void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/47] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-27 13:10   ` Darrick J. Wong
  2023-12-27 13:11   ` [PATCH 02/47] xfs: introduce realtime rmap btree definitions Darrick J. Wong
                     ` (45 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:10 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Simplify the calling conventions by allowing callers to pass a fsbno
(xfs_fsblock_t) directly into these functions, since we're just going to
set it in a struct anyway.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |    6 ++----
 libxfs/xfs_rmap.c     |   12 +++++-------
 libxfs/xfs_rmap.h     |    8 ++++----
 repair/rmap.c         |    8 ++++----
 4 files changed, 15 insertions(+), 19 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 6c6634675c2..9f933d953b9 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1886,8 +1886,7 @@ xfs_refcount_alloc_cow_extent(
 	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
-			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
@@ -1903,8 +1902,7 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
-			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+	xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
 	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 24daf0ffb66..3e95599ab8a 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -526,7 +526,7 @@ xfs_rmap_free_check_owner(
 	struct xfs_btree_cur	*cur,
 	uint64_t		ltoff,
 	struct xfs_rmap_irec	*rec,
-	xfs_filblks_t		len,
+	xfs_extlen_t		len,
 	uint64_t		owner,
 	uint64_t		offset,
 	unsigned int		flags)
@@ -2717,8 +2717,7 @@ xfs_rmap_convert_extent(
 void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
+	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
 {
@@ -2727,7 +2726,7 @@ xfs_rmap_alloc_extent(
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
 		return;
 
-	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+	bmap.br_startblock = fsbno;
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
@@ -2739,8 +2738,7 @@ xfs_rmap_alloc_extent(
 void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
+	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
 {
@@ -2749,7 +2747,7 @@ xfs_rmap_free_extent(
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
 		return;
 
-	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+	bmap.br_startblock = fsbno;
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index e6240efd6fe..0ccfd7d88e5 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -184,10 +184,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
-		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
-		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+		xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+		xfs_extlen_t len, uint64_t owner);
 
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
diff --git a/repair/rmap.c b/repair/rmap.c
index 37fcf923644..265199d2117 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -1278,7 +1278,6 @@ rmap_diffkeys(
 {
 	__u64			oa;
 	__u64			ob;
-	int64_t			d;
 	struct xfs_rmap_irec	tmp;
 
 	tmp = *kp1;
@@ -1288,9 +1287,10 @@ rmap_diffkeys(
 	tmp.rm_flags &= ~XFS_RMAP_REC_FLAGS;
 	ob = libxfs_rmap_irec_offset_pack(&tmp);
 
-	d = (int64_t)kp1->rm_startblock - kp2->rm_startblock;
-	if (d)
-		return d;
+	if (kp1->rm_startblock > kp2->rm_startblock)
+		return 1;
+	else if (kp2->rm_startblock > kp1->rm_startblock)
+		return -1;
 
 	if (kp1->rm_owner > kp2->rm_owner)
 		return 1;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/47] xfs: introduce realtime rmap btree definitions
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 01/47] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
@ 2023-12-27 13:11   ` Darrick J. Wong
  2023-12-27 13:11   ` [PATCH 03/47] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
                     ` (44 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:11 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add new realtime rmap btree definitions. The realtime rmap btree will
be rooted from a hidden inode, but has its own shape and therefore
needs to have most of its own separate types.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.h  |    1 +
 libxfs/xfs_format.h |    7 +++++++
 libxfs/xfs_types.h  |    5 +++--
 3 files changed, 11 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index ce0bc5dfffe..e6571c9157d 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -64,6 +64,7 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
 #define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 #define	XFS_BTNUM_RCBAG	((xfs_btnum_t)XFS_BTNUM_RCBAGi)
+#define	XFS_BTNUM_RTRMAP ((xfs_btnum_t)XFS_BTNUM_RTRMAPi)
 
 struct xfs_btree_ops;
 uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 87476c6bb6c..b47d4f16143 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1746,6 +1746,13 @@ typedef __be32 xfs_rmap_ptr_t;
 	 XFS_FIBT_BLOCK(mp) + 1 : \
 	 XFS_IBT_BLOCK(mp) + 1)
 
+/*
+ * Realtime Reverse mapping btree format definitions
+ *
+ * This is a btree for reverse mapping records for realtime volumes
+ */
+#define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
+
 /*
  * Reference Count Btree format definitions
  *
diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h
index ad2ce83874f..b3edc57dc65 100644
--- a/libxfs/xfs_types.h
+++ b/libxfs/xfs_types.h
@@ -126,7 +126,7 @@ typedef enum {
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
 	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_RCBAGi,
-	XFS_BTNUM_MAX
+	XFS_BTNUM_RTRMAPi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 #define XFS_BTNUM_STRINGS \
@@ -137,7 +137,8 @@ typedef enum {
 	{ XFS_BTNUM_INOi,	"inobt" }, \
 	{ XFS_BTNUM_FINOi,	"finobt" }, \
 	{ XFS_BTNUM_REFCi,	"refcbt" }, \
-	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }
+	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }, \
+	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }
 
 struct xfs_name {
 	const unsigned char	*name;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/47] xfs: define the on-disk realtime rmap btree format
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 01/47] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
  2023-12-27 13:11   ` [PATCH 02/47] xfs: introduce realtime rmap btree definitions Darrick J. Wong
@ 2023-12-27 13:11   ` Darrick J. Wong
  2023-12-27 13:11   ` [PATCH 04/47] xfs: realtime rmap btree transaction reservations Darrick J. Wong
                     ` (43 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:11 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Start filling out the rtrmap btree implementation. Start with the
on-disk btree format; add everything needed to read, write and
manipulate rmap btree blocks. This prepares the way for connecting the
btree operations implementation.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h          |    1 
 include/xfs_mount.h       |    9 +
 libxfs/Makefile           |    2 
 libxfs/init.c             |    5 -
 libxfs/xfs_btree.c        |    5 +
 libxfs/xfs_format.h       |    3 
 libxfs/xfs_ondisk.h       |    1 
 libxfs/xfs_rtrmap_btree.c |  305 +++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |   83 ++++++++++++
 libxfs/xfs_sb.c           |    6 +
 libxfs/xfs_shared.h       |    2 
 11 files changed, 420 insertions(+), 2 deletions(-)
 create mode 100644 libxfs/xfs_rtrmap_btree.c
 create mode 100644 libxfs/xfs_rtrmap_btree.h


diff --git a/include/libxfs.h b/include/libxfs.h
index 8d2e321b914..3ab93158cf7 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -93,6 +93,7 @@ struct iomap;
 #include "imeta_utils.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 1284e848835..4e4da5bc4fa 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -92,11 +92,14 @@ typedef struct xfs_mount {
 	uint			m_bmap_dmnr[2];	/* XFS_BMAP_BLOCK_DMINRECS */
 	uint			m_rmap_mxr[2];	/* max rmap btree records */
 	uint			m_rmap_mnr[2];	/* min rmap btree records */
+	uint			m_rtrmap_mxr[2]; /* max rtrmap btree records */
+	uint			m_rtrmap_mnr[2]; /* min rtrmap btree records */
 	uint			m_refc_mxr[2];	/* max refc btree records */
 	uint			m_refc_mnr[2];	/* min refc btree records */
 	uint			m_alloc_maxlevels; /* max alloc btree levels */
 	uint			m_bm_maxlevels[2];  /* max bmap btree levels */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
+	uint			m_rtrmap_maxlevels; /* max rtrmap btree level */
 	uint			m_refc_maxlevels; /* max refc btree levels */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
 	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
@@ -233,6 +236,12 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(metadir, METADIR)
 __XFS_HAS_FEAT(rtgroups, RTGROUPS)
 
+static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
+{
+	return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) &&
+	       xfs_has_rmapbt(mp);
+}
+
 /* Kernel mount features that we don't support */
 #define __XFS_UNSUPP_FEAT(name) \
 static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
diff --git a/libxfs/Makefile b/libxfs/Makefile
index a37a5263199..0b3e8c896bd 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -63,6 +63,7 @@ HFILES = \
 	xfs_rmap_btree.h \
 	xfs_rtbitmap.h \
 	xfs_rtgroup.h \
+	xfs_rtrmap_btree.h \
 	xfs_sb.h \
 	xfs_shared.h \
 	xfs_swapext.h \
@@ -121,6 +122,7 @@ CFILES = cache.c \
 	xfs_rmap_btree.c \
 	xfs_rtbitmap.c \
 	xfs_rtgroup.c \
+	xfs_rtrmap_btree.c \
 	xfs_sb.c \
 	xfs_swapext.c \
 	xfs_symlink_remote.c \
diff --git a/libxfs/init.c b/libxfs/init.c
index 2663485a80d..9a4dfe02945 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -21,6 +21,7 @@
 #include "xfs_trans.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_imeta.h"
 #include "libfrog/platform.h"
 #include "xfile.h"
 
@@ -656,8 +657,7 @@ static inline void
 xfs_rtbtree_compute_maxlevels(
 	struct xfs_mount	*mp)
 {
-	/* This will be filled in later. */
-	mp->m_rtbtree_maxlevels = 0;
+	mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels;
 }
 
 /* Compute maximum possible height of all btrees. */
@@ -673,6 +673,7 @@ libxfs_compute_all_maxlevels(
 	igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
 	xfs_ialloc_setup_geometry(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
+	xfs_rtrmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index e0276ad655a..ea0d5d71d03 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -29,6 +29,7 @@
 #include "xfbtree.h"
 #include "xfs_btree_mem.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Btree magic numbers.
@@ -5525,6 +5526,9 @@ xfs_btree_init_cur_caches(void)
 	if (error)
 		goto err;
 	error = xfs_refcountbt_init_cur_cache();
+	if (error)
+		goto err;
+	error = xfs_rtrmapbt_init_cur_cache();
 	if (error)
 		goto err;
 
@@ -5543,6 +5547,7 @@ xfs_btree_destroy_cur_caches(void)
 	xfs_bmbt_destroy_cur_cache();
 	xfs_rmapbt_destroy_cur_cache();
 	xfs_refcountbt_destroy_cur_cache();
+	xfs_rtrmapbt_destroy_cur_cache();
 }
 
 /* Move the btree cursor before the first record. */
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index b47d4f16143..5317c6438f0 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1753,6 +1753,9 @@ typedef __be32 xfs_rmap_ptr_t;
  */
 #define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
 
+/* inode-based btree pointer type */
+typedef __be64 xfs_rtrmap_ptr_t;
+
 /*
  * Reference Count Btree format definitions
  *
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 70b96efa269..897a1b72f8d 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -77,6 +77,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
+	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
new file mode 100644
index 00000000000..1b6375af818
--- /dev/null
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_rtgroup.h"
+
+static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
+
+/*
+ * Realtime Reverse Map btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device.  See the comments in xfs_rmap_btree.c for more information.
+ *
+ * This tree is basically the same as the regular rmap btree except that it
+ * is rooted in an inode and does not live in free space.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrmapbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_rtrmapbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ino.rtg,
+			cur->bc_ino.ip);
+
+	/* Copy the flags values since init cursor doesn't get them. */
+	new->bc_ino.flags = cur->bc_ino.flags;
+
+	return new;
+}
+
+static xfs_failaddr_t
+xfs_rtrmapbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	int			level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (!xfs_has_rmapbt(mp))
+		return __this_address;
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+	level = be16_to_cpu(block->bb_level);
+	if (level > mp->m_rtrmap_maxlevels)
+		return __this_address;
+
+	return xfs_btree_lblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
+}
+
+static void
+xfs_rtrmapbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtrmapbt_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+
+	if (bp->b_error)
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrmapbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	fa = xfs_rtrmapbt_verify(bp);
+	if (fa) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
+	.name			= "xfs_rtrmapbt",
+	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+	.verify_read		= xfs_rtrmapbt_read_verify,
+	.verify_write		= xfs_rtrmapbt_write_verify,
+	.verify_struct		= xfs_rtrmapbt_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrmapbt_ops = {
+	.rec_len		= sizeof(struct xfs_rmap_rec),
+	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.geom_flags		= XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE |
+				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING |
+				  XFS_BTREE_IROOT_RECORDS,
+
+	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
+	.buf_ops		= &xfs_rtrmapbt_buf_ops,
+};
+
+/* Initialize a new rt rmap btree cursor. */
+static struct xfs_btree_cur *
+xfs_rtrmapbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTRMAP,
+			&xfs_rtrmapbt_ops, mp->m_rtrmap_maxlevels,
+			xfs_rtrmapbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+
+	cur->bc_ino.ip = ip;
+	cur->bc_ino.allocated = 0;
+	cur->bc_ino.flags = 0;
+
+	cur->bc_ino.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+/* Allocate a new rt rmap btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	cur = xfs_rtrmapbt_init_common(mp, tp, rtg, ip);
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+	cur->bc_ino.whichfork = XFS_DATA_FORK;
+	return cur;
+}
+
+/* Create a new rt reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	struct xbtree_ifakeroot	*ifake)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_rtrmapbt_init_common(mp, NULL, rtg, ip);
+	cur->bc_nlevels = ifake->if_levels;
+	cur->bc_ino.forksize = ifake->if_fork_size;
+	cur->bc_ino.whichfork = -1;
+	xfs_btree_stage_ifakeroot(cur, ifake, NULL);
+	return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrmapbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp)
+{
+	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
+	struct xfs_ifork	*ifp;
+	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	/*
+	 * Free any resources hanging off the real fork, then shallow-copy the
+	 * staging fork's contents into the real fork to transfer everything
+	 * we just built.
+	 */
+	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+	xfs_idestroy_fork(ifp);
+	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK, &xfs_rtrmapbt_ops);
+}
+
+/* Calculate number of records in a rt reverse mapping btree block. */
+static inline unsigned int
+xfs_rtrmapbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	if (leaf)
+		return blocklen / sizeof(struct xfs_rmap_rec);
+	return blocklen /
+		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rt reverse mapping btree block.
+ */
+unsigned int
+xfs_rtrmapbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= XFS_RTRMAP_BLOCK_LEN;
+	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime reverse mapping btrees. */
+unsigned int
+xfs_rtrmapbt_maxlevels_ondisk(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
+
+	/* We need at most one record for every block in an rt group. */
+	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrmapbt_init_cur_cache(void)
+{
+	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
+			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
+			0, 0, NULL);
+
+	if (!xfs_rtrmapbt_cur_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_rtrmapbt_destroy_cur_cache(void)
+{
+	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
+	xfs_rtrmapbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of an rt reverse mapping btree. */
+void
+xfs_rtrmapbt_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	unsigned int		d_maxlevels, r_maxlevels;
+
+	if (!xfs_has_rtrmapbt(mp)) {
+		mp->m_rtrmap_maxlevels = 0;
+		return;
+	}
+
+	/*
+	 * The realtime rmapbt lives on the data device, which means that its
+	 * maximum height is constrained by the size of the data device and
+	 * the height required to store one rmap record for each block in an
+	 * rt group.
+	 */
+	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
+				mp->m_sb.sb_dblocks);
+	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
+				mp->m_sb.sb_rgblocks);
+
+	/* Add one level to handle the inode root level. */
+	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
new file mode 100644
index 00000000000..0f732679719
--- /dev/null
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTRMAP_BTREE_H__
+#define	__XFS_RTRMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RTRMAP_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		struct xfs_inode *ip);
+struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		struct xbtree_ifakeroot *ifake);
+void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_trans *tp);
+unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
+void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrmapbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_rec_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_rec *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_high_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 sizeof(struct xfs_rmap_key) +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_ptr_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrmap_ptr_t *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+		 (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rtrmapbt_init_cur_cache(void);
+void xfs_rtrmapbt_destroy_cur_cache(void);
+
+#endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index aff2ab79f9b..891b71190d5 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -26,6 +26,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_swapext.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1123,6 +1124,11 @@ xfs_sb_mount_common(
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
+	mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false);
+	mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2;
+	mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2;
+
 	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
 	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index 8ad4b67d6fe..adb742267c9 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
@@ -54,6 +55,7 @@ extern const struct xfs_btree_ops xfs_finobt_ops;
 extern const struct xfs_btree_ops xfs_bmbt_ops;
 extern const struct xfs_btree_ops xfs_refcountbt_ops;
 extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
 
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/47] xfs: realtime rmap btree transaction reservations
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:11   ` [PATCH 03/47] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
@ 2023-12-27 13:11   ` Darrick J. Wong
  2023-12-27 13:11   ` [PATCH 05/47] xfs: add realtime rmap btree operations Darrick J. Wong
                     ` (42 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:11 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that there's enough log reservation to handle mapping
and unmapping realtime extents.  We have to reserve enough space
to handle a split in the rtrmapbt to add the record and a second
split in the regular rmapbt to record the rtrmapbt split.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_swapext.c     |    4 +++-
 libxfs/xfs_trans_resv.c  |   12 ++++++++++--
 libxfs/xfs_trans_space.h |   13 +++++++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index 1f7fbe76a89..36283ea72cd 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -759,7 +759,9 @@ xfs_swapext_rmapbt_blocks(
 	if (!xfs_has_rmapbt(mp))
 		return 0;
 	if (XFS_IS_REALTIME_INODE(req->ip1))
-		return 0;
+		return howmany_64(req->nr_exchanges,
+					XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) *
+			XFS_RTRMAPADD_SPACE_RES(mp);
 
 	return howmany_64(req->nr_exchanges,
 					XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index 800a2f9ecb8..18efae57975 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -211,7 +211,9 @@ xfs_calc_inode_chunk_res(
  * Per-extent log reservation for the btree changes involved in freeing or
  * allocating a realtime extent.  We have to be able to log as many rtbitmap
  * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
- * extents, as well as the realtime summary block.
+ * extents, as well as the realtime summary block (t1).  Realtime rmap btree
+ * operations happen in a second transaction, so factor in a couple of rtrmapbt
+ * splits (t2).
  */
 static unsigned int
 xfs_rtalloc_block_count(
@@ -220,10 +222,16 @@ xfs_rtalloc_block_count(
 {
 	unsigned int		rtbmp_blocks;
 	xfs_rtxlen_t		rtxlen;
+	unsigned int		t1, t2 = 0;
 
 	rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
 	rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
-	return (rtbmp_blocks + 1) * num_ops;
+	t1 = (rtbmp_blocks + 1) * num_ops;
+
+	if (xfs_has_rmapbt(mp))
+		t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1);
+
+	return max(t1, t2);
 }
 
 /*
diff --git a/libxfs/xfs_trans_space.h b/libxfs/xfs_trans_space.h
index 1155ff2d37e..d89b570aafc 100644
--- a/libxfs/xfs_trans_space.h
+++ b/libxfs/xfs_trans_space.h
@@ -14,6 +14,19 @@
 #define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
 
+/* Worst case number of realtime rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)    \
+		(((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0]))
+
+/* Adding one realtime rmap could split every level to the top of the tree. */
+#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels)
+
+/* Blocks we might need to add "b" realtime rmaps to a tree. */
+#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \
+	((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \
+	  XFS_RTRMAPADD_SPACE_RES(mp))
+
 /* Worst case number of rmaps that can be held in a block. */
 #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/47] xfs: add realtime rmap btree operations
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:11   ` [PATCH 04/47] xfs: realtime rmap btree transaction reservations Darrick J. Wong
@ 2023-12-27 13:11   ` Darrick J. Wong
  2023-12-27 13:12   ` [PATCH 06/47] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
                     ` (41 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:11 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement the generic btree operations needed to manipulate rtrmap
btree blocks. This is different from the regular rmapbt in that we
allocate space from the filesystem at large, and are neither
constrained to the free space nor any particular AG.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.c        |   70 ++++++++++++
 libxfs/xfs_btree.h        |    5 +
 libxfs/xfs_rtrmap_btree.c |  271 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 346 insertions(+)


diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index ea0d5d71d03..f599dd17d30 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -30,6 +30,9 @@
 #include "xfs_btree_mem.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_imeta.h"
 
 /*
  * Btree magic numbers.
@@ -5576,3 +5579,70 @@ xfs_btree_goto_left_edge(
 
 	return 0;
 }
+
+/* Allocate a block for an inode-rooted metadata btree. */
+int
+xfs_btree_alloc_imeta_block(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*start,
+	union xfs_btree_ptr		*new,
+	int				*stat)
+{
+	struct xfs_alloc_arg		args = {
+		.mp			= cur->bc_mp,
+		.tp			= cur->bc_tp,
+		.resv			= XFS_AG_RESV_IMETA,
+		.minlen			= 1,
+		.maxlen			= 1,
+		.prod			= 1,
+	};
+	struct xfs_inode		*ip = cur->bc_ino.ip;
+	int				error;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(cur->bc_mp, ip));
+
+	xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork);
+	error = xfs_alloc_vextent_start_ag(&args,
+			XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino));
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+
+	xfs_imeta_resv_alloc_extent(ip, &args);
+	cur->bc_ino.allocated++;
+
+	new->l = cpu_to_be64(args.fsbno);
+	*stat = 1;
+	return 0;
+}
+
+/* Free a block from an inode-rooted metadata btree. */
+int
+xfs_btree_free_imeta_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_inode	*ip = cur->bc_ino.ip;
+	struct xfs_trans	*tp = cur->bc_tp;
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+	int			error;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(cur->bc_mp, ip));
+
+	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+	error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_IMETA,
+			0);
+	if (error)
+		return error;
+
+	xfs_imeta_resv_free_extent(ip, tp, 1);
+	return 0;
+}
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index e6571c9157d..3559cf5d3a6 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -764,4 +764,9 @@ void xfs_btree_destroy_cur_caches(void);
 
 int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
 
+int xfs_btree_alloc_imeta_block(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *start, union xfs_btree_ptr *newp,
+		int *stat);
+int xfs_btree_free_imeta_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index 1b6375af818..a2cb497379f 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -18,10 +18,12 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
+#include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_rtgroup.h"
+#include "xfs_bmap.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -50,6 +52,182 @@ xfs_rtrmapbt_dup_cursor(
 	return new;
 }
 
+STATIC int
+xfs_rtrmapbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_rtrmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrmapbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0);
+	}
+
+	return cur->bc_mp->m_rtrmap_mxr[level != 0];
+}
+
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
+STATIC void
+xfs_rtrmapbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	key->rmap.rm_startblock = rec->rmap.rm_startblock;
+	key->rmap.rm_owner = rec->rmap.rm_owner;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+}
+
+STATIC void
+xfs_rtrmapbt_init_high_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	uint64_t			off;
+	int				adj;
+
+	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+	key->rmap.rm_startblock = rec->rmap.rm_startblock;
+	be32_add_cpu(&key->rmap.rm_startblock, adj);
+	key->rmap.rm_owner = rec->rmap.rm_owner;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+		return;
+	off = be64_to_cpu(key->rmap.rm_offset);
+	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+	key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rtrmapbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+	rec->rmap.rm_offset = cpu_to_be64(
+			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rtrmapbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
+	const struct xfs_rmap_key	*kp = &key->rmap;
+	__u64				x, y;
+	int64_t				d;
+
+	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+	if (d)
+		return d;
+
+	x = be64_to_cpu(kp->rm_owner);
+	y = rec->rm_owner;
+	if (x > y)
+		return 1;
+	else if (y > x)
+		return -1;
+
+	x = offset_keymask(be64_to_cpu(kp->rm_offset));
+	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
+	if (x > y)
+		return 1;
+	else if (y > x)
+		return -1;
+	return 0;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	const struct xfs_rmap_key	*kp1 = &k1->rmap;
+	const struct xfs_rmap_key	*kp2 = &k2->rmap;
+	int64_t				d;
+	__u64				x, y;
+
+	/* Doesn't make sense to mask off the physical space part */
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
+	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+		     be32_to_cpu(kp2->rm_startblock);
+	if (d)
+		return d;
+
+	if (!mask || mask->rmap.rm_owner) {
+		x = be64_to_cpu(kp1->rm_owner);
+		y = be64_to_cpu(kp2->rm_owner);
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
+
+	if (!mask || mask->rmap.rm_offset) {
+		/* Doesn't make sense to allow offset but not owner */
+		ASSERT(!mask || mask->rmap.rm_owner);
+
+		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
+
+	return 0;
+}
+
 static xfs_failaddr_t
 xfs_rtrmapbt_verify(
 	struct xfs_buf		*bp)
@@ -116,6 +294,86 @@ const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
 	.verify_struct		= xfs_rtrmapbt_verify,
 };
 
+STATIC int
+xfs_rtrmapbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	uint32_t			x;
+	uint32_t			y;
+	uint64_t			a;
+	uint64_t			b;
+
+	x = be32_to_cpu(k1->rmap.rm_startblock);
+	y = be32_to_cpu(k2->rmap.rm_startblock);
+	if (x < y)
+		return 1;
+	else if (x > y)
+		return 0;
+	a = be64_to_cpu(k1->rmap.rm_owner);
+	b = be64_to_cpu(k2->rmap.rm_owner);
+	if (a < b)
+		return 1;
+	else if (a > b)
+		return 0;
+	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
+	if (a <= b)
+		return 1;
+	return 0;
+}
+
+STATIC int
+xfs_rtrmapbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	uint32_t			x;
+	uint32_t			y;
+	uint64_t			a;
+	uint64_t			b;
+
+	x = be32_to_cpu(r1->rmap.rm_startblock);
+	y = be32_to_cpu(r2->rmap.rm_startblock);
+	if (x < y)
+		return 1;
+	else if (x > y)
+		return 0;
+	a = be64_to_cpu(r1->rmap.rm_owner);
+	b = be64_to_cpu(r2->rmap.rm_owner);
+	if (a < b)
+		return 1;
+	else if (a > b)
+		return 0;
+	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
+	if (a <= b)
+		return 1;
+	return 0;
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrmapbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
+	/*
+	 * We only support checking contiguity of the physical space component.
+	 * If any callers ever need more specificity than that, they'll have to
+	 * implement it here.
+	 */
+	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+				 be32_to_cpu(key2->rmap.rm_startblock));
+}
+
 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.rec_len		= sizeof(struct xfs_rmap_rec),
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
@@ -125,7 +383,20 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 				  XFS_BTREE_IROOT_RECORDS,
 
 	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
+	.alloc_block		= xfs_btree_alloc_imeta_block,
+	.free_block		= xfs_btree_free_imeta_block,
+	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
+	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
+	.key_diff		= xfs_rtrmapbt_key_diff,
 	.buf_ops		= &xfs_rtrmapbt_buf_ops,
+	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
+	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
 };
 
 /* Initialize a new rt rmap btree cursor. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/47] xfs: prepare rmap functions to deal with rtrmapbt
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:11   ` [PATCH 05/47] xfs: add realtime rmap btree operations Darrick J. Wong
@ 2023-12-27 13:12   ` Darrick J. Wong
  2023-12-27 13:12   ` [PATCH 07/47] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
                     ` (40 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:12 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the high-level rmap functions to deal with the new realtime
rmapbt and its slightly different conventions.  Provide the ability
to talk to either rmapbt or rtrmapbt formats from the same high
level code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rmap.h |    3 ++
 2 files changed, 69 insertions(+)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 3e95599ab8a..007f17cc644 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -24,6 +24,7 @@
 #include "xfs_ag.h"
 #include "xfs_health.h"
 #include "defer_item.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -263,11 +264,72 @@ xfs_rmap_check_irec(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_rtrmap_check_irec(
+	struct xfs_rtgroup		*rtg,
+	const struct xfs_rmap_irec	*irec)
+{
+	struct xfs_mount		*mp = rtg->rtg_mount;
+	bool				is_inode;
+	bool				is_unwritten;
+	bool				is_bmbt;
+	bool				is_attr;
+
+	if (irec->rm_blockcount == 0)
+		return __this_address;
+
+	if (irec->rm_owner == XFS_RMAP_OWN_FS) {
+		if (irec->rm_startblock != 0)
+			return __this_address;
+		if (irec->rm_blockcount != mp->m_sb.sb_rextsize)
+			return __this_address;
+		if (irec->rm_offset != 0)
+			return __this_address;
+	} else {
+		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+					    irec->rm_blockcount))
+			return __this_address;
+	}
+
+	if (!(xfs_verify_ino(mp, irec->rm_owner) ||
+	      (irec->rm_owner <= XFS_RMAP_OWN_FS &&
+	       irec->rm_owner >= XFS_RMAP_OWN_MIN)))
+		return __this_address;
+
+	/* Check flags. */
+	is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+	if (!is_inode && irec->rm_owner != XFS_RMAP_OWN_FS)
+		return __this_address;
+
+	if (!is_inode && irec->rm_offset != 0)
+		return __this_address;
+
+	if (is_bmbt || is_attr)
+		return __this_address;
+
+	if (is_unwritten && !is_inode)
+		return __this_address;
+
+	/* Check for a valid fork offset, if applicable. */
+	if (is_inode &&
+	    !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+		return __this_address;
+
+	return NULL;
+}
+
 static inline xfs_failaddr_t
 xfs_rmap_check_btrec(
 	struct xfs_btree_cur		*cur,
 	const struct xfs_rmap_irec	*irec)
 {
+	if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+		return xfs_rtrmap_check_irec(cur->bc_ino.rtg, irec);
+
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
 	return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
@@ -284,6 +346,10 @@ xfs_rmap_complain_bad_rec(
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		xfs_warn(mp,
  "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+	else if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+		xfs_warn(mp,
+ "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!",
+				cur->bc_ino.rtg->rtg_rgno, fa);
 	else
 		xfs_warn(mp,
  "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index 0ccfd7d88e5..762f2f40b6e 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -7,6 +7,7 @@
 #define __XFS_RMAP_H__
 
 struct xfs_perag;
+struct xfs_rtgroup;
 
 static inline void
 xfs_rmap_ino_bmbt_owner(
@@ -206,6 +207,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_rmap_irec *irec);
 xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
 		const struct xfs_rmap_irec *irec);
+xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg,
+		const struct xfs_rmap_irec *irec);
 
 int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, enum xbtree_recpacking *outcome);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/47] xfs: add a realtime flag to the rmap update log redo items
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:12   ` [PATCH 06/47] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
@ 2023-12-27 13:12   ` Darrick J. Wong
  2023-12-27 13:12   ` [PATCH 08/47] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
                     ` (39 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:12 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extend the rmap update (RUI) log items with a new realtime flag that
indicates that the updates apply against the realtime rmapbt.  We'll
wire up the actual rmap code later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c     |   96 ++++++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_defer.h      |    1 
 libxfs/xfs_log_format.h |    6 ++-
 libxfs/xfs_refcount.c   |    4 +-
 libxfs/xfs_rmap.c       |   32 +++++++++++++---
 libxfs/xfs_rmap.h       |   12 ++++--
 6 files changed, 138 insertions(+), 13 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 5399a20f186..a82d23c17cf 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -29,6 +29,7 @@
 #include "xfs_swapext.h"
 #include "defer_item.h"
 #include "xfs_btree.h"
+#include "xfs_rtgroup.h"
 
 /* Dummy defer item ops, since we don't do logging. */
 
@@ -329,8 +330,23 @@ xfs_rmap_defer_add(
 
 	trace_xfs_rmap_defer(mp, ri);
 
-	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	/*
+	 * Deferred rmap updates for the realtime and data sections must use
+	 * separate transactions to finish deferred work because updates to
+	 * realtime metadata files can lock AGFs to allocate btree blocks and
+	 * we don't want that mixing with the AGF locks taken to finish data
+	 * section updates.
+	 */
+	if (ri->ri_realtime) {
+		xfs_rgnumber_t	rgno;
+
+		rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
+		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+		xfs_defer_add(tp, &ri->ri_list, &xfs_rtrmap_update_defer_type);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+		xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	}
 }
 
 /* Cancel a deferred rmap update. */
@@ -395,6 +411,82 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.cancel_item	= xfs_rmap_update_cancel_item,
 };
 
+/* Sort rmap intents by rtgroup. */
+static int
+xfs_rtrmap_update_diff_items(
+	void				*priv,
+	const struct list_head		*a,
+	const struct list_head		*b)
+{
+	struct xfs_rmap_intent		*ra = ri_entry(a);
+	struct xfs_rmap_intent		*rb = ri_entry(b);
+
+	return ra->ri_rtg->rtg_rgno - rb->ri_rtg->rtg_rgno;
+}
+
+static struct xfs_log_item *
+xfs_rtrmap_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	if (sort)
+		list_sort(mp, items, xfs_rtrmap_update_diff_items);
+	return NULL;
+}
+
+/* Cancel a deferred realtime rmap update. */
+STATIC void
+xfs_rtrmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+
+	xfs_rtgroup_put(ri->ri_rtg);
+	kmem_cache_free(xfs_rmap_intent_cache, ri);
+}
+
+/* Process a deferred realtime rmap update. */
+STATIC int
+xfs_rtrmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+	int				error;
+
+	error = xfs_rtrmap_finish_one(tp, ri, state);
+
+	xfs_rtrmap_update_cancel_item(item);
+	return error;
+}
+
+/* Clean up after calling xfs_rtrmap_finish_one. */
+STATIC void
+xfs_rtrmap_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	if (rcur)
+		xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
+	.name		= "rtrmap",
+	.create_intent	= xfs_rtrmap_update_create_intent,
+	.abort_intent	= xfs_rmap_update_abort_intent,
+	.create_done	= xfs_rmap_update_create_done,
+	.finish_item	= xfs_rtrmap_update_finish_item,
+	.finish_cleanup = xfs_rtrmap_finish_one_cleanup,
+	.cancel_item	= xfs_rtrmap_update_cancel_item,
+};
+
 /* Reference Counting */
 
 /* Sort refcount intents by AG. */
diff --git a/libxfs/xfs_defer.h b/libxfs/xfs_defer.h
index b4e1c386768..fddcb4cccbc 100644
--- a/libxfs/xfs_defer.h
+++ b/libxfs/xfs_defer.h
@@ -69,6 +69,7 @@ struct xfs_defer_op_type {
 extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
 extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index 1f5fe4a588e..ea4e88d6657 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -250,6 +250,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_SXD		0x1249  /* extent swap done */
 #define	XFS_LI_EFI_RT		0x124a	/* realtime extent free intent */
 #define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
+#define	XFS_LI_RUI_RT		0x124c	/* realtime rmap update intent */
+#define	XFS_LI_RUD_RT		0x124d	/* realtime rmap update done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -271,7 +273,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_SXI,		"XFS_LI_SXI" }, \
 	{ XFS_LI_SXD,		"XFS_LI_SXD" }, \
 	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
-	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }
+	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }, \
+	{ XFS_LI_RUI_RT,	"XFS_LI_RUI_RT" }, \
+	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }
 
 /*
  * Inode Log Item Format definitions.
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 9f933d953b9..0e8daab9986 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1886,7 +1886,7 @@ xfs_refcount_alloc_cow_extent(
 	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
@@ -1902,7 +1902,7 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
 	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 007f17cc644..00544d6a20f 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -2681,6 +2681,21 @@ xfs_rmap_finish_one(
 	return 0;
 }
 
+/*
+ * Process one of the deferred realtime rmap operations.  We pass back the
+ * btree cursor to reduce overhead.
+ */
+int
+xfs_rtrmap_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_rmap_intent		*ri,
+	struct xfs_btree_cur		**pcur)
+{
+	/* coming in a subsequent patch */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Don't defer an rmap if we aren't an rmap filesystem.
  */
@@ -2701,6 +2716,7 @@ __xfs_rmap_add(
 	struct xfs_trans		*tp,
 	enum xfs_rmap_intent_type	type,
 	uint64_t			owner,
+	bool				isrt,
 	int				whichfork,
 	struct xfs_bmbt_irec		*bmap)
 {
@@ -2712,6 +2728,7 @@ __xfs_rmap_add(
 	ri->ri_owner = owner;
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
+	ri->ri_realtime = isrt;
 
 	xfs_rmap_defer_add(tp, ri);
 }
@@ -2725,6 +2742,7 @@ xfs_rmap_map_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
 		return;
@@ -2732,7 +2750,7 @@ xfs_rmap_map_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_MAP_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /* Unmap an extent out of a file. */
@@ -2744,6 +2762,7 @@ xfs_rmap_unmap_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
 		return;
@@ -2751,7 +2770,7 @@ xfs_rmap_unmap_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_UNMAP_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /*
@@ -2769,6 +2788,7 @@ xfs_rmap_convert_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return;
@@ -2776,13 +2796,14 @@ xfs_rmap_convert_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_CONVERT_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /* Schedule the creation of an rmap for non-file data. */
 void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
+	bool			isrt,
 	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
@@ -2797,13 +2818,14 @@ xfs_rmap_alloc_extent(
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap);
 }
 
 /* Schedule the deletion of an rmap for non-file data. */
 void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
+	bool			isrt,
 	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
@@ -2818,7 +2840,7 @@ xfs_rmap_free_extent(
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap);
 }
 
 /* Compare rmap records.  Returns -1 if a < b, 1 if a > b, and 0 if equal. */
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index 762f2f40b6e..3719fc4cbc2 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -174,7 +174,11 @@ struct xfs_rmap_intent {
 	int					ri_whichfork;
 	uint64_t				ri_owner;
 	struct xfs_bmbt_irec			ri_bmap;
-	struct xfs_perag			*ri_pag;
+	union {
+		struct xfs_perag		*ri_pag;
+		struct xfs_rtgroup		*ri_rtg;
+	};
+	bool					ri_realtime;
 };
 
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
@@ -185,11 +189,13 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
 		xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
 		xfs_extlen_t len, uint64_t owner);
 
+int xfs_rtrmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
+		struct xfs_btree_cur **pcur);
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
 int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/47] xfs: add realtime reverse map inode to metadata directory
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:12   ` [PATCH 07/47] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
@ 2023-12-27 13:12   ` Darrick J. Wong
  2023-12-27 13:12   ` [PATCH 09/47] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
                     ` (38 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:12 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a metadir path to select the realtime rmap btree inode and load
it at mount time.  The rtrmapbt inode will have a unique extent format
code, which means that we also have to update the inode validation and
flush routines to look for it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c             |    8 ++++++++
 libxfs/xfs_format.h       |    6 ++++--
 libxfs/xfs_inode_buf.c    |   10 ++++++++++
 libxfs/xfs_inode_fork.c   |    9 +++++++++
 libxfs/xfs_rtgroup.h      |    3 +++
 libxfs/xfs_rtrmap_btree.c |   33 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |    4 ++++
 7 files changed, 71 insertions(+), 2 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index 9a4dfe02945..ba0b9a87f2d 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -969,6 +969,14 @@ libxfs_mount(
 void
 libxfs_rtmount_destroy(xfs_mount_t *mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_rmapip)
+			libxfs_imeta_irele(rtg->rtg_rmapip);
+		rtg->rtg_rmapip = NULL;
+	}
 	if (mp->m_rsumip)
 		libxfs_imeta_irele(mp->m_rsumip);
 	if (mp->m_rbmip)
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 5317c6438f0..d374240fc58 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1026,7 +1026,8 @@ enum xfs_dinode_fmt {
 	XFS_DINODE_FMT_LOCAL,		/* bulk data */
 	XFS_DINODE_FMT_EXTENTS,		/* struct xfs_bmbt_rec */
 	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
-	XFS_DINODE_FMT_UUID		/* added long ago, but never used */
+	XFS_DINODE_FMT_UUID,		/* added long ago, but never used */
+	XFS_DINODE_FMT_RMAP,		/* reverse mapping btree */
 };
 
 #define XFS_INODE_FORMAT_STR \
@@ -1034,7 +1035,8 @@ enum xfs_dinode_fmt {
 	{ XFS_DINODE_FMT_LOCAL,		"local" }, \
 	{ XFS_DINODE_FMT_EXTENTS,	"extent" }, \
 	{ XFS_DINODE_FMT_BTREE,		"btree" }, \
-	{ XFS_DINODE_FMT_UUID,		"uuid" }
+	{ XFS_DINODE_FMT_UUID,		"uuid" }, \
+	{ XFS_DINODE_FMT_RMAP,		"rmap" }
 
 /*
  * Max values for extnum and aextnum.
diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index 8d100595756..9755ae33813 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -408,6 +408,12 @@ xfs_dinode_verify_fork(
 		if (di_nextents > max_extents)
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!xfs_has_rtrmapbt(mp))
+			return __this_address;
+		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+			return __this_address;
+		break;
 	default:
 		return __this_address;
 	}
@@ -427,6 +433,10 @@ xfs_dinode_verify_forkoff(
 		if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp)))
+			return __this_address;
+		fallthrough;
 	case XFS_DINODE_FMT_LOCAL:	/* fall through ... */
 	case XFS_DINODE_FMT_EXTENTS:    /* fall through ... */
 	case XFS_DINODE_FMT_BTREE:
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index ec3a399e798..2e8f84e57a4 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -262,6 +262,11 @@ xfs_iformat_data_fork(
 			return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 		case XFS_DINODE_FMT_BTREE:
 			return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+		case XFS_DINODE_FMT_RMAP:
+			if (!xfs_has_rtrmapbt(ip->i_mount))
+				return -EFSCORRUPTED;
+			ASSERT(0); /* to be implemented later */
+			return -EFSCORRUPTED;
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -651,6 +656,10 @@ xfs_iflush_fork(
 		}
 		break;
 
+	case XFS_DINODE_FMT_RMAP:
+		ASSERT(0); /* to be implemented later */
+		break;
+
 	default:
 		ASSERT(0);
 		break;
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 0a63f14b5aa..77503bda355 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -22,6 +22,9 @@ struct xfs_rtgroup {
 	/* for rcu-safe freeing */
 	struct rcu_head		rcu_head;
 
+	/* reverse mapping btree inode */
+	struct xfs_inode	*rtg_rmapip;
+
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index a2cb497379f..d788ef60333 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -24,6 +24,7 @@
 #include "xfs_cksum.h"
 #include "xfs_rtgroup.h"
 #include "xfs_bmap.h"
+#include "xfs_imeta.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -474,6 +475,7 @@ xfs_rtrmapbt_commit_staged_btree(
 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_RMAP);
 
 	/*
 	 * Free any resources hanging off the real fork, then shallow-copy the
@@ -574,3 +576,34 @@ xfs_rtrmapbt_compute_maxlevels(
 	/* Add one level to handle the inode root level. */
 	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
 }
+
+#define XFS_RTRMAP_NAMELEN		17
+
+/* Create the metadata directory path for an rtrmap btree inode. */
+int
+xfs_rtrmapbt_create_path(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*path;
+	unsigned char		*fname;
+	int			error;
+
+	error = xfs_imeta_create_file_path(mp, 2, &path);
+	if (error)
+		return error;
+
+	fname = kmalloc(XFS_RTRMAP_NAMELEN, GFP_KERNEL);
+	if (!fname) {
+		xfs_imeta_free_path(path);
+		return -ENOMEM;
+	}
+
+	snprintf(fname, XFS_RTRMAP_NAMELEN, "%u.rmap", rgno);
+	path->im_path[0] = "realtime";
+	path->im_path[1] = fname;
+	path->im_dynamicmask = 0x2;
+	*pathp = path;
+	return 0;
+}
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index 0f732679719..29b69866018 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_cur;
 struct xfs_mount;
 struct xbtree_ifakeroot;
 struct xfs_rtgroup;
+struct xfs_imeta_path;
 
 /* rmaps only exist on crc enabled filesystems */
 #define XFS_RTRMAP_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
@@ -80,4 +81,7 @@ unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
 int __init xfs_rtrmapbt_init_cur_cache(void);
 void xfs_rtrmapbt_destroy_cur_cache(void);
 
+int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_imeta_path **pathp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/47] xfs: add metadata reservations for realtime rmap btrees
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:12   ` [PATCH 08/47] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
@ 2023-12-27 13:12   ` Darrick J. Wong
  2023-12-27 13:13   ` [PATCH 10/47] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
                     ` (37 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:12 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reserve some free blocks so that we will always have enough free blocks
in the data volume to handle expansion of the realtime rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrmap_btree.c |   39 +++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |    2 ++
 2 files changed, 41 insertions(+)


diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index d788ef60333..cb24c2f7351 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -607,3 +607,42 @@ xfs_rtrmapbt_create_path(
 	*pathp = path;
 	return 0;
 }
+
+/* Calculate the rtrmap btree size for some records. */
+static unsigned long long
+xfs_rtrmapbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+static unsigned long long
+xfs_rtrmapbt_max_size(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtblocks)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rtrmap_mxr[0] == 0)
+		return 0;
+
+	return xfs_rtrmapbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+xfs_filblks_t
+xfs_rtrmapbt_calc_reserves(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	/* 1/64th (~1.5%) of the space, and enough for 1 record per block. */
+	return max_t(xfs_filblks_t, mp->m_sb.sb_rgblocks >> 6,
+			xfs_rtrmapbt_max_size(mp, mp->m_sb.sb_rgblocks));
+}
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index 29b69866018..b7950e6d45d 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -84,4 +84,6 @@ void xfs_rtrmapbt_destroy_cur_cache(void);
 int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 		struct xfs_imeta_path **pathp);
 
+xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/47] xfs: wire up a new inode fork type for the realtime rmap
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 13:12   ` [PATCH 09/47] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
@ 2023-12-27 13:13   ` Darrick J. Wong
  2023-12-27 13:13   ` [PATCH 11/47] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
                     ` (36 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:13 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the pieces we need to embed the root of the realtime rmap
btree in an inode's data fork, complete with new fork type and
on-disk interpretation functions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h       |    8 ++
 libxfs/xfs_inode_fork.c   |    8 +-
 libxfs/xfs_ondisk.h       |    1 
 libxfs/xfs_rtrmap_btree.c |  220 +++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |  112 +++++++++++++++++++++++
 5 files changed, 346 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index d374240fc58..1c1910256a9 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1755,6 +1755,14 @@ typedef __be32 xfs_rmap_ptr_t;
  */
 #define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
 
+/*
+ * rtrmap root header, on-disk form only.
+ */
+struct xfs_rtrmap_root {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+};
+
 /* inode-based btree pointer type */
 typedef __be64 xfs_rtrmap_ptr_t;
 
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 2e8f84e57a4..0e6cd5bacdb 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -25,6 +25,7 @@
 #include "xfs_errortag.h"
 #include "xfs_health.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_rtrmap_btree.h"
 
 struct kmem_cache *xfs_ifork_cache;
 
@@ -265,8 +266,7 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_RMAP:
 			if (!xfs_has_rtrmapbt(ip->i_mount))
 				return -EFSCORRUPTED;
-			ASSERT(0); /* to be implemented later */
-			return -EFSCORRUPTED;
+			return xfs_iformat_rtrmap(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -657,7 +657,9 @@ xfs_iflush_fork(
 		break;
 
 	case XFS_DINODE_FMT_RMAP:
-		ASSERT(0); /* to be implemented later */
+		ASSERT(whichfork == XFS_DATA_FORK);
+		if (iip->ili_fields & brootflag[whichfork])
+			xfs_iflush_rtrmap(ip, dip);
 		break;
 
 	default:
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 897a1b72f8d..102a3574fc6 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -78,6 +78,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index cb24c2f7351..921bf7e1b11 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -83,6 +83,39 @@ xfs_rtrmapbt_get_maxrecs(
 	return cur->bc_mp->m_rtrmap_mxr[level != 0];
 }
 
+/* Calculate number of records in the ondisk realtime rmap btree inode root. */
+unsigned int
+xfs_rtrmapbt_droot_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= sizeof(struct xfs_rtrmap_root);
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_rmap_rec);
+	return blocklen / (2 * sizeof(struct xfs_rmap_key) +
+			sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_rtrmapbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrmapbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrmap_mxr[level != 0];
+	return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
 /*
  * Convert the ondisk record's offset field into the ondisk key's offset field.
  * Fork and bmbt are significant parts of the rmap record key, but written
@@ -375,6 +408,64 @@ xfs_rtrmapbt_keys_contiguous(
 				 be32_to_cpu(key2->rmap.rm_startblock));
 }
 
+/* Move the rtrmap btree root from one incore buffer to another. */
+static void
+xfs_rtrmapbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		level,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_rtrmap_droot_space(src_broot) <=
+			xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs && level > 0) {
+		sptr = xfs_rtrmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_rtrmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys/records that come after it.
+	 */
+	memcpy(dst_broot, src_broot, XFS_RTRMAP_BLOCK_LEN);
+
+	if (!numrecs)
+		return;
+
+	if (level == 0) {
+		sptr = xfs_rtrmap_rec_addr(src_broot, 1);
+		dptr = xfs_rtrmap_rec_addr(dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_rmap_rec));
+	} else {
+		sptr = xfs_rtrmap_key_addr(src_broot, 1);
+		dptr = xfs_rtrmap_key_addr(dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * 2 * sizeof(struct xfs_rmap_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_rtrmapbt_iroot_ops = {
+	.maxrecs		= xfs_rtrmapbt_maxrecs,
+	.size			= xfs_rtrmap_broot_space_calc,
+	.move			= xfs_rtrmapbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.rec_len		= sizeof(struct xfs_rmap_rec),
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
@@ -388,6 +479,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.free_block		= xfs_btree_free_imeta_block,
 	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
 	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
+	.get_dmaxrecs		= xfs_rtrmapbt_get_dmaxrecs,
 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
@@ -398,6 +490,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
+	.iroot_ops		= &xfs_rtrmapbt_iroot_ops,
 };
 
 /* Initialize a new rt rmap btree cursor. */
@@ -646,3 +739,130 @@ xfs_rtrmapbt_calc_reserves(
 	return max_t(xfs_filblks_t, mp->m_sb.sb_rgblocks >> 6,
 			xfs_rtrmapbt_max_size(mp, mp->m_sb.sb_rgblocks));
 }
+
+/* Convert on-disk form of btree root to in-memory form. */
+STATIC void
+xfs_rtrmapbt_from_disk(
+	struct xfs_inode	*ip,
+	struct xfs_rtrmap_root	*dblock,
+	unsigned int		dblocklen,
+	struct xfs_btree_block	*rblock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_rmap_key	*fkp;
+	__be64			*fpp;
+	struct xfs_rmap_key	*tkp;
+	__be64			*tpp;
+	struct xfs_rmap_rec	*frp;
+	struct xfs_rmap_rec	*trp;
+	unsigned int		rblocklen = xfs_rtrmap_broot_space(mp, dblock);
+	unsigned int		numrecs;
+	unsigned int		maxrecs;
+
+	xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
+
+	rblock->bb_level = dblock->bb_level;
+	rblock->bb_numrecs = dblock->bb_numrecs;
+	numrecs = be16_to_cpu(dblock->bb_numrecs);
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+		tkp = xfs_rtrmap_key_addr(rblock, 1);
+		fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+		tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+		trp = xfs_rtrmap_rec_addr(rblock, 1);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Load a realtime reverse mapping btree root in from disk. */
+int
+xfs_iformat_rtrmap(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	unsigned int		numrecs;
+	unsigned int		level;
+	int			dsize;
+
+	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+	numrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > mp->m_rtrmap_maxlevels ||
+	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize)
+		return -EFSCORRUPTED;
+
+	xfs_iroot_alloc(ip, XFS_DATA_FORK,
+			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
+	xfs_rtrmapbt_from_disk(ip, dfp, dsize, ifp->if_broot);
+	return 0;
+}
+
+/* Convert in-memory form of btree root to on-disk form. */
+void
+xfs_rtrmapbt_to_disk(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*rblock,
+	unsigned int		rblocklen,
+	struct xfs_rtrmap_root	*dblock,
+	unsigned int		dblocklen)
+{
+	struct xfs_rmap_key	*fkp;
+	__be64			*fpp;
+	struct xfs_rmap_key	*tkp;
+	__be64			*tpp;
+	struct xfs_rmap_rec	*frp;
+	struct xfs_rmap_rec	*trp;
+	unsigned int		numrecs;
+	unsigned int		maxrecs;
+
+	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
+	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+	numrecs = be16_to_cpu(rblock->bb_numrecs);
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrmap_key_addr(rblock, 1);
+		tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+		fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrmap_rec_addr(rblock, 1);
+		trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Flush a realtime reverse mapping btree root out to disk. */
+void
+xfs_iflush_rtrmap(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	ASSERT(ifp->if_broot != NULL);
+	ASSERT(ifp->if_broot_bytes > 0);
+	ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
+			xfs_inode_fork_size(ip, XFS_DATA_FORK));
+	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
+			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index b7950e6d45d..2b26a05f907 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -27,6 +27,7 @@ void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
 unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
 		bool leaf);
 void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf);
 
 /*
  * Addresses of records, keys, and pointers within an incore rtrmapbt block.
@@ -86,4 +87,115 @@ int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 
 xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
 
+/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */
+
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_droot_rec_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_droot_key_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)(block + 1) +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_droot_ptr_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrmap_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+		 (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		index,
+	unsigned int		block_size)
+{
+	return xfs_rtrmap_ptr_addr(bb, index,
+			xfs_rtrmapbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrmap_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = XFS_RTRMAP_BLOCK_LEN;
+
+	if (level > 0)
+		return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+					 sizeof(xfs_rtrmap_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb)
+{
+	return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrmap_droot_space_calc(
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = sizeof(struct xfs_rtrmap_root);
+
+	if (level > 0)
+		return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+					 sizeof(xfs_rtrmap_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrmap_droot_space(struct xfs_btree_block *bb)
+{
+	return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
+		unsigned int rblocklen, struct xfs_rtrmap_root *dblock,
+		unsigned int dblocklen);
+void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/47] xfs: allow inodes with zero extents but nonzero nblocks
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 13:13   ` [PATCH 10/47] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
@ 2023-12-27 13:13   ` Darrick J. Wong
  2023-12-27 13:13   ` [PATCH 12/47] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
                     ` (35 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:13 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Metadata inodes that store btrees will have zero extents and a nonzero
nblocks.  Adjust the inode verifier so that this combination is not
flagged.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_buf.c |   16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index 9755ae33813..e7bf8ff7046 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -598,9 +598,6 @@ xfs_dinode_verify(
 	if (mode && nextents + naextents > nblocks)
 		return __this_address;
 
-	if (nextents + naextents == 0 && nblocks != 0)
-		return __this_address;
-
 	if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
 		return __this_address;
 
@@ -704,6 +701,19 @@ xfs_dinode_verify(
 			return fa;
 	}
 
+	/* metadata inodes containing btrees always have zero extent count */
+	if (flags2 & XFS_DIFLAG2_METADIR) {
+		switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
+		case XFS_DINODE_FMT_RMAP:
+			break;
+		default:
+			if (nextents + naextents == 0 && nblocks != 0)
+				return __this_address;
+			break;
+		}
+	} else if (nextents + naextents == 0 && nblocks != 0)
+		return __this_address;
+
 	return NULL;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/47] xfs: use realtime EFI to free extents when realtime rmap is enabled
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-27 13:13   ` [PATCH 11/47] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
@ 2023-12-27 13:13   ` Darrick J. Wong
  2023-12-27 13:13   ` [PATCH 13/47] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
                     ` (34 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:13 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When rmap is enabled, XFS expects a certain order of operations, which
is: 1) remove the file mapping, 2) remove the reverse mapping, and then
3) free the blocks.  xfs_bmap_del_extent_real tries to do 1 and 3 in the
same transaction, which means that when rtrmap is enabled, we have to
use realtime EFIs to maintain the expected order.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap.c |   23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)


diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 323f60b1128..b84d1ad57f1 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -5051,7 +5051,6 @@ xfs_bmap_del_extent_real(
 {
 	xfs_fsblock_t		del_endblock=0;	/* first block past del */
 	xfs_fileoff_t		del_endoff;	/* first offset past del */
-	int			do_fx;	/* free extent at end of routine */
 	int			error;	/* error return value */
 	struct xfs_bmbt_irec	got;	/* current extent entry */
 	xfs_fileoff_t		got_endoff;	/* first offset past got */
@@ -5064,6 +5063,8 @@ xfs_bmap_del_extent_real(
 	uint			qfield;	/* quota field to update */
 	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
 	struct xfs_bmbt_irec	old;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
+	bool			want_free = !(bflags & XFS_BMAPI_REMAP);
 
 	*logflagsp = 0;
 
@@ -5095,18 +5096,24 @@ xfs_bmap_del_extent_real(
 		return -ENOSPC;
 
 	*logflagsp = XFS_ILOG_CORE;
-	if (xfs_ifork_is_realtime(ip, whichfork)) {
-		if (!(bflags & XFS_BMAPI_REMAP)) {
+	if (isrt) {
+		/*
+		 * Historically, we did not use EFIs to free realtime extents.
+		 * However, when reverse mapping is enabled, we must maintain
+		 * the same order of operations as the data device, which is:
+		 * Remove the file mapping, remove the reverse mapping, and
+		 * then free the blocks.  This means that we must delay the
+		 * freeing until after we've scheduled the rmap update.
+		 */
+		if (want_free && !xfs_has_rtrmapbt(mp)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
 			if (error)
 				return error;
+			want_free = false;
 		}
-
-		do_fx = 0;
 		qfield = XFS_TRANS_DQ_RTBCOUNT;
 	} else {
-		do_fx = 1;
 		qfield = XFS_TRANS_DQ_BCOUNT;
 	}
 	nblks = del->br_blockcount;
@@ -5256,7 +5263,7 @@ xfs_bmap_del_extent_real(
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
-	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+	if (want_free) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
@@ -5265,6 +5272,8 @@ xfs_bmap_del_extent_real(
 			if ((bflags & XFS_BMAPI_NODISCARD) ||
 			    del->br_state == XFS_EXT_UNWRITTEN)
 				efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+			if (isrt)
+				efi_flags |= XFS_FREE_EXTENT_REALTIME;
 
 			error = xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/47] xfs: wire up rmap map and unmap to the realtime rmapbt
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-27 13:13   ` [PATCH 12/47] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
@ 2023-12-27 13:13   ` Darrick J. Wong
  2023-12-27 13:14   ` [PATCH 14/47] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
                     ` (33 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:13 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Connect the map and unmap reverse-mapping operations to the realtime
rmapbt via the deferred operation callbacks.  This enables us to
perform rmap operations against the correct btree.

[Contains a minor bugfix from hch]

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c    |   37 ++++++++++++++++++++++++++++++++++---
 libxfs/xfs_rtgroup.c |    9 +++++++++
 libxfs/xfs_rtgroup.h |    5 ++++-
 3 files changed, 47 insertions(+), 4 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 00544d6a20f..cf2968cbd7f 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -25,6 +25,7 @@
 #include "xfs_health.h"
 #include "defer_item.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -2691,9 +2692,39 @@ xfs_rtrmap_finish_one(
 	struct xfs_rmap_intent		*ri,
 	struct xfs_btree_cur		**pcur)
 {
-	/* coming in a subsequent patch */
-	ASSERT(0);
-	return -EFSCORRUPTED;
+	struct xfs_owner_info		oinfo;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur = *pcur;
+	xfs_rgnumber_t			rgno;
+	xfs_rgblock_t			bno;
+	bool				unwritten;
+
+	trace_xfs_rmap_deferred(mp, ri);
+
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+		return -EIO;
+
+	/*
+	 * If we haven't gotten a cursor or the cursor rtgroup doesn't match
+	 * the startblock, get one now.
+	 */
+	if (rcur != NULL && rcur->bc_ino.rtg != ri->ri_rtg) {
+		xfs_btree_del_cursor(rcur, 0);
+		rcur = NULL;
+	}
+	if (rcur == NULL) {
+		xfs_rtgroup_lock(tp, ri->ri_rtg, XFS_RTGLOCK_RMAP);
+		*pcur = rcur = xfs_rtrmapbt_init_cursor(mp, tp, ri->ri_rtg,
+				ri->ri_rtg->rtg_rmapip);
+	}
+
+	xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
+			ri->ri_bmap.br_startoff);
+	unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
+	bno = xfs_rtb_to_rgbno(mp, ri->ri_bmap.br_startblock, &rgno);
+
+	return __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, &oinfo, unwritten);
 }
 
 /*
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 4ef6b9e9094..449cd57cf9e 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -549,6 +549,12 @@ xfs_rtgroup_lock(
 		xfs_rtbitmap_lock(tp, rtg->rtg_mount);
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
 		xfs_rtbitmap_lock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+
+	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip) {
+		xfs_ilock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+		if (tp)
+			xfs_trans_ijoin(tp, rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+	}
 }
 
 /* Unlock metadata inodes associated with this rt group. */
@@ -561,6 +567,9 @@ xfs_rtgroup_unlock(
 	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
 	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip)
+		xfs_iunlock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+
 	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
 		xfs_rtbitmap_unlock(rtg->rtg_mount);
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 77503bda355..559a5135820 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -231,9 +231,12 @@ int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 #define XFS_RTGLOCK_BITMAP		(1U << 0)
 /* Lock the rt bitmap inode in shared mode */
 #define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
+/* Lock the rt rmap inode in exclusive mode */
+#define XFS_RTGLOCK_RMAP		(1U << 2)
 
 #define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
-				 XFS_RTGLOCK_BITMAP_SHARED)
+				 XFS_RTGLOCK_BITMAP_SHARED | \
+				 XFS_RTGLOCK_RMAP)
 
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/47] xfs: create routine to allocate and initialize a realtime rmap btree inode
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-27 13:13   ` [PATCH 13/47] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
@ 2023-12-27 13:14   ` Darrick J. Wong
  2023-12-27 13:14   ` [PATCH 15/47] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
                     ` (32 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:14 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a library routine to allocate and initialize an empty realtime
rmapbt inode.  We'll use this for mkfs and repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrmap_btree.c |   34 ++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |    4 ++++
 2 files changed, 38 insertions(+)


diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index 921bf7e1b11..832a58cfe13 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -866,3 +866,37 @@ xfs_iflush_rtrmap(
 	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
 			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
 }
+
+/*
+ * Create a realtime rmap btree inode.
+ *
+ * Regardless of the return value, the caller must clean up @upd.  If a new
+ * inode is returned through @*ipp, the caller must finish setting up the incore
+ * inode and release it.
+ */
+int
+xfs_rtrmapbt_create(
+	struct xfs_imeta_update	*upd,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = upd->mp;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	error = xfs_imeta_create(upd, S_IFREG, ipp);
+	if (error)
+		return error;
+
+	ifp = xfs_ifork_ptr(upd->ip, XFS_DATA_FORK);
+	ifp->if_format = XFS_DINODE_FMT_RMAP;
+	ASSERT(ifp->if_broot_bytes == 0);
+	ASSERT(ifp->if_bytes == 0);
+
+	/* Initialize the empty incore btree root. */
+	xfs_iroot_alloc(upd->ip, XFS_DATA_FORK,
+			xfs_rtrmap_broot_space_calc(mp, 0, 0));
+	xfs_btree_init_block(mp, ifp->if_broot, &xfs_rtrmapbt_ops, 0, 0,
+			upd->ip->i_ino);
+	xfs_trans_log_inode(upd->tp, upd->ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	return 0;
+}
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index 2b26a05f907..108ab8c0aea 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -198,4 +198,8 @@ void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
 		unsigned int dblocklen);
 void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
 
+struct xfs_imeta_update;
+
+int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/47] xfs: report realtime rmap btree corruption errors to the health system
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-27 13:14   ` [PATCH 14/47] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
@ 2023-12-27 13:14   ` Darrick J. Wong
  2023-12-27 13:14   ` [PATCH 16/47] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
                     ` (31 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:14 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we encounter corrupt realtime rmap btree blocks, we should
report that to the health monitoring system for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs_staging.h               |    1 +
 libxfs/xfs_health.h                   |    4 +++-
 libxfs/xfs_inode_fork.c               |    4 +++-
 libxfs/xfs_rtrmap_btree.c             |    5 ++++-
 man/man2/ioctl_xfs_rtgroup_geometry.2 |    3 +++
 5 files changed, 14 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h
index 1f573314877..9d5d6af62b6 100644
--- a/libxfs/xfs_fs_staging.h
+++ b/libxfs/xfs_fs_staging.h
@@ -216,6 +216,7 @@ struct xfs_rtgroup_geometry {
 };
 #define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
 #define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
+#define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1 << 2)  /* reverse mappings */
 
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
 
diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index 1e9938a417b..aeeb6276977 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -68,6 +68,7 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
 #define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
+#define XFS_SICK_RT_RMAPBT	(1 << 3)  /* reverse mappings */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -113,7 +114,8 @@ struct xfs_rtgroup;
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY | \
-				 XFS_SICK_RT_SUPER)
+				 XFS_SICK_RT_SUPER | \
+				 XFS_SICK_RT_RMAPBT)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 0e6cd5bacdb..127571527cf 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -264,8 +264,10 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_BTREE:
 			return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 		case XFS_DINODE_FMT_RMAP:
-			if (!xfs_has_rtrmapbt(ip->i_mount))
+			if (!xfs_has_rtrmapbt(ip->i_mount)) {
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 				return -EFSCORRUPTED;
+			}
 			return xfs_iformat_rtrmap(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index 832a58cfe13..5a25791baa5 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -25,6 +25,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_bmap.h"
 #include "xfs_imeta.h"
+#include "xfs_health.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -798,8 +799,10 @@ xfs_iformat_rtrmap(
 	level = be16_to_cpu(dfp->bb_level);
 
 	if (level > mp->m_rtrmap_maxlevels ||
-	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize)
+	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
+	}
 
 	xfs_iroot_alloc(ip, XFS_DATA_FORK,
 			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
diff --git a/man/man2/ioctl_xfs_rtgroup_geometry.2 b/man/man2/ioctl_xfs_rtgroup_geometry.2
index ccd931d1e17..38753b93055 100644
--- a/man/man2/ioctl_xfs_rtgroup_geometry.2
+++ b/man/man2/ioctl_xfs_rtgroup_geometry.2
@@ -73,6 +73,9 @@ Realtime group superblock.
 .TP
 .B XFS_RTGROUP_GEOM_SICK_BITMAP
 Realtime bitmap for this group.
+.TP
+.B XFS_RTGROUP_GEOM_SICK_RTRMAPBT
+Reverse mapping btree for this group.
 .RE
 .SH RETURN VALUE
 On error, \-1 is returned, and


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/47] xfs: allow queued realtime intents to drain before scrubbing
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-27 13:14   ` [PATCH 15/47] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
@ 2023-12-27 13:14   ` Darrick J. Wong
  2023-12-27 13:14   ` [PATCH 17/47] xfs: scrub the realtime rmapbt Darrick J. Wong
                     ` (30 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:14 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When a writer thread executes a chain of log intent items for the
realtime volume, the ILOCKs taken during each step are for each rt
metadata file, not the entire rt volume itself.  Although scrub takes
all rt metadata ILOCKs, this isn't sufficient to guard against scrub
checking the rt volume while that writer thread is in the middle of
finishing a chain because there's no higher level locking primitive
guarding the realtime volume.

When there's a collision, cross-referencing between data structures
(e.g. rtrmapbt and rtrefcountbt) yields false corruption events; if
repair is running, this results in incorrect repairs, which is
catastrophic.

Fix this by adding to the mount structure the same drain that we use to
protect scrub against concurrent AG updates, but this time for the
realtime volume.

[Contains a few cleanups from hch]

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h  |   12 ++++++++++++
 libxfs/defer_item.c  |   31 +++++++++++++------------------
 libxfs/xfs_rtgroup.c |    2 ++
 libxfs/xfs_rtgroup.h |    9 +++++++++
 4 files changed, 36 insertions(+), 18 deletions(-)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 4e4da5bc4fa..07f9e33b8b2 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -333,6 +333,18 @@ struct xfs_defer_drain { /* empty */ };
 static inline void xfs_perag_intent_hold(struct xfs_perag *pag) {}
 static inline void xfs_perag_intent_rele(struct xfs_perag *pag) {}
 
+struct xfs_rtgroup;
+
+#define xfs_rtgroup_intent_get(mp, rgno) \
+	xfs_rtgroup_get((mp), xfs_rtb_to_rgno((mp), (rgno)))
+#define xfs_rtgroup_intent_put(rtg)		xfs_rtgroup_put(rtg)
+
+static inline void xfs_rtgroup_intent_hold(struct xfs_rtgroup *rtg) { }
+static inline void xfs_rtgroup_intent_rele(struct xfs_rtgroup *rtg) { }
+
+#define xfs_drain_free(dr)		((void)0)
+#define xfs_drain_init(dr)		((void)0)
+
 static inline void libxfs_buftarg_drain(struct xfs_buftarg *btp)
 {
 	cache_purge(btp->bcache);
diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index a82d23c17cf..e7270d02c4b 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -88,11 +88,8 @@ xfs_extent_free_defer_add(
 	struct xfs_mount		*mp = tp->t_mountp;
 
 	if (xfs_efi_is_realtime(xefi)) {
-		xfs_rgnumber_t		rgno;
-
-		rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
-		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
-
+		xefi->xefi_rtg = xfs_rtgroup_intent_get(mp,
+						xefi->xefi_startblock);
 		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
 				&xfs_rtextent_free_defer_type);
 		return;
@@ -204,7 +201,7 @@ xfs_rtextent_free_cancel_item(
 {
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
-	xfs_rtgroup_put(xefi->xefi_rtg);
+	xfs_rtgroup_intent_put(xefi->xefi_rtg);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 }
 
@@ -338,13 +335,12 @@ xfs_rmap_defer_add(
 	 * section updates.
 	 */
 	if (ri->ri_realtime) {
-		xfs_rgnumber_t	rgno;
-
-		rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
-		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp,
+						ri->ri_bmap.br_startblock);
 		xfs_defer_add(tp, &ri->ri_list, &xfs_rtrmap_update_defer_type);
 	} else {
-		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+		ri->ri_pag = xfs_perag_intent_get(mp,
+						ri->ri_bmap.br_startblock);
 		xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 	}
 }
@@ -445,7 +441,7 @@ xfs_rtrmap_update_cancel_item(
 {
 	struct xfs_rmap_intent		*ri = ri_entry(item);
 
-	xfs_rtgroup_put(ri->ri_rtg);
+	xfs_rtgroup_intent_put(ri->ri_rtg);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 }
 
@@ -656,10 +652,8 @@ xfs_bmap_update_get_group(
 {
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
 		if (xfs_has_rtgroups(mp)) {
-			xfs_rgnumber_t	rgno;
-
-			rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
-			bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+			bi->bi_rtg = xfs_rtgroup_intent_get(mp,
+						bi->bi_bmap.br_startblock);
 		} else {
 			bi->bi_rtg = NULL;
 		}
@@ -695,8 +689,9 @@ xfs_bmap_update_put_group(
 	struct xfs_bmap_intent	*bi)
 {
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
-		if (xfs_has_rtgroups(bi->bi_owner->i_mount))
-			xfs_rtgroup_put(bi->bi_rtg);
+		if (xfs_has_rtgroups(bi->bi_owner->i_mount)) {
+			xfs_rtgroup_intent_put(bi->bi_rtg);
+		}
 		return;
 	}
 
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 449cd57cf9e..1acf98f8c7e 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -159,6 +159,7 @@ xfs_initialize_rtgroups(
 		/* Place kernel structure only init below this point. */
 		spin_lock_init(&rtg->rtg_state_lock);
 		init_waitqueue_head(&rtg->rtg_active_wq);
+		xfs_defer_drain_init(&rtg->rtg_intents_drain);
 #endif /* __KERNEL__ */
 
 		/* Active ref owned by mount indicates rtgroup is online. */
@@ -213,6 +214,7 @@ xfs_free_rtgroups(
 		spin_unlock(&mp->m_rtgroup_lock);
 		ASSERT(rtg);
 		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+		xfs_defer_drain_free(&rtg->rtg_intents_drain);
 
 		/* drop the mount's active reference */
 		xfs_rtgroup_rele(rtg);
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 559a5135820..9487c2e0047 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -39,6 +39,15 @@ struct xfs_rtgroup {
 #ifdef __KERNEL__
 	/* -- kernel only structures below this line -- */
 	spinlock_t		rtg_state_lock;
+
+	/*
+	 * We use xfs_drain to track the number of deferred log intent items
+	 * that have been queued (but not yet processed) so that waiters (e.g.
+	 * scrub) will not lock resources when other threads are in the middle
+	 * of processing a chain of intent items only to find momentary
+	 * inconsistencies.
+	 */
+	struct xfs_defer_drain	rtg_intents_drain;
 #endif /* __KERNEL__ */
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/47] xfs: scrub the realtime rmapbt
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-27 13:14   ` [PATCH 16/47] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
@ 2023-12-27 13:14   ` Darrick J. Wong
  2023-12-27 13:15   ` [PATCH 18/47] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
                     ` (29 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:14 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check the realtime reverse mapping btree against the rtbitmap, and
modify the rtbitmap scrub to check against the rtrmapbt.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h                     |    3 ++-
 man/man2/ioctl_xfs_scrub_metadata.2 |    8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 102b9273360..dcf048aae8c 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -737,9 +737,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 #define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
+#define XFS_SCRUB_TYPE_RTRMAPBT	32	/* rtgroup reverse mapping btree */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	32
+#define XFS_SCRUB_TYPE_NR	33
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/man/man2/ioctl_xfs_scrub_metadata.2 b/man/man2/ioctl_xfs_scrub_metadata.2
index dc439897c98..79875968d1c 100644
--- a/man/man2/ioctl_xfs_scrub_metadata.2
+++ b/man/man2/ioctl_xfs_scrub_metadata.2
@@ -98,9 +98,13 @@ The realtime allocation group number must be given in
 must be zero.
 
 .PP
-.TP
+.nf
 .B XFS_SCRUB_TYPE_RGBITMAP
-Examine a given realtime allocation group's free space bitmap.
+.fi
+.TP
+.B XFS_SCRUB_TYPE_RTRMAPBT
+Examine a given realtime allocation group's free space bitmap or reverse
+mapping btree, respectively.
 Records are checked for obviously incorrect values and cross-referenced
 with other allocation group metadata records to ensure that there are no
 conflicts.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/47] xfs: scrub the metadir path of rt rmap btree files
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-27 13:14   ` [PATCH 17/47] xfs: scrub the realtime rmapbt Darrick J. Wong
@ 2023-12-27 13:15   ` Darrick J. Wong
  2023-12-27 13:15   ` [PATCH 19/47] xfs: online repair of the realtime rmap btree Darrick J. Wong
                     ` (28 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:15 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata
directory tree path to the rmap btree file for each rt group.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c |    5 +++++
 libxfs/xfs_fs.h |    3 ++-
 scrub/scrub.c   |    3 +++
 3 files changed, 10 insertions(+), 1 deletion(-)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index b22770d639c..8822fc0088c 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -197,6 +197,11 @@ const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
 		.descr	= "project quota metadir path",
 		.group	= XFROG_SCRUB_GROUP_FS,
 	},
+	[XFS_SCRUB_METAPATH_RTRMAPBT]	= {
+		.name	= "rtrmapbt",
+		.descr	= "rmap btree file metadir path",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 /* Invoke the scrub ioctl.  Returns zero or negative error code. */
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index dcf048aae8c..0bbdbfb0a8a 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -804,9 +804,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_METAPATH_USRQUOTA	2
 #define XFS_SCRUB_METAPATH_GRPQUOTA	3
 #define XFS_SCRUB_METAPATH_PRJQUOTA	4
+#define XFS_SCRUB_METAPATH_RTRMAPBT	5
 
 /* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR		5
+#define XFS_SCRUB_METAPATH_NR		6
 
 /*
  * ioctl limits
diff --git a/scrub/scrub.c b/scrub/scrub.c
index 8f9fde80263..f20910de855 100644
--- a/scrub/scrub.c
+++ b/scrub/scrub.c
@@ -66,6 +66,9 @@ format_metapath_descr(
 				(unsigned long long)vhead->svh_ino);
 
 	sc = &xfrog_metapaths[vhead->svh_ino];
+	if (sc->group == XFROG_SCRUB_GROUP_RTGROUP)
+		return snprintf(buf, buflen, _("rtgroup %u %s"),
+				vhead->svh_agno, _(sc->descr));
 	return snprintf(buf, buflen, "%s", _(sc->descr));
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/47] xfs: online repair of the realtime rmap btree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-27 13:15   ` [PATCH 18/47] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
@ 2023-12-27 13:15   ` Darrick J. Wong
  2023-12-27 13:15   ` [PATCH 20/47] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
                     ` (27 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:15 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Repair the realtime rmap btree while mounted.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrmap_btree.c |    2 +-
 libxfs/xfs_rtrmap_btree.h |    3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index 5a25791baa5..0393da8837a 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -703,7 +703,7 @@ xfs_rtrmapbt_create_path(
 }
 
 /* Calculate the rtrmap btree size for some records. */
-static unsigned long long
+unsigned long long
 xfs_rtrmapbt_calc_size(
 	struct xfs_mount	*mp,
 	unsigned long long	len)
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index 108ab8c0aea..5aec719be05 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -202,4 +202,7 @@ struct xfs_imeta_update;
 
 int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
 
+unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/47] xfs: create a shadow rmap btree during realtime rmap repair
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-27 13:15   ` [PATCH 19/47] xfs: online repair of the realtime rmap btree Darrick J. Wong
@ 2023-12-27 13:15   ` Darrick J. Wong
  2023-12-27 13:15   ` [PATCH 21/47] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
                     ` (26 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:15 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an in-memory btree of rmap records instead of an array.  This
enables us to do live record collection instead of freezing the fs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfbtree.c          |    2 +
 libxfs/xfs_btree.c        |    2 +
 libxfs/xfs_btree.h        |    1 
 libxfs/xfs_rmap.c         |    5 +-
 libxfs/xfs_rtrmap_btree.c |  123 +++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h |    9 +++
 6 files changed, 141 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfbtree.c b/libxfs/xfbtree.c
index b4762393b3a..cdc37561a62 100644
--- a/libxfs/xfbtree.c
+++ b/libxfs/xfbtree.c
@@ -253,6 +253,8 @@ xfbtree_dup_cursor(
 
 	if (cur->bc_mem.pag)
 		ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+	if (cur->bc_mem.rtg)
+		ncur->bc_mem.rtg = xfs_rtgroup_hold(cur->bc_mem.rtg);
 
 	return ncur;
 }
diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index f599dd17d30..450c48ceaf1 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -487,6 +487,8 @@ xfs_btree_del_cursor(
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE) {
 		if (cur->bc_mem.pag)
 			xfs_perag_put(cur->bc_mem.pag);
+		if (cur->bc_mem.rtg)
+			xfs_rtgroup_put(cur->bc_mem.rtg);
 	}
 	kmem_cache_free(cur->bc_cache, cur);
 }
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index 3559cf5d3a6..4753a5c8476 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -269,6 +269,7 @@ struct xfs_btree_cur_mem {
 	struct xfbtree			*xfbtree;
 	struct xfs_buf			*head_bp;
 	struct xfs_perag		*pag;
+	struct xfs_rtgroup		*rtg;
 };
 
 struct xfs_btree_level {
diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index cf2968cbd7f..42713dd17f4 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -328,8 +328,11 @@ xfs_rmap_check_btrec(
 	struct xfs_btree_cur		*cur,
 	const struct xfs_rmap_irec	*irec)
 {
-	if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+	if (cur->bc_btnum == XFS_BTNUM_RTRMAP) {
+		if (cur->bc_flags & XFS_BTREE_IN_XFILE)
+			return xfs_rtrmap_check_irec(cur->bc_mem.rtg, irec);
 		return xfs_rtrmap_check_irec(cur->bc_ino.rtg, irec);
+	}
 
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index 0393da8837a..b5adfe362a7 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -26,6 +26,9 @@
 #include "xfs_bmap.h"
 #include "xfs_imeta.h"
 #include "xfs_health.h"
+#include "xfile.h"
+#include "xfbtree.h"
+#include "xfs_btree_mem.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -555,6 +558,126 @@ xfs_rtrmapbt_stage_cursor(
 	return cur;
 }
 
+#ifdef CONFIG_XFS_BTREE_IN_XFILE
+/*
+ * Validate an in-memory realtime rmap btree block.  Callers are allowed to
+ * generate an in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rtrmapbt_mem_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	unsigned int		level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	level = be16_to_cpu(block->bb_level);
+	if (xfs_has_rmapbt(mp)) {
+		if (level >= mp->m_rtrmap_maxlevels)
+			return __this_address;
+	} else {
+		if (level >= xfs_rtrmapbt_maxlevels_ondisk())
+			return __this_address;
+	}
+
+	return xfbtree_lblock_verify(bp,
+			xfs_rtrmapbt_maxrecs(mp, xfo_to_b(1), level == 0));
+}
+
+static void
+xfs_rtrmapbt_mem_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa = xfs_rtrmapbt_mem_verify(bp);
+
+	if (fa)
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
+	.name			= "xfs_rtrmapbt_mem",
+	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+	.verify_read		= xfs_rtrmapbt_mem_rw_verify,
+	.verify_write		= xfs_rtrmapbt_mem_rw_verify,
+	.verify_struct		= xfs_rtrmapbt_mem_verify,
+};
+
+static const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
+	.rec_len		= sizeof(struct xfs_rmap_rec),
+	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.geom_flags		= XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING |
+				  XFS_BTREE_LONG_PTRS | XFS_BTREE_IN_XFILE,
+
+	.dup_cursor		= xfbtree_dup_cursor,
+	.set_root		= xfbtree_set_root,
+	.alloc_block		= xfbtree_alloc_block,
+	.free_block		= xfbtree_free_block,
+	.get_minrecs		= xfbtree_get_minrecs,
+	.get_maxrecs		= xfbtree_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+	.key_diff		= xfs_rtrmapbt_key_diff,
+	.buf_ops		= &xfs_rtrmapbt_mem_buf_ops,
+	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
+	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_mem_cursor(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*head_bp,
+	struct xfbtree		*xfbtree)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	/* Overlapping btree; 2 keys per pointer. */
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTRMAP,
+			&xfs_rtrmapbt_mem_ops, mp->m_rtrmap_maxlevels,
+			xfs_rtrmapbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+	cur->bc_mem.xfbtree = xfbtree;
+	cur->bc_mem.head_bp = head_bp;
+	cur->bc_nlevels = xfs_btree_mem_head_nlevels(head_bp);
+
+	cur->bc_mem.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+int
+xfs_rtrmapbt_mem_create(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_buftarg	*target,
+	struct xfbtree		**xfbtreep)
+{
+	struct xfbtree_config	cfg = {
+		.btree_ops	= &xfs_rtrmapbt_mem_ops,
+		.target		= target,
+		.flags		= XFBTREE_DIRECT_MAP,
+		.owner		= rgno,
+	};
+
+	return xfbtree_create(mp, &cfg, xfbtreep);
+}
+#endif /* CONFIG_XFS_BTREE_IN_XFILE */
+
 /*
  * Install a new rt reverse mapping btree root.  Caller is responsible for
  * invalidating and freeing the old btree blocks.
diff --git a/libxfs/xfs_rtrmap_btree.h b/libxfs/xfs_rtrmap_btree.h
index 5aec719be05..b0a8e8d89f9 100644
--- a/libxfs/xfs_rtrmap_btree.h
+++ b/libxfs/xfs_rtrmap_btree.h
@@ -205,4 +205,13 @@ int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
 unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+#ifdef CONFIG_XFS_BTREE_IN_XFILE
+struct xfbtree;
+struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
+		struct xfs_trans *tp, struct xfs_buf *mhead_bp,
+		struct xfbtree *xfbtree);
+int xfs_rtrmapbt_mem_create(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_buftarg *target, struct xfbtree **xfbtreep);
+#endif /* CONFIG_XFS_BTREE_IN_XFILE */
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/47] xfs: hook live realtime rmap operations during a repair operation
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-27 13:15   ` [PATCH 20/47] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
@ 2023-12-27 13:15   ` Darrick J. Wong
  2023-12-27 13:16   ` [PATCH 22/47] xfs_db: display the realtime rmap btree contents Darrick J. Wong
                     ` (25 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:15 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Hook the regular realtime rmap code when an rtrmapbt repair operation is
running so that we can unlock the AGF buffer to scan the filesystem and
keep the in-memory btree up to date during the scan.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c    |   56 +++++++++++++++++++++++++++++++++++++++++++++++---
 libxfs/xfs_rmap.h    |    6 +++++
 libxfs/xfs_rtgroup.c |    1 +
 libxfs/xfs_rtgroup.h |    3 +++
 4 files changed, 63 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 42713dd17f4..0056dc08662 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -919,8 +919,7 @@ xfs_rmap_update_hook(
 			.oinfo		= *oinfo, /* struct copy */
 		};
 
-		if (pag)
-			xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+		xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
 	}
 }
 
@@ -945,6 +944,50 @@ xfs_rmap_hook_del(
 # define xfs_rmap_update_hook(t, p, o, s, b, u, oi)	do { } while (0)
 #endif /* CONFIG_XFS_LIVE_HOOKS */
 
+# if defined(CONFIG_XFS_LIVE_HOOKS) && defined(CONFIG_XFS_RT)
+static inline void
+xfs_rtrmap_update_hook(
+	struct xfs_trans		*tp,
+	struct xfs_rtgroup		*rtg,
+	enum xfs_rmap_intent_type	op,
+	xfs_rgblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
+{
+	if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+		struct xfs_rmap_update_params	p = {
+			.startblock	= startblock,
+			.blockcount	= blockcount,
+			.unwritten	= unwritten,
+			.oinfo		= *oinfo, /* struct copy */
+		};
+
+		xfs_hooks_call(&rtg->rtg_rmap_update_hooks, op, &p);
+	}
+}
+
+/* Call the specified function during a rt reverse mapping update. */
+int
+xfs_rtrmap_hook_add(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rmap_hook	*hook)
+{
+	return xfs_hooks_add(&rtg->rtg_rmap_update_hooks, &hook->update_hook);
+}
+
+/* Stop calling the specified function during a rt reverse mapping update. */
+void
+xfs_rtrmap_hook_del(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rmap_hook	*hook)
+{
+	xfs_hooks_del(&rtg->rtg_rmap_update_hooks, &hook->update_hook);
+}
+#else
+# define xfs_rtrmap_update_hook(t, r, o, s, b, u, oi)	do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS && CONFIG_XFS_RT */
+
 /*
  * Remove a reference to an extent in the rmap btree.
  */
@@ -2701,6 +2744,7 @@ xfs_rtrmap_finish_one(
 	xfs_rgnumber_t			rgno;
 	xfs_rgblock_t			bno;
 	bool				unwritten;
+	int				error;
 
 	trace_xfs_rmap_deferred(mp, ri);
 
@@ -2726,8 +2770,14 @@ xfs_rtrmap_finish_one(
 	unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
 	bno = xfs_rtb_to_rgbno(mp, ri->ri_bmap.br_startblock, &rgno);
 
-	return __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+	error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
 			ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+	if (error)
+		return error;
+
+	xfs_rtrmap_update_hook(tp, ri->ri_rtg, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+	return 0;
 }
 
 /*
diff --git a/libxfs/xfs_rmap.h b/libxfs/xfs_rmap.h
index 3719fc4cbc2..9e19e657eef 100644
--- a/libxfs/xfs_rmap.h
+++ b/libxfs/xfs_rmap.h
@@ -275,6 +275,12 @@ void xfs_rmap_hook_enable(void);
 
 int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
 void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+
+# ifdef CONFIG_XFS_RT
+int xfs_rtrmap_hook_add(struct xfs_rtgroup *rtg, struct xfs_rmap_hook *hook);
+void xfs_rtrmap_hook_del(struct xfs_rtgroup *rtg, struct xfs_rmap_hook *hook);
+# endif /* CONFIG_XFS_RT */
+
 #endif
 
 #endif	/* __XFS_RMAP_H__ */
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 1acf98f8c7e..03eb776ef8b 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -160,6 +160,7 @@ xfs_initialize_rtgroups(
 		spin_lock_init(&rtg->rtg_state_lock);
 		init_waitqueue_head(&rtg->rtg_active_wq);
 		xfs_defer_drain_init(&rtg->rtg_intents_drain);
+		xfs_hooks_init(&rtg->rtg_rmap_update_hooks);
 #endif /* __KERNEL__ */
 
 		/* Active ref owned by mount indicates rtgroup is online. */
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 9487c2e0047..3522527e553 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -48,6 +48,9 @@ struct xfs_rtgroup {
 	 * inconsistencies.
 	 */
 	struct xfs_defer_drain	rtg_intents_drain;
+
+	/* Hook to feed rt rmapbt updates to an active online repair. */
+	struct xfs_hooks	rtg_rmap_update_hooks;
 #endif /* __KERNEL__ */
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/47] xfs_db: display the realtime rmap btree contents
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-27 13:15   ` [PATCH 21/47] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
@ 2023-12-27 13:16   ` Darrick J. Wong
  2023-12-27 13:16   ` [PATCH 23/47] xfs_db: support the realtime rmapbt Darrick J. Wong
                     ` (24 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:16 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement all the code we need to dump rtrmapbt contents, starting
from the root inode.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/bmroot.c              |  149 ++++++++++++++++++++++++++++++++++++++++++++++
 db/bmroot.h              |    2 +
 db/btblock.c             |  100 +++++++++++++++++++++++++++++++
 db/btblock.h             |    5 ++
 db/field.c               |   11 +++
 db/field.h               |    5 ++
 db/inode.c               |  102 +++++++++++++++++++++++++++++++
 db/inode.h               |    3 +
 db/type.c                |    5 ++
 db/type.h                |    1 
 libxfs/libxfs_api_defs.h |    4 +
 man/man8/xfs_db.8        |   60 ++++++++++++++++++-
 12 files changed, 443 insertions(+), 4 deletions(-)


diff --git a/db/bmroot.c b/db/bmroot.c
index 7ef07da181e..19490bd2499 100644
--- a/db/bmroot.c
+++ b/db/bmroot.c
@@ -24,6 +24,13 @@ static int	bmrootd_key_offset(void *obj, int startoff, int idx);
 static int	bmrootd_ptr_count(void *obj, int startoff);
 static int	bmrootd_ptr_offset(void *obj, int startoff, int idx);
 
+static int	rtrmaproot_rec_count(void *obj, int startoff);
+static int	rtrmaproot_rec_offset(void *obj, int startoff, int idx);
+static int	rtrmaproot_key_count(void *obj, int startoff);
+static int	rtrmaproot_key_offset(void *obj, int startoff, int idx);
+static int	rtrmaproot_ptr_count(void *obj, int startoff);
+static int	rtrmaproot_ptr_offset(void *obj, int startoff, int idx);
+
 #define	OFF(f)	bitize(offsetof(xfs_bmdr_block_t, bb_ ## f))
 const field_t	bmroota_flds[] = {
 	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
@@ -54,6 +61,20 @@ const field_t	bmrootd_key_flds[] = {
 	{ NULL }
 };
 
+/* realtime rmap btree root */
+const field_t	rtrmaproot_flds[] = {
+	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
+	{ "numrecs", FLDT_UINT16D, OI(OFF(numrecs)), C1, 0, TYP_NONE },
+	{ "recs", FLDT_RTRMAPBTREC, rtrmaproot_rec_offset, rtrmaproot_rec_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "keys", FLDT_RTRMAPBTKEY, rtrmaproot_key_offset, rtrmaproot_key_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "ptrs", FLDT_RTRMAPBTPTR, rtrmaproot_ptr_offset, rtrmaproot_ptr_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_RTRMAPBT },
+	{ NULL }
+};
+#undef OFF
+
 static int
 bmroota_key_count(
 	void			*obj,
@@ -241,3 +262,131 @@ bmrootd_size(
 	dip = obj;
 	return bitize((int)XFS_DFORK_DSIZE(dip, mp));
 }
+
+/* realtime rmap root */
+static int
+rtrmaproot_rec_count(
+	void			*obj,
+	int			startoff)
+{
+	struct xfs_rtrmap_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode	*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) > 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrmaproot_rec_offset(
+	void			*obj,
+	int			startoff,
+	int			idx)
+{
+	struct xfs_rtrmap_root	*block;
+	struct xfs_rmap_rec	*kp;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) == 0);
+	kp = xfs_rtrmap_droot_rec_addr(block, idx);
+	return bitize((int)((char *)kp - (char *)block));
+}
+
+static int
+rtrmaproot_key_count(
+	void			*obj,
+	int			startoff)
+{
+	struct xfs_rtrmap_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode	*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) == 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrmaproot_key_offset(
+	void			*obj,
+	int			startoff,
+	int			idx)
+{
+	struct xfs_rtrmap_root	*block;
+	struct xfs_rmap_key	*kp;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+	kp = xfs_rtrmap_droot_key_addr(block, idx);
+	return bitize((int)((char *)kp - (char *)block));
+}
+
+static int
+rtrmaproot_ptr_count(
+	void			*obj,
+	int			startoff)
+{
+	struct xfs_rtrmap_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode	*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) == 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrmaproot_ptr_offset(
+	void			*obj,
+	int			startoff,
+	int			idx)
+{
+	struct xfs_rtrmap_root	*block;
+	xfs_rtrmap_ptr_t	*pp;
+	struct xfs_dinode	*dip;
+	int			dmxr;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	dip = obj;
+	block = (struct xfs_rtrmap_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+	dmxr = libxfs_rtrmapbt_droot_maxrecs(XFS_DFORK_DSIZE(dip, mp), false);
+	pp = xfs_rtrmap_droot_ptr_addr(block, idx, dmxr);
+	return bitize((int)((char *)pp - (char *)block));
+}
+
+int
+rtrmaproot_size(
+	void			*obj,
+	int			startoff,
+	int			idx)
+{
+	struct xfs_dinode	*dip;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	ASSERT(idx == 0);
+	dip = obj;
+	return bitize((int)XFS_DFORK_DSIZE(dip, mp));
+}
diff --git a/db/bmroot.h b/db/bmroot.h
index a1274cf6a94..a2c5cfb18f0 100644
--- a/db/bmroot.h
+++ b/db/bmroot.h
@@ -8,6 +8,8 @@ extern const struct field	bmroota_flds[];
 extern const struct field	bmroota_key_flds[];
 extern const struct field	bmrootd_flds[];
 extern const struct field	bmrootd_key_flds[];
+extern const struct field	rtrmaproot_flds[];
 
 extern int	bmroota_size(void *obj, int startoff, int idx);
 extern int	bmrootd_size(void *obj, int startoff, int idx);
+extern int	rtrmaproot_size(void *obj, int startoff, int idx);
diff --git a/db/btblock.c b/db/btblock.c
index d5be6adb734..5cad166278d 100644
--- a/db/btblock.c
+++ b/db/btblock.c
@@ -92,6 +92,12 @@ static struct xfs_db_btree {
 		sizeof(struct xfs_rmap_rec),
 		sizeof(__be32),
 	},
+	{	XFS_RTRMAP_CRC_MAGIC,
+		XFS_BTREE_LBLOCK_CRC_LEN,
+		2 * sizeof(struct xfs_rmap_key),
+		sizeof(struct xfs_rmap_rec),
+		sizeof(__be64),
+	},
 	{	XFS_REFC_CRC_MAGIC,
 		XFS_BTREE_SBLOCK_CRC_LEN,
 		sizeof(struct xfs_refcount_key),
@@ -813,6 +819,100 @@ const field_t	rmapbt_rec_flds[] = {
 	{ NULL }
 };
 
+/* realtime RMAP btree blocks */
+const field_t	rtrmapbt_crc_hfld[] = {
+	{ "", FLDT_RTRMAPBT_CRC, OI(0), C1, 0, TYP_NONE },
+	{ NULL }
+};
+
+#define	OFF(f)	bitize(offsetof(struct xfs_btree_block, bb_ ## f))
+const field_t	rtrmapbt_crc_flds[] = {
+	{ "magic", FLDT_UINT32X, OI(OFF(magic)), C1, 0, TYP_NONE },
+	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
+	{ "numrecs", FLDT_UINT16D, OI(OFF(numrecs)), C1, 0, TYP_NONE },
+	{ "leftsib", FLDT_DFSBNO, OI(OFF(u.l.bb_leftsib)), C1, 0, TYP_RTRMAPBT },
+	{ "rightsib", FLDT_DFSBNO, OI(OFF(u.l.bb_rightsib)), C1, 0, TYP_RTRMAPBT },
+	{ "bno", FLDT_DFSBNO, OI(OFF(u.l.bb_blkno)), C1, 0, TYP_RTRMAPBT },
+	{ "lsn", FLDT_UINT64X, OI(OFF(u.l.bb_lsn)), C1, 0, TYP_NONE },
+	{ "uuid", FLDT_UUID, OI(OFF(u.l.bb_uuid)), C1, 0, TYP_NONE },
+	{ "owner", FLDT_INO, OI(OFF(u.l.bb_owner)), C1, 0, TYP_NONE },
+	{ "crc", FLDT_CRC, OI(OFF(u.l.bb_crc)), C1, 0, TYP_NONE },
+	{ "recs", FLDT_RTRMAPBTREC, btblock_rec_offset, btblock_rec_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "keys", FLDT_RTRMAPBTKEY, btblock_key_offset, btblock_key_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "ptrs", FLDT_RTRMAPBTPTR, btblock_ptr_offset, btblock_key_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_RTRMAPBT },
+	{ NULL }
+};
+#undef OFF
+
+#define	KOFF(f)	bitize(offsetof(struct xfs_rmap_key, rm_ ## f))
+
+#define RTRMAPBK_STARTBLOCK_BITOFF	0
+#define RTRMAPBK_OWNER_BITOFF		(RTRMAPBK_STARTBLOCK_BITOFF + RMAPBT_STARTBLOCK_BITLEN)
+#define RTRMAPBK_ATTRFLAG_BITOFF	(RTRMAPBK_OWNER_BITOFF + RMAPBT_OWNER_BITLEN)
+#define RTRMAPBK_BMBTFLAG_BITOFF	(RTRMAPBK_ATTRFLAG_BITOFF + RMAPBT_ATTRFLAG_BITLEN)
+#define RTRMAPBK_EXNTFLAG_BITOFF	(RTRMAPBK_BMBTFLAG_BITOFF + RMAPBT_BMBTFLAG_BITLEN)
+#define RTRMAPBK_UNUSED_OFFSET_BITOFF	(RTRMAPBK_EXNTFLAG_BITOFF + RMAPBT_EXNTFLAG_BITLEN)
+#define RTRMAPBK_OFFSET_BITOFF		(RTRMAPBK_UNUSED_OFFSET_BITOFF + RMAPBT_UNUSED_OFFSET_BITLEN)
+
+#define HI_KOFF(f)	bitize(sizeof(struct xfs_rmap_key) + offsetof(struct xfs_rmap_key, rm_ ## f))
+
+#define RTRMAPBK_STARTBLOCKHI_BITOFF	(bitize(sizeof(struct xfs_rmap_key)))
+#define RTRMAPBK_OWNERHI_BITOFF		(RTRMAPBK_STARTBLOCKHI_BITOFF + RMAPBT_STARTBLOCK_BITLEN)
+#define RTRMAPBK_ATTRFLAGHI_BITOFF	(RTRMAPBK_OWNERHI_BITOFF + RMAPBT_OWNER_BITLEN)
+#define RTRMAPBK_BMBTFLAGHI_BITOFF	(RTRMAPBK_ATTRFLAGHI_BITOFF + RMAPBT_ATTRFLAG_BITLEN)
+#define RTRMAPBK_EXNTFLAGHI_BITOFF	(RTRMAPBK_BMBTFLAGHI_BITOFF + RMAPBT_BMBTFLAG_BITLEN)
+#define RTRMAPBK_UNUSED_OFFSETHI_BITOFF	(RTRMAPBK_EXNTFLAGHI_BITOFF + RMAPBT_EXNTFLAG_BITLEN)
+#define RTRMAPBK_OFFSETHI_BITOFF	(RTRMAPBK_UNUSED_OFFSETHI_BITOFF + RMAPBT_UNUSED_OFFSET_BITLEN)
+
+const field_t	rtrmapbt_key_flds[] = {
+	{ "startblock", FLDT_RGBLOCK, OI(KOFF(startblock)), C1, 0, TYP_DATA },
+	{ "owner", FLDT_INT64D, OI(KOFF(owner)), C1, 0, TYP_NONE },
+	{ "offset", FLDT_RFILEOFFD, OI(RTRMAPBK_OFFSET_BITOFF), C1, 0, TYP_NONE },
+	{ "attrfork", FLDT_RATTRFORKFLG, OI(RTRMAPBK_ATTRFLAG_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ "bmbtblock", FLDT_RBMBTFLG, OI(RTRMAPBK_BMBTFLAG_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ "startblock_hi", FLDT_RGBLOCK, OI(HI_KOFF(startblock)), C1, 0, TYP_DATA },
+	{ "owner_hi", FLDT_INT64D, OI(HI_KOFF(owner)), C1, 0, TYP_NONE },
+	{ "offset_hi", FLDT_RFILEOFFD, OI(RTRMAPBK_OFFSETHI_BITOFF), C1, 0, TYP_NONE },
+	{ "attrfork_hi", FLDT_RATTRFORKFLG, OI(RTRMAPBK_ATTRFLAGHI_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ "bmbtblock_hi", FLDT_RBMBTFLG, OI(RTRMAPBK_BMBTFLAGHI_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ NULL }
+};
+#undef HI_KOFF
+#undef KOFF
+
+#define	ROFF(f)	bitize(offsetof(struct xfs_rmap_rec, rm_ ## f))
+
+#define RTRMAPBT_STARTBLOCK_BITOFF	0
+#define RTRMAPBT_BLOCKCOUNT_BITOFF	(RTRMAPBT_STARTBLOCK_BITOFF + RMAPBT_STARTBLOCK_BITLEN)
+#define RTRMAPBT_OWNER_BITOFF		(RTRMAPBT_BLOCKCOUNT_BITOFF + RMAPBT_BLOCKCOUNT_BITLEN)
+#define RTRMAPBT_ATTRFLAG_BITOFF	(RTRMAPBT_OWNER_BITOFF + RMAPBT_OWNER_BITLEN)
+#define RTRMAPBT_BMBTFLAG_BITOFF	(RTRMAPBT_ATTRFLAG_BITOFF + RMAPBT_ATTRFLAG_BITLEN)
+#define RTRMAPBT_EXNTFLAG_BITOFF	(RTRMAPBT_BMBTFLAG_BITOFF + RMAPBT_BMBTFLAG_BITLEN)
+#define RTRMAPBT_UNUSED_OFFSET_BITOFF	(RTRMAPBT_EXNTFLAG_BITOFF + RMAPBT_EXNTFLAG_BITLEN)
+#define RTRMAPBT_OFFSET_BITOFF		(RTRMAPBT_UNUSED_OFFSET_BITOFF + RMAPBT_UNUSED_OFFSET_BITLEN)
+
+const field_t	rtrmapbt_rec_flds[] = {
+	{ "startblock", FLDT_RGBLOCK, OI(RTRMAPBT_STARTBLOCK_BITOFF), C1, 0, TYP_DATA },
+	{ "blockcount", FLDT_EXTLEN, OI(RTRMAPBT_BLOCKCOUNT_BITOFF), C1, 0, TYP_NONE },
+	{ "owner", FLDT_INT64D, OI(RTRMAPBT_OWNER_BITOFF), C1, 0, TYP_NONE },
+	{ "offset", FLDT_RFILEOFFD, OI(RTRMAPBT_OFFSET_BITOFF), C1, 0, TYP_NONE },
+	{ "extentflag", FLDT_REXTFLG, OI(RTRMAPBT_EXNTFLAG_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ "attrfork", FLDT_RATTRFORKFLG, OI(RTRMAPBT_ATTRFLAG_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ "bmbtblock", FLDT_RBMBTFLG, OI(RTRMAPBT_BMBTFLAG_BITOFF), C1, 0,
+	  TYP_NONE },
+	{ NULL }
+};
+#undef ROFF
+
 /* refcount btree blocks */
 const field_t	refcbt_crc_hfld[] = {
 	{ "", FLDT_REFCBT_CRC, OI(0), C1, 0, TYP_NONE },
diff --git a/db/btblock.h b/db/btblock.h
index 4168c9e2e15..b4013ea8073 100644
--- a/db/btblock.h
+++ b/db/btblock.h
@@ -53,6 +53,11 @@ extern const struct field	rmapbt_crc_hfld[];
 extern const struct field	rmapbt_key_flds[];
 extern const struct field	rmapbt_rec_flds[];
 
+extern const struct field	rtrmapbt_crc_flds[];
+extern const struct field	rtrmapbt_crc_hfld[];
+extern const struct field	rtrmapbt_key_flds[];
+extern const struct field	rtrmapbt_rec_flds[];
+
 extern const struct field	refcbt_crc_flds[];
 extern const struct field	refcbt_crc_hfld[];
 extern const struct field	refcbt_key_flds[];
diff --git a/db/field.c b/db/field.c
index 4a6a4cf51c3..b3efbb5698d 100644
--- a/db/field.c
+++ b/db/field.c
@@ -184,6 +184,17 @@ const ftattr_t	ftattrtab[] = {
 	{ FLDT_RMAPBTREC, "rmapbtrec", fp_sarray, (char *)rmapbt_rec_flds,
 	  SI(bitsz(struct xfs_rmap_rec)), 0, NULL, rmapbt_rec_flds },
 
+	{ FLDT_RTRMAPBT_CRC, "rtrmapbt", NULL, (char *)rtrmapbt_crc_flds, btblock_size,
+	  FTARG_SIZE, NULL, rtrmapbt_crc_flds },
+	{ FLDT_RTRMAPBTKEY, "rtrmapbtkey", fp_sarray, (char *)rtrmapbt_key_flds,
+	  SI(bitize(2 * sizeof(struct xfs_rmap_key))), 0, NULL, rtrmapbt_key_flds },
+	{ FLDT_RTRMAPBTPTR, "rtrmapbtptr", fp_num, "%llu",
+	  SI(bitsz(xfs_rtrmap_ptr_t)), 0, fa_dfsbno, NULL },
+	{ FLDT_RTRMAPBTREC, "rtrmapbtrec", fp_sarray, (char *)rtrmapbt_rec_flds,
+	  SI(bitsz(struct xfs_rmap_rec)), 0, NULL, rtrmapbt_rec_flds },
+	{ FLDT_RTRMAPROOT, "rtrmaproot", NULL, (char *)rtrmaproot_flds, rtrmaproot_size,
+	  FTARG_SIZE, NULL, rtrmaproot_flds },
+
 	{ FLDT_REFCBT_CRC, "refcntbt", NULL, (char *)refcbt_crc_flds, btblock_size,
 	  FTARG_SIZE, NULL, refcbt_crc_flds },
 	{ FLDT_REFCBTKEY, "refcntbtkey", fp_sarray, (char *)refcbt_key_flds,
diff --git a/db/field.h b/db/field.h
index e9c6142f282..db3e13d3927 100644
--- a/db/field.h
+++ b/db/field.h
@@ -83,6 +83,11 @@ typedef enum fldt	{
 	FLDT_RMAPBTKEY,
 	FLDT_RMAPBTPTR,
 	FLDT_RMAPBTREC,
+	FLDT_RTRMAPBT_CRC,
+	FLDT_RTRMAPBTKEY,
+	FLDT_RTRMAPBTPTR,
+	FLDT_RTRMAPBTREC,
+	FLDT_RTRMAPROOT,
 	FLDT_REFCBT_CRC,
 	FLDT_REFCBTKEY,
 	FLDT_REFCBTPTR,
diff --git a/db/inode.c b/db/inode.c
index 16033c5ab79..6867f5c5427 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -17,6 +17,7 @@
 #include "bit.h"
 #include "output.h"
 #include "init.h"
+#include "libfrog/bitmap.h"
 
 static int	inode_a_bmbt_count(void *obj, int startoff);
 static int	inode_a_bmx_count(void *obj, int startoff);
@@ -47,6 +48,7 @@ static int	inode_u_muuid_count(void *obj, int startoff);
 static int	inode_u_sfdir2_count(void *obj, int startoff);
 static int	inode_u_sfdir3_count(void *obj, int startoff);
 static int	inode_u_symlink_count(void *obj, int startoff);
+static int	inode_u_rtrmapbt_count(void *obj, int startoff);
 
 static const cmdinfo_t	inode_cmd =
 	{ "inode", NULL, inode_f, 0, 1, 1, "[inode#]",
@@ -230,6 +232,8 @@ const field_t	inode_u_flds[] = {
 	{ "sfdir3", FLDT_DIR3SF, NULL, inode_u_sfdir3_count, FLD_COUNT, TYP_NONE },
 	{ "symlink", FLDT_CHARNS, NULL, inode_u_symlink_count, FLD_COUNT,
 	  TYP_NONE },
+	{ "rtrmapbt", FLDT_RTRMAPROOT, NULL, inode_u_rtrmapbt_count, FLD_COUNT,
+	  TYP_NONE },
 	{ NULL }
 };
 
@@ -243,7 +247,7 @@ const field_t	inode_a_flds[] = {
 };
 
 static const char	*dinode_fmt_name[] =
-	{ "dev", "local", "extents", "btree", "uuid" };
+	{ "dev", "local", "extents", "btree", "uuid", "rmap" };
 static const int	dinode_fmt_name_size =
 	sizeof(dinode_fmt_name) / sizeof(dinode_fmt_name[0]);
 
@@ -633,9 +637,86 @@ inode_init(void)
 	add_command(&inode_cmd);
 }
 
+static struct bitmap	*rmap_inodes;
+
+static inline int
+set_rtgroup_rmap_inode(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_imeta_path	*path;
+	struct xfs_trans	*tp;
+	xfs_ino_t		rtino;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	error = -libxfs_imeta_lookup(tp, path, &rtino);
+	if (error)
+		goto out_trans;
+
+	if (rtino == NULLFSINO) {
+		error = EFSCORRUPTED;
+		goto out_trans;
+	}
+
+	error = bitmap_set(rmap_inodes, rtino, 1);
+
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
+int
+init_rtmeta_inode_bitmaps(
+	struct xfs_mount	*mp)
+{
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	if (rmap_inodes)
+		return 0;
+
+	error = bitmap_alloc(&rmap_inodes);
+	if (error)
+		return error;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		int err2 = set_rtgroup_rmap_inode(mp, rgno);
+		if (err2 && !error)
+			error = err2;
+	}
+
+	return error;
+}
+
+bool is_rtrmap_inode(xfs_ino_t ino)
+{
+	return bitmap_test(rmap_inodes, ino, 1);
+}
+
 typnm_t
 inode_next_type(void)
 {
+	int		error;
+
+	error = init_rtmeta_inode_bitmaps(mp);
+	if (error) {
+		dbprintf(_("error %d setting up rt metadata inode bitmaps\n"),
+				error);
+	}
+
 	switch (iocur_top->mode & S_IFMT) {
 	case S_IFDIR:
 		return TYP_DIR2;
@@ -655,8 +736,9 @@ inode_next_type(void)
 			 iocur_top->ino == mp->m_sb.sb_gquotino ||
 			 iocur_top->ino == mp->m_sb.sb_pquotino)
 			return TYP_DQBLK;
-		else
-			return TYP_DATA;
+		else if (is_rtrmap_inode(iocur_top->ino))
+			return TYP_RTRMAPBT;
+		return TYP_DATA;
 	default:
 		return TYP_NONE;
 	}
@@ -790,6 +872,20 @@ inode_u_sfdir3_count(
 	       xfs_has_ftype(mp);
 }
 
+static int
+inode_u_rtrmapbt_count(
+	void			*obj,
+	int			startoff)
+{
+	struct xfs_dinode	*dip;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	dip = obj;
+	ASSERT((char *)XFS_DFORK_DPTR(dip) - (char *)dip == byteize(startoff));
+	return dip->di_format == XFS_DINODE_FMT_RMAP;
+}
+
 int
 inode_u_size(
 	void			*obj,
diff --git a/db/inode.h b/db/inode.h
index 31a2ebbba6a..a47b0575a15 100644
--- a/db/inode.h
+++ b/db/inode.h
@@ -23,3 +23,6 @@ extern int	inode_size(void *obj, int startoff, int idx);
 extern int	inode_u_size(void *obj, int startoff, int idx);
 extern void	xfs_inode_set_crc(struct xfs_buf *);
 extern void	set_cur_inode(xfs_ino_t ino);
+
+int init_rtmeta_inode_bitmaps(struct xfs_mount *mp);
+bool is_rtrmap_inode(xfs_ino_t ino);
diff --git a/db/type.c b/db/type.c
index 2091b4ac8b1..1dfc33ffb44 100644
--- a/db/type.c
+++ b/db/type.c
@@ -51,6 +51,7 @@ static const typ_t	__typtab[] = {
 	{ TYP_BNOBT, "bnobt", handle_struct, bnobt_hfld, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_CNTBT, "cntbt", handle_struct, cntbt_hfld, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RMAPBT, NULL },
+	{ TYP_RTRMAPBT, NULL },
 	{ TYP_REFCBT, NULL },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_DIR2, "dir2", handle_struct, dir2_hfld, NULL, TYP_F_NO_CRC_OFF },
@@ -91,6 +92,8 @@ static const typ_t	__typtab_crc[] = {
 		&xfs_cntbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_RMAPBT, "rmapbt", handle_struct, rmapbt_crc_hfld,
 		&xfs_rmapbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RTRMAPBT, "rtrmapbt", handle_struct, rtrmapbt_crc_hfld,
+		&xfs_rtrmapbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_REFCBT, "refcntbt", handle_struct, refcbt_crc_hfld,
 		&xfs_refcountbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
@@ -141,6 +144,8 @@ static const typ_t	__typtab_spcrc[] = {
 		&xfs_cntbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_RMAPBT, "rmapbt", handle_struct, rmapbt_crc_hfld,
 		&xfs_rmapbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RTRMAPBT, "rtrmapbt", handle_struct, rtrmapbt_crc_hfld,
+		&xfs_rtrmapbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_REFCBT, "refcntbt", handle_struct, refcbt_crc_hfld,
 		&xfs_refcountbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
diff --git a/db/type.h b/db/type.h
index e7f0ecc1768..c98f3640202 100644
--- a/db/type.h
+++ b/db/type.h
@@ -20,6 +20,7 @@ typedef enum typnm
 	TYP_BNOBT,
 	TYP_CNTBT,
 	TYP_RMAPBT,
+	TYP_RTRMAPBT,
 	TYP_REFCBT,
 	TYP_DATA,
 	TYP_DIR2,
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index c5dad34f3d2..e961453052a 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -286,6 +286,10 @@
 #define xfs_rtfree_extent		libxfs_rtfree_extent
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
+#define xfs_rtrmapbt_create_path	libxfs_rtrmapbt_create_path
+#define xfs_rtrmapbt_droot_maxrecs	libxfs_rtrmapbt_droot_maxrecs
+#define xfs_rtrmapbt_maxrecs		libxfs_rtrmapbt_maxrecs
+
 #define xfs_sb_from_disk		libxfs_sb_from_disk
 #define xfs_sb_quota_from_disk		libxfs_sb_quota_from_disk
 #define xfs_sb_read_secondary		libxfs_sb_read_secondary
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index d0115075888..0e20108fb51 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1200,7 +1200,7 @@ The possible data types are:
 .BR agf ", " agfl ", " agi ", " attr ", " bmapbta ", " bmapbtd ,
 .BR bnobt ", " cntbt ", " data ", " dir ", " dir2 ", " dqblk ,
 .BR inobt ", " inode ", " log ", " refcntbt ", " rmapbt ", " rtbitmap ,
-.BR rtsummary ", " sb ", " symlink " and " text .
+.BR rtsummary ", " sb ", " symlink ", " rtrmapbt ", and " text .
 See the TYPES section below for more information on these data types.
 .TP
 .BI "timelimit [" OPTIONS ]
@@ -2348,6 +2348,64 @@ block number within the allocation group to the next level in the Btree.
 .PD
 .RE
 .TP
+.B rtrmapbt
+There is one reverse mapping Btree for each realtime group.
+The
+.BR startblock " and "
+.B blockcount
+fields are 32 bits wide and record blocks within a realtime group.
+The root of this Btree is the reverse-mapping inode, which is recorded in the
+metadata directory.
+Blocks are linked to sibling left and right blocks at each level, as well as by
+pointers from parent to child blocks.
+Each block has the following fields:
+.RS 1.4i
+.PD 0
+.TP 1.2i
+.B magic
+RTRMAP block magic number, 0x4d415052 ('MAPR').
+.TP
+.B level
+level number of this block, 0 is a leaf.
+.TP
+.B numrecs
+number of data entries in the block.
+.TP
+.B leftsib
+left (logically lower) sibling block, 0 if none.
+.TP
+.B rightsib
+right (logically higher) sibling block, 0 if none.
+.TP
+.B recs
+[leaf blocks only] array of reference count records. Each record contains
+.BR startblock ,
+.BR blockcount ,
+.BR owner ,
+.BR offset ,
+.BR attr_fork ,
+.BR bmbt_block ,
+and
+.BR unwritten .
+.TP
+.B keys
+[non-leaf blocks only] array of double-key records. The first ("low") key
+contains the first value of each block in the level below this one. The second
+("high") key contains the largest key that can be used to identify any record
+in the subtree. Each record contains
+.BR startblock ,
+.BR owner ,
+.BR offset ,
+.BR attr_fork ,
+and
+.BR bmbt_block .
+.TP
+.B ptrs
+[non-leaf blocks only] array of child block pointers. Each pointer is a
+block number within the allocation group to the next level in the Btree.
+.PD
+.RE
+.TP
 .B rtbitmap
 If the filesystem has a realtime subvolume, then the
 .B rbmino


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/47] xfs_db: support the realtime rmapbt
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-27 13:16   ` [PATCH 22/47] xfs_db: display the realtime rmap btree contents Darrick J. Wong
@ 2023-12-27 13:16   ` Darrick J. Wong
  2023-12-27 13:16   ` [PATCH 24/47] xfs_db: support rudimentary checks of the rtrmap btree Darrick J. Wong
                     ` (23 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:16 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up various parts of xfs_db for realtime rmap support.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/btblock.c             |    3 ++
 db/btdump.c              |   64 ++++++++++++++++++++++++++++++++++++++++++++++
 db/btheight.c            |    5 ++++
 libxfs/libxfs_api_defs.h |    1 +
 man/man8/xfs_db.8        |    3 +-
 5 files changed, 75 insertions(+), 1 deletion(-)


diff --git a/db/btblock.c b/db/btblock.c
index 5cad166278d..70f6c3f6aed 100644
--- a/db/btblock.c
+++ b/db/btblock.c
@@ -147,6 +147,9 @@ block_to_bt(
 	case TYP_RMAPBT:
 		magic = crc ? XFS_RMAP_CRC_MAGIC : 0;
 		break;
+	case TYP_RTRMAPBT:
+		magic = crc ? XFS_RTRMAP_CRC_MAGIC : 0;
+		break;
 	case TYP_REFCBT:
 		magic = crc ? XFS_REFC_CRC_MAGIC : 0;
 		break;
diff --git a/db/btdump.c b/db/btdump.c
index 81642cde2b6..9c528e5a11a 100644
--- a/db/btdump.c
+++ b/db/btdump.c
@@ -441,6 +441,67 @@ dump_dabtree(
 	return ret;
 }
 
+static bool
+is_btree_inode(void)
+{
+	struct xfs_dinode	*dip;
+
+	dip = iocur_top->data;
+	return dip->di_format == XFS_DINODE_FMT_RMAP;
+}
+
+static int
+dump_btree_inode(
+	bool			dump_node_blocks)
+{
+	char			*prefix;
+	struct xfs_dinode	*dip;
+	struct xfs_rtrmap_root	*rtrmap;
+	int			level;
+	int			numrecs;
+	int			ret;
+
+	dip = iocur_top->data;
+	switch (dip->di_format) {
+	case XFS_DINODE_FMT_RMAP:
+		prefix = "u3.rtrmapbt";
+		rtrmap = (struct xfs_rtrmap_root *)XFS_DFORK_DPTR(dip);
+		level = be16_to_cpu(rtrmap->bb_level);
+		numrecs = be16_to_cpu(rtrmap->bb_numrecs);
+		break;
+	default:
+		dbprintf("Unknown metadata inode type %u\n", dip->di_format);
+		return 0;
+	}
+
+	if (numrecs == 0)
+		return 0;
+	if (level > 0) {
+		if (dump_node_blocks) {
+			ret = eval("print %s.keys", prefix);
+			if (ret)
+				goto err;
+			ret = eval("print %s.ptrs", prefix);
+			if (ret)
+				goto err;
+		}
+		ret = eval("addr %s.ptrs[1]", prefix);
+		if (ret)
+			goto err;
+		ret = dump_btree_long(dump_node_blocks);
+	} else {
+		ret = eval("print %s.recs", prefix);
+	}
+	if (ret)
+		goto err;
+
+	ret = eval("pop");
+	return ret;
+err:
+	eval("pop");
+	return ret;
+}
+
 static int
 btdump_f(
 	int		argc,
@@ -488,8 +549,11 @@ btdump_f(
 		return dump_btree_short(iflag);
 	case TYP_BMAPBTA:
 	case TYP_BMAPBTD:
+	case TYP_RTRMAPBT:
 		return dump_btree_long(iflag);
 	case TYP_INODE:
+		if (is_btree_inode())
+			return dump_btree_inode(iflag);
 		return dump_inode(iflag, aflag);
 	case TYP_ATTR:
 		return dump_dabtree(iflag, crc ? &attr3_print : &attr_print);
diff --git a/db/btheight.c b/db/btheight.c
index 6643489c82c..25ce3400334 100644
--- a/db/btheight.c
+++ b/db/btheight.c
@@ -53,6 +53,11 @@ struct btmap {
 		.maxlevels	= libxfs_rmapbt_maxlevels_ondisk,
 		.maxrecs	= libxfs_rmapbt_maxrecs,
 	},
+	{
+		.tag		= "rtrmapbt",
+		.maxlevels	= libxfs_rtrmapbt_maxlevels_ondisk,
+		.maxrecs	= libxfs_rtrmapbt_maxrecs,
+	},
 };
 
 static void
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index e961453052a..4b2fbd7cac9 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -288,6 +288,7 @@
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
 #define xfs_rtrmapbt_create_path	libxfs_rtrmapbt_create_path
 #define xfs_rtrmapbt_droot_maxrecs	libxfs_rtrmapbt_droot_maxrecs
+#define xfs_rtrmapbt_maxlevels_ondisk	libxfs_rtrmapbt_maxlevels_ondisk
 #define xfs_rtrmapbt_maxrecs		libxfs_rtrmapbt_maxrecs
 
 #define xfs_sb_from_disk		libxfs_sb_from_disk
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 0e20108fb51..778e3f9dd70 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -454,8 +454,9 @@ The supported btree types are:
 .IR finobt ,
 .IR bmapbt ,
 .IR refcountbt ,
+.IR rmapbt ,
 and
-.IR rmapbt .
+.IR rtrmapbt .
 The magic value
 .I all
 can be used to walk through all btree types.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/47] xfs_db: support rudimentary checks of the rtrmap btree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-27 13:16   ` [PATCH 23/47] xfs_db: support the realtime rmapbt Darrick J. Wong
@ 2023-12-27 13:16   ` Darrick J. Wong
  2023-12-27 13:17   ` [PATCH 25/47] xfs_db: copy the realtime rmap btree Darrick J. Wong
                     ` (22 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:16 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Perform some fairly superficial checks of the rtrmap btree.  We'll
do more sophisticated checks in xfs_repair, but provide enough of
a spot-check here that we can do simple things.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |  203 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 db/inode.c |   28 ++++++++
 db/inode.h |    1 
 3 files changed, 226 insertions(+), 6 deletions(-)


diff --git a/db/check.c b/db/check.c
index d1c86206c08..351bb94a48e 100644
--- a/db/check.c
+++ b/db/check.c
@@ -20,6 +20,7 @@
 #include "init.h"
 #include "malloc.h"
 #include "dir2.h"
+#include "inode.h"
 
 typedef enum {
 	IS_USER_QUOTA, IS_PROJECT_QUOTA, IS_GROUP_QUOTA,
@@ -57,6 +58,7 @@ typedef enum {
 	DBM_RLDATA,
 	DBM_COWDATA,
 	DBM_RTSB,
+	DBM_BTRTRMAP,
 	DBM_NDBM
 } dbm_t;
 
@@ -71,6 +73,7 @@ typedef struct inodata {
 	xfs_ino_t	ino;
 	struct inodata	*parent;
 	char		*name;
+	xfs_rgnumber_t	rgno;	/* only if rtgroup metadata inode */
 } inodata_t;
 #define	MIN_INODATA_HASH_SIZE	256
 #define	MAX_INODATA_HASH_SIZE	65536
@@ -189,6 +192,7 @@ static const char	*typename[] = {
 	"rldata",
 	"cowdata",
 	"rtsb",
+	"btrtrmap",
 	NULL
 };
 
@@ -341,6 +345,9 @@ static void		process_rtbitmap(blkmap_t *blkmap);
 static void		process_rtsummary(blkmap_t *blkmap);
 static xfs_ino_t	process_sf_dir_v2(struct xfs_dinode *dip, int *dot,
 					  int *dotdot, inodata_t *id);
+static void		process_rtrmap(struct inodata *id,
+				       struct xfs_dinode *dip,
+				       xfs_rfsblock_t *toti);
 static void		quota_add(xfs_dqid_t *p, xfs_dqid_t *g, xfs_dqid_t *u,
 				  int dq, xfs_qcnt_t bc, xfs_qcnt_t ic,
 				  xfs_qcnt_t rc);
@@ -366,6 +373,12 @@ static void		scanfunc_bmap(struct xfs_btree_block *block,
 				      xfs_rfsblock_t *toti, xfs_extnum_t *nex,
 				      blkmap_t **blkmapp, int isroot,
 				      typnm_t btype);
+static void		scanfunc_rtrmap(struct xfs_btree_block *block,
+				      int level, dbm_t type, xfs_fsblock_t bno,
+				      inodata_t *id, xfs_rfsblock_t *totd,
+				      xfs_rfsblock_t *toti, xfs_extnum_t *nex,
+				      blkmap_t **blkmapp, int isroot,
+				      typnm_t btype);
 static void		scanfunc_bno(struct xfs_btree_block *block, int level,
 				     xfs_agf_t *agf, xfs_agblock_t bno,
 				     int isroot);
@@ -839,12 +852,21 @@ blockget_f(
 	xfs_agnumber_t	agno;
 	int		oldprefix;
 	int		sbyell;
+	int		error;
 
 	if (dbmap) {
 		dbprintf(_("already have block usage information\n"));
 		return 0;
 	}
 
+	error = init_rtmeta_inode_bitmaps(mp);
+	if (error) {
+		dbprintf(_("error %d setting up rt metadata inode bitmaps\n"),
+				error);
+		exitcode = 3;
+		return 0;
+	}
+
 	if (!init(argc, argv)) {
 		if (serious_error)
 			exitcode = 3;
@@ -1101,6 +1123,7 @@ blocktrash_f(
 		   (1 << DBM_QUOTA) |
 		   (1 << DBM_RTBITMAP) |
 		   (1 << DBM_RTSUM) |
+		   (1 << DBM_BTRTRMAP) |
 		   (1 << DBM_SYMLINK) |
 		   (1 << DBM_BTFINO) |
 		   (1 << DBM_BTRMAP) |
@@ -2850,7 +2873,7 @@ process_inode(
 		0				/* type 15 unused */
 	};
 	static char		*fmtnames[] = {
-		"dev", "local", "extents", "btree", "uuid"
+		"dev", "local", "extents", "btree", "uuid", "rmap"
 	};
 
 	ino = XFS_AGINO_TO_INO(mp, be32_to_cpu(agf->agf_seqno), agino);
@@ -2916,11 +2939,20 @@ process_inode(
 				be32_to_cpu(dip->di_next_unlinked), ino);
 		error++;
 	}
-	/*
-	 * di_mode is a 16-bit uint so no need to check the < 0 case
-	 */
+
+	/* Check that mode and data fork format match. */
 	mode = be16_to_cpu(dip->di_mode);
-	if ((((mode & S_IFMT) >> 12) > 15) ||
+	if (is_rtrmap_inode(ino)) {
+		if (!S_ISREG(mode) || dip->di_format != XFS_DINODE_FMT_RMAP) {
+			if (v)
+				dbprintf(
+			_("bad format %d for rtrmap inode %lld type %#o\n"),
+					dip->di_format, (long long)ino,
+					mode & S_IFMT);
+			error++;
+			return;
+		}
+	} else if ((((mode & S_IFMT) >> 12) > 15) ||
 	    (!(okfmts[(mode & S_IFMT) >> 12] & (1 << dip->di_format)))) {
 		if (v)
 			dbprintf(_("bad format %d for inode %lld type %#o\n"),
@@ -2993,6 +3025,9 @@ process_inode(
 			blkmap = blkmap_alloc(dnextents);
 			if (!xfs_has_metadir(mp))
 				addlink_inode(id);
+		} else if (is_rtrmap_inode(id->ino)) {
+			type = DBM_BTRTRMAP;
+			blkmap = blkmap_alloc(be32_to_cpu(dip->di_nextents));
 		}
 		else
 			type = DBM_DATA;
@@ -3024,6 +3059,10 @@ process_inode(
 		process_btinode(id, dip, type, &totdblocks, &totiblocks,
 			&nextents, &blkmap, XFS_DATA_FORK);
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		id->rgno = rtgroup_for_rtrmap_ino(mp, id->ino);
+		process_rtrmap(id, dip, &totiblocks);
+		break;
 	}
 	if (dip->di_forkoff) {
 		sbversion |= XFS_SB_VERSION_ATTRBIT;
@@ -3049,6 +3088,7 @@ process_inode(
 		case DBM_RTBITMAP:
 		case DBM_RTSUM:
 		case DBM_SYMLINK:
+		case DBM_BTRTRMAP:
 		case DBM_UNKNOWN:
 			bc = totdblocks + totiblocks +
 			     atotdblocks + atotiblocks;
@@ -3819,6 +3859,79 @@ process_rtsummary(
 	}
 }
 
+static void
+process_rtrmap(
+	struct inodata		*id,
+	struct xfs_dinode	*dip,
+	xfs_rfsblock_t		*toti)
+{
+	xfs_extnum_t		nex = 0;
+	xfs_rfsblock_t		totd = 0;
+	struct xfs_rtrmap_root	*dib;
+	int			whichfork = XFS_DATA_FORK;
+	int			i;
+	int			maxrecs;
+	xfs_rtrmap_ptr_t	*pp;
+
+	if (id->rgno == NULLRGNUMBER) {
+		dbprintf(
+	_("rt group for rmap ino %lld not found\n"),
+				id->ino);
+		error++;
+		return;
+	}
+
+	dib = (struct xfs_rtrmap_root *)XFS_DFORK_PTR(dip, whichfork);
+	if (be16_to_cpu(dib->bb_level) >= mp->m_rtrmap_maxlevels) {
+		if (!sflag || id->ilist)
+			dbprintf(_("level for ino %lld rtrmap root too "
+				 "large (%u)\n"),
+				id->ino,
+				be16_to_cpu(dib->bb_level));
+		error++;
+		return;
+	}
+	maxrecs = libxfs_rtrmapbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, whichfork),
+			dib->bb_level == 0);
+	if (be16_to_cpu(dib->bb_numrecs) > maxrecs) {
+		if (!sflag || id->ilist)
+			dbprintf(_("numrecs for ino %lld rtrmap root too "
+				 "large (%u)\n"),
+				id->ino,
+				be16_to_cpu(dib->bb_numrecs));
+		error++;
+		return;
+	}
+	if (be16_to_cpu(dib->bb_level) == 0) {
+		struct xfs_rmap_rec	*rp;
+		xfs_fsblock_t		lastblock;
+
+		rp = xfs_rtrmap_droot_rec_addr(dib, 1);
+		lastblock = 0;
+		for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++) {
+			if (be32_to_cpu(rp[i].rm_startblock) < lastblock) {
+				dbprintf(_(
+		"out-of-order rtrmap btree record %d (%u %u) root\n"),
+					 i, be32_to_cpu(rp[i].rm_startblock),
+					 be32_to_cpu(rp[i].rm_startblock));
+			} else {
+				lastblock = be32_to_cpu(rp[i].rm_startblock) +
+					    be32_to_cpu(rp[i].rm_blockcount);
+			}
+		}
+		return;
+	} else {
+		pp = xfs_rtrmap_droot_ptr_addr(dib, 1, maxrecs);
+		for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++)
+			scan_lbtree(get_unaligned_be64(&pp[i]),
+					be16_to_cpu(dib->bb_level),
+					scanfunc_rtrmap, DBM_BTRTRMAP,
+					id, &totd, toti,
+					&nex, NULL, 1, TYP_RTRMAPBT);
+	}
+}
+
 static xfs_ino_t
 process_sf_dir_v2(
 	struct xfs_dinode	*dip,
@@ -4917,6 +5030,86 @@ scanfunc_rmap(
 				TYP_RMAPBT);
 }
 
+static void
+scanfunc_rtrmap(
+	struct xfs_btree_block	*block,
+	int			level,
+	dbm_t			type,
+	xfs_fsblock_t		bno,
+	inodata_t		*id,
+	xfs_rfsblock_t		*totd,
+	xfs_rfsblock_t		*toti,
+	xfs_extnum_t		*nex,
+	blkmap_t		**blkmapp,
+	int			isroot,
+	typnm_t			btype)
+{
+	xfs_agblock_t		agbno;
+	xfs_agnumber_t		agno;
+	int			i;
+	xfs_rtrmap_ptr_t	*pp;
+	struct xfs_rmap_rec	*rp;
+	xfs_fsblock_t		lastblock;
+
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	if (be32_to_cpu(block->bb_magic) != XFS_RTRMAP_CRC_MAGIC) {
+		dbprintf(_("bad magic # %#x in rtrmapbt block %u/%u\n"),
+			be32_to_cpu(block->bb_magic), agno, bno);
+		serious_error++;
+		return;
+	}
+	if (be16_to_cpu(block->bb_level) != level) {
+		if (!sflag)
+			dbprintf(_("expected level %d got %d in rtrmapbt block "
+				 "%u/%u\n"),
+				level, be16_to_cpu(block->bb_level), agno, bno);
+		error++;
+	}
+	set_dbmap(agno, agbno, 1, type, agno, agbno);
+	set_inomap(agno, agbno, 1, id);
+	(*toti)++;
+	if (level == 0) {
+		if (be16_to_cpu(block->bb_numrecs) > mp->m_rtrmap_mxr[0] ||
+		    (isroot == 0 && be16_to_cpu(block->bb_numrecs) < mp->m_rtrmap_mnr[0])) {
+			dbprintf(_("bad btree nrecs (%u, min=%u, max=%u) in "
+				 "rtrmapbt block %u/%u\n"),
+				be16_to_cpu(block->bb_numrecs), mp->m_rtrmap_mnr[0],
+				mp->m_rtrmap_mxr[0], agno, bno);
+			serious_error++;
+			return;
+		}
+		rp = xfs_rtrmap_rec_addr(block, 1);
+		lastblock = 0;
+		for (i = 0; i < be16_to_cpu(block->bb_numrecs); i++) {
+			if (be32_to_cpu(rp[i].rm_startblock) < lastblock) {
+				dbprintf(_(
+		"out-of-order rtrmap btree record %d (%u %u) block %u/%u l %llu\n"),
+					 i, be32_to_cpu(rp[i].rm_startblock),
+					 be32_to_cpu(rp[i].rm_blockcount),
+					 agno, bno, lastblock);
+			} else {
+				lastblock = be32_to_cpu(rp[i].rm_startblock) +
+					    be32_to_cpu(rp[i].rm_blockcount);
+			}
+		}
+		return;
+	}
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_rtrmap_mxr[1] ||
+	    (isroot == 0 && be16_to_cpu(block->bb_numrecs) < mp->m_rtrmap_mnr[1])) {
+		dbprintf(_("bad btree nrecs (%u, min=%u, max=%u) in rtrmapbt "
+			 "block %u/%u\n"),
+			be16_to_cpu(block->bb_numrecs), mp->m_rtrmap_mnr[1],
+			mp->m_rtrmap_mxr[1], agno, bno);
+		serious_error++;
+		return;
+	}
+	pp = xfs_rtrmap_ptr_addr(block, 1, mp->m_rtrmap_mxr[1]);
+	for (i = 0; i < be16_to_cpu(block->bb_numrecs); i++)
+		scan_lbtree(be64_to_cpu(pp[i]), level, scanfunc_rtrmap, type, id,
+					totd, toti, nex, blkmapp, 0, btype);
+}
+
 static void
 scanfunc_refcnt(
 	struct xfs_btree_block	*block,
diff --git a/db/inode.c b/db/inode.c
index 6867f5c5427..492a8f53ed0 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -637,7 +637,12 @@ inode_init(void)
 	add_command(&inode_cmd);
 }
 
-static struct bitmap	*rmap_inodes;
+struct rtgroup_inodes {
+	xfs_ino_t		rmap_ino;
+};
+
+static struct rtgroup_inodes	*rtgroup_inodes;
+static struct bitmap		*rmap_inodes;
 
 static inline int
 set_rtgroup_rmap_inode(
@@ -670,6 +675,10 @@ set_rtgroup_rmap_inode(
 	}
 
 	error = bitmap_set(rmap_inodes, rtino, 1);
+	if (error)
+		goto out_trans;
+
+	rtgroup_inodes[rgno].rmap_ino = rtino;
 
 out_trans:
 	libxfs_trans_cancel(tp);
@@ -688,6 +697,11 @@ init_rtmeta_inode_bitmaps(
 	if (rmap_inodes)
 		return 0;
 
+	rtgroup_inodes = calloc(mp->m_sb.sb_rgcount,
+			sizeof(struct rtgroup_inodes));
+	if (!rtgroup_inodes)
+		return ENOMEM;
+
 	error = bitmap_alloc(&rmap_inodes);
 	if (error)
 		return error;
@@ -706,6 +720,18 @@ bool is_rtrmap_inode(xfs_ino_t ino)
 	return bitmap_test(rmap_inodes, ino, 1);
 }
 
+xfs_rgnumber_t rtgroup_for_rtrmap_ino(struct xfs_mount *mp, xfs_ino_t ino)
+{
+	unsigned int i;
+
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++) {
+		if (rtgroup_inodes[i].rmap_ino == ino)
+			return i;
+	}
+
+	return NULLRGNUMBER;
+}
+
 typnm_t
 inode_next_type(void)
 {
diff --git a/db/inode.h b/db/inode.h
index a47b0575a15..04e606abed3 100644
--- a/db/inode.h
+++ b/db/inode.h
@@ -26,3 +26,4 @@ extern void	set_cur_inode(xfs_ino_t ino);
 
 int init_rtmeta_inode_bitmaps(struct xfs_mount *mp);
 bool is_rtrmap_inode(xfs_ino_t ino);
+xfs_rgnumber_t rtgroup_for_rtrmap_ino(struct xfs_mount *mp, xfs_ino_t ino);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/47] xfs_db: copy the realtime rmap btree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-27 13:16   ` [PATCH 24/47] xfs_db: support rudimentary checks of the rtrmap btree Darrick J. Wong
@ 2023-12-27 13:17   ` Darrick J. Wong
  2023-12-27 13:17   ` [PATCH 26/47] xfs_db: make fsmap query the realtime reverse mapping tree Darrick J. Wong
                     ` (21 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:17 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Copy the realtime rmapbt when we're metadumping the filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/metadump.c |  129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)


diff --git a/db/metadump.c b/db/metadump.c
index ccf7b89ccd5..c6f6bf1ea17 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -589,6 +589,55 @@ copy_rmap_btree(
 	return scan_btree(agno, root, levels, TYP_RMAPBT, agf, scanfunc_rmapbt);
 }
 
+static int
+scanfunc_rtrmapbt(
+	struct xfs_btree_block	*block,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	int			level,
+	typnm_t			btype,
+	void			*arg)
+{
+	xfs_rtrmap_ptr_t	*pp;
+	int			i;
+	int			numrecs;
+
+	if (level == 0)
+		return 1;
+
+	numrecs = be16_to_cpu(block->bb_numrecs);
+	if (numrecs > mp->m_rtrmap_mxr[1]) {
+		if (metadump.show_warnings)
+			print_warning("invalid numrecs (%u) in %s block %u/%u",
+				numrecs, typtab[btype].name, agno, agbno);
+		return 1;
+	}
+
+	pp = xfs_rtrmap_ptr_addr(block, 1, mp->m_rtrmap_mxr[1]);
+	for (i = 0; i < numrecs; i++) {
+		xfs_agnumber_t	pagno;
+		xfs_agblock_t	pbno;
+
+		pagno = XFS_FSB_TO_AGNO(mp, get_unaligned_be64(&pp[i]));
+		pbno = XFS_FSB_TO_AGBNO(mp, get_unaligned_be64(&pp[i]));
+
+		if (pbno == 0 || pbno > mp->m_sb.sb_agblocks ||
+		    pagno > mp->m_sb.sb_agcount) {
+			if (metadump.show_warnings)
+				print_warning("invalid block number (%u/%u) "
+						"in inode %llu %s block %u/%u",
+						pagno, pbno,
+						(unsigned long long)metadump.cur_ino,
+						typtab[btype].name, agno, agbno);
+			continue;
+		}
+		if (!scan_btree(pagno, pbno, level, btype, arg,
+				scanfunc_rtrmapbt))
+			return 0;
+	}
+	return 1;
+}
+
 static int
 scanfunc_refcntbt(
 	struct xfs_btree_block	*block,
@@ -2319,6 +2368,83 @@ process_exinode(
 					whichfork), nex, itype, is_meta);
 }
 
+static int
+process_rtrmap(
+	struct xfs_dinode	*dip,
+	typnm_t			itype)
+{
+	struct xfs_rtrmap_root	*dib;
+	int			i;
+	xfs_rtrmap_ptr_t	*pp;
+	int			level;
+	int			nrecs;
+	int			maxrecs;
+	int			whichfork;
+	typnm_t			btype;
+
+	if (itype == TYP_ATTR && metadump.show_warnings) {
+		print_warning("ignoring rtrmapbt root in inode %llu attr fork",
+				(unsigned long long)metadump.cur_ino);
+		return 1;
+	}
+
+	whichfork = XFS_DATA_FORK;
+	btype = TYP_RTRMAPBT;
+
+	dib = (struct xfs_rtrmap_root *)XFS_DFORK_PTR(dip, whichfork);
+	level = be16_to_cpu(dib->bb_level);
+	nrecs = be16_to_cpu(dib->bb_numrecs);
+
+	if (level > mp->m_rtrmap_maxlevels) {
+		if (metadump.show_warnings)
+			print_warning("invalid level (%u) in inode %lld %s "
+					"root", level,
+					(unsigned long long)metadump.cur_ino,
+					typtab[btype].name);
+		return 1;
+	}
+
+	if (level == 0)
+		return 1;
+
+	maxrecs = libxfs_rtrmapbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, whichfork),
+			false);
+	if (nrecs > maxrecs) {
+		if (metadump.show_warnings)
+			print_warning("invalid numrecs (%u) in inode %lld %s "
+					"root", nrecs,
+					(unsigned long long)metadump.cur_ino,
+					typtab[btype].name);
+		return 1;
+	}
+
+	pp = xfs_rtrmap_droot_ptr_addr(dib, 1, maxrecs);
+	for (i = 0; i < nrecs; i++) {
+		xfs_agnumber_t	ag;
+		xfs_agblock_t	bno;
+
+		ag = XFS_FSB_TO_AGNO(mp, get_unaligned_be64(&pp[i]));
+		bno = XFS_FSB_TO_AGBNO(mp, get_unaligned_be64(&pp[i]));
+
+		if (bno == 0 || bno > mp->m_sb.sb_agblocks ||
+				ag > mp->m_sb.sb_agcount) {
+			if (metadump.show_warnings)
+				print_warning("invalid block number (%u/%u) "
+						"in inode %llu %s root", ag,
+						bno,
+						(unsigned long long)metadump.cur_ino,
+						typtab[btype].name);
+			continue;
+		}
+
+		if (!scan_btree(ag, bno, level, btype, &itype,
+				scanfunc_rtrmapbt))
+			return 0;
+	}
+	return 1;
+}
+
 static int
 process_inode_data(
 	struct xfs_dinode	*dip,
@@ -2363,6 +2489,9 @@ process_inode_data(
 
 		case XFS_DINODE_FMT_BTREE:
 			return process_btinode(dip, itype);
+
+		case XFS_DINODE_FMT_RMAP:
+			return process_rtrmap(dip, itype);
 	}
 	return 1;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/47] xfs_db: make fsmap query the realtime reverse mapping tree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-27 13:17   ` [PATCH 25/47] xfs_db: copy the realtime rmap btree Darrick J. Wong
@ 2023-12-27 13:17   ` Darrick J. Wong
  2023-12-27 13:17   ` [PATCH 27/47] xfs_io: support scrubbing rtgroup metadata paths Darrick J. Wong
                     ` (20 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:17 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extend the 'fsmap' debugger command to support querying the realtime
rmap btree via a new -r argument.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/fsmap.c               |  164 +++++++++++++++++++++++++++++++++++++++++++++-
 libxfs/libxfs_api_defs.h |    2 +
 2 files changed, 162 insertions(+), 4 deletions(-)


diff --git a/db/fsmap.c b/db/fsmap.c
index 7fd42df2a1c..363c159ec07 100644
--- a/db/fsmap.c
+++ b/db/fsmap.c
@@ -102,6 +102,149 @@ fsmap(
 	}
 }
 
+static int
+fsmap_rt_fn(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct fsmap_info		*info = priv;
+
+	dbprintf(_("%llu: %u/%u len %u owner %lld offset %llu bmbt %d attrfork %d extflag %d\n"),
+		info->nr, cur->bc_ino.rtg->rtg_rgno, rec->rm_startblock,
+		rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+		!!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK),
+		!!(rec->rm_flags & XFS_RMAP_ATTR_FORK),
+		!!(rec->rm_flags & XFS_RMAP_UNWRITTEN));
+	info->nr++;
+
+	return 0;
+}
+
+static int
+fsmap_rtgroup(
+	struct xfs_rtgroup		*rtg,
+	const struct xfs_rmap_irec	*low,
+	const struct xfs_rmap_irec	*high,
+	struct fsmap_info		*info)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_trans	*tp;
+	struct xfs_inode	*ip;
+	struct xfs_imeta_path	*path;
+	struct xfs_btree_cur	*bt_cur;
+	xfs_ino_t		ino;
+	int			error;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error) {
+		dbprintf(
+ _("Cannot create path to rtgroup %u rmap inode\n"),
+				rtg->rtg_rgno);
+		return error;
+	}
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error) {
+		dbprintf(
+ _("Cannot alloc transaction to look up rtgroup %u rmap inode\n"),
+				rtg->rtg_rgno);
+		goto out_path;
+	}
+		
+	error = -libxfs_imeta_lookup(tp, path, &ino);
+	if (ino == NULLFSINO)
+		error = ENOENT;
+	if (error) {
+		dbprintf(_("Cannot look up rtgroup %u rmap inode, error %d\n"),
+				rtg->rtg_rgno, error);
+		goto out_trans;
+	}
+
+	error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+	if (error) {
+		dbprintf(_("Cannot load rtgroup %u rmap inode\n"),
+				rtg->rtg_rgno);
+		goto out_trans;
+	}
+
+	bt_cur = libxfs_rtrmapbt_init_cursor(mp, tp, rtg, ip);
+	if (!bt_cur) {
+		dbprintf(_("Not enough memory.\n"));
+		goto out_rele;
+	}
+
+	error = -libxfs_rmap_query_range(bt_cur, low, high, fsmap_rt_fn,
+			info);
+	if (error) {
+		dbprintf(_("Error %d while querying rt fsmap btree.\n"),
+			error);
+		goto out_cur;
+	}
+
+out_cur:
+	libxfs_btree_del_cursor(bt_cur, error);
+out_rele:
+	libxfs_imeta_irele(ip);
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
+static void
+fsmap_rt(
+	xfs_fsblock_t		start_fsb,
+	xfs_fsblock_t		end_fsb)
+{
+	struct fsmap_info	info;
+	xfs_daddr_t		eofs;
+	struct xfs_rmap_irec	low;
+	struct xfs_rmap_irec	high;
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		start_rg;
+	xfs_rgnumber_t		end_rg;
+	int			error;
+
+	if (mp->m_sb.sb_rblocks == 0)
+		return;
+
+	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+	if (XFS_FSB_TO_DADDR(mp, end_fsb) >= eofs)
+		end_fsb = XFS_DADDR_TO_FSB(mp, eofs - 1);
+
+	low.rm_startblock = xfs_rtb_to_rgbno(mp, start_fsb, &start_rg);
+	low.rm_owner = 0;
+	low.rm_offset = 0;
+	low.rm_flags = 0;
+	high.rm_startblock = -1U;
+	high.rm_owner = ULLONG_MAX;
+	high.rm_offset = ULLONG_MAX;
+	high.rm_flags = XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+			XFS_RMAP_UNWRITTEN;
+
+	end_rg = xfs_rtb_to_rgno(mp, end_fsb);
+
+	info.nr = 0;
+	for_each_rtgroup_range(mp, start_rg, end_rg, rtg) {
+		xfs_rgnumber_t		rgno;
+
+		if (rtg->rtg_rgno == end_rg)
+			high.rm_startblock = xfs_rtb_to_rgbno(mp, end_fsb,
+					&rgno);
+ 
+		error = fsmap_rtgroup(rtg, &low, &high, &info);
+		if (error) {
+			libxfs_rtgroup_put(rtg);
+			return;
+		}
+
+		if (rtg->rtg_rgno == start_rg)
+			low.rm_startblock = 0;
+	}
+}
+
 static int
 fsmap_f(
 	int			argc,
@@ -111,14 +254,18 @@ fsmap_f(
 	int			c;
 	xfs_fsblock_t		start_fsb = 0;
 	xfs_fsblock_t		end_fsb = NULLFSBLOCK;
+	bool			isrt = false;
 
 	if (!xfs_has_rmapbt(mp)) {
 		dbprintf(_("Filesystem does not support reverse mapping btree.\n"));
 		return 0;
 	}
 
-	while ((c = getopt(argc, argv, "")) != EOF) {
+	while ((c = getopt(argc, argv, "r")) != EOF) {
 		switch (c) {
+		case 'r':
+			isrt = true;
+			break;
 		default:
 			dbprintf(_("Bad option for fsmap command.\n"));
 			return 0;
@@ -141,14 +288,23 @@ fsmap_f(
 		}
 	}
 
-	fsmap(start_fsb, end_fsb);
+	if (argc > optind + 2) {
+		exitcode = 1;
+		dbprintf(_("Too many arguments to fsmap.\n"));
+		return 0;
+	}
+
+	if (isrt)
+		fsmap_rt(start_fsb, end_fsb);
+	else
+		fsmap(start_fsb, end_fsb);
 
 	return 0;
 }
 
 static const cmdinfo_t	fsmap_cmd =
-	{ "fsmap", NULL, fsmap_f, 0, 2, 0,
-	  N_("[start_fsb] [end_fsb]"),
+	{ "fsmap", NULL, fsmap_f, 0, -1, 0,
+	  N_("[-r] [start_fsb] [end_fsb]"),
 	  N_("display reverse mapping(s)"), NULL };
 
 void
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 4b2fbd7cac9..85a4a131c75 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -284,11 +284,13 @@
 #define xfs_rtsummary_wordcount		libxfs_rtsummary_wordcount
 
 #define xfs_rtfree_extent		libxfs_rtfree_extent
+#define xfs_rtgroup_put			libxfs_rtgroup_put
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
 #define xfs_rtrmapbt_create_path	libxfs_rtrmapbt_create_path
 #define xfs_rtrmapbt_droot_maxrecs	libxfs_rtrmapbt_droot_maxrecs
 #define xfs_rtrmapbt_maxlevels_ondisk	libxfs_rtrmapbt_maxlevels_ondisk
+#define xfs_rtrmapbt_init_cursor	libxfs_rtrmapbt_init_cursor
 #define xfs_rtrmapbt_maxrecs		libxfs_rtrmapbt_maxrecs
 
 #define xfs_sb_from_disk		libxfs_sb_from_disk


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/47] xfs_io: support scrubbing rtgroup metadata paths
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-27 13:17   ` [PATCH 26/47] xfs_db: make fsmap query the realtime reverse mapping tree Darrick J. Wong
@ 2023-12-27 13:17   ` Darrick J. Wong
  2023-12-27 13:17   ` [PATCH 28/47] libfrog: enable scrubbng of the realtime rmap Darrick J. Wong
                     ` (19 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:17 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support scrubbing the metadata directory path of an rtgroup metadata
file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/scrub.c        |   41 +++++++++++++++++++++++++++++++++++------
 man/man8/xfs_io.8 |    3 ++-
 2 files changed, 37 insertions(+), 7 deletions(-)


diff --git a/io/scrub.c b/io/scrub.c
index e9254c7882a..72296cbe86a 100644
--- a/io/scrub.c
+++ b/io/scrub.c
@@ -136,21 +136,23 @@ parse_metapath(
 	int		argc,
 	char		**argv,
 	int		optind,
-	__u64		*ino)
+	__u64		*ino,
+	__u32		*group)
 {
 	char		*p;
 	unsigned long long control;
+	unsigned long	control2 = 0;
 	int		i;
 
-	if (optind != argc - 1) {
+	if (optind != argc - 1 && optind != argc - 2) {
 		fprintf(stderr, _("Must specify metapath number.\n"));
 		return false;
 	}
 
 	for (i = 0; i < XFS_SCRUB_METAPATH_NR; i++) {
 		if (!strcmp(argv[optind], xfrog_metapaths[i].name)) {
-			*ino = i;
-			return true;
+			control = i;
+			goto find_group;
 		}
 	}
 
@@ -161,7 +163,32 @@ parse_metapath(
 		return false;
 	}
 
+find_group:
+	if (xfrog_metapaths[*ino].group == XFROG_SCRUB_GROUP_RTGROUP) {
+		if (optind == argc - 1) {
+			fprintf(stderr,
+_("%s: Metapath requires a group number.\n"),
+					xfrog_metapaths[*ino].name);
+			return false;
+		}
+		control2 = strtoul(argv[optind + 1], &p, 0);
+		if (*p != '\0') {
+			fprintf(stderr,
+ _("Bad group number '%s'.\n"),
+				argv[optind + 1]);
+			return false;
+		}
+	} else {
+		if (optind == argc - 2) {
+			fprintf(stderr,
+_("%s: Metapath does not take a second argument.\n"),
+					xfrog_metapaths[*ino].name);
+			return false;
+		}
+	}
+
 	*ino = control;
+	*group = control2;
 	return true;
 }
 
@@ -237,7 +264,8 @@ parse_args(
 
 	switch (d->group) {
 	case XFROG_SCRUB_GROUP_METAPATH:
-		if (!parse_metapath(argc, argv, optind, &meta->sm_ino)) {
+		if (!parse_metapath(argc, argv, optind, &meta->sm_ino,
+							&meta->sm_agno)) {
 			exitcode = 1;
 			return command_usage(cmdinfo);
 		}
@@ -587,7 +615,8 @@ scrubv_f(
 
 	switch (group) {
 	case XFROG_SCRUB_GROUP_METAPATH:
-		if (!parse_metapath(argc, argv, optind, &vhead->svh_ino)) {
+		if (!parse_metapath(argc, argv, optind, &vhead->svh_ino,
+							&vhead->svh_agno)) {
 			exitcode = 1;
 			return command_usage(&scrubv_cmd);
 		}
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 6cf4b9e32e5..f94de0ce40f 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1447,7 +1447,7 @@ Currently supported versions are 1 and 5.
 .RE
 .PD
 .TP
-.BI "scrub " type " [ " agnumber " | " rgnumber " | " "ino" " " "gen" " | " metapath " ]"
+.BI "scrub " type " [ " agnumber " | " rgnumber " | " "ino" " " "gen" " | " metapath " [ " rgnumber " ] ]"
 Scrub internal XFS filesystem metadata.  The
 .BI type
 parameter specifies which type of metadata to scrub.
@@ -1456,6 +1456,7 @@ For realtime group metadata, one rtgroup number must be specified.
 For file metadata, the scrub is applied to the open file unless the
 inode number and generation number are specified.
 For metapath, the name of a file or a raw number must be specified.
+If the metapath file is a per-rtgroup file, the group number must be specified.
 .RE
 .PD
 .TP


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/47] libfrog: enable scrubbng of the realtime rmap
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-27 13:17   ` [PATCH 27/47] xfs_io: support scrubbing rtgroup metadata paths Darrick J. Wong
@ 2023-12-27 13:17   ` Darrick J. Wong
  2023-12-27 13:18   ` [PATCH 29/47] xfs_scrub: check rtrmapbt metadata directory connections Darrick J. Wong
                     ` (18 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:17 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new entry so that we can scrub the rtrmapbt.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c |    5 +++++
 scrub/repair.c  |    1 +
 2 files changed, 6 insertions(+)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index 8822fc0088c..290ba0fb8bf 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -169,6 +169,11 @@ const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR] = {
 		.descr	= "realtime group bitmap",
 		.group	= XFROG_SCRUB_GROUP_RTGROUP,
 	},
+	[XFS_SCRUB_TYPE_RTRMAPBT] = {
+		.name	= "rtrmapbt",
+		.descr	= "realtime reverse mapping btree",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
diff --git a/scrub/repair.c b/scrub/repair.c
index 43037a7c5e1..fee03f97701 100644
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -532,6 +532,7 @@ repair_item_difficulty(
 
 		switch (scrub_type) {
 		case XFS_SCRUB_TYPE_RMAPBT:
+		case XFS_SCRUB_TYPE_RTRMAPBT:
 			ret |= REPAIR_DIFFICULTY_SECONDARY;
 			break;
 		case XFS_SCRUB_TYPE_SB:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/47] xfs_scrub: check rtrmapbt metadata directory connections
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-27 13:17   ` [PATCH 28/47] libfrog: enable scrubbng of the realtime rmap Darrick J. Wong
@ 2023-12-27 13:18   ` Darrick J. Wong
  2023-12-27 13:18   ` [PATCH 30/47] xfs_scrub: retest metadata across scrub groups after a repair Darrick J. Wong
                     ` (17 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:18 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Run the rt rmap btree metapath scrubber during phase 5 to ensure that
it's still connected to the metadir tree after we've pruned any bad
links.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase5.c |   24 ++++++++++++++++++++++--
 scrub/scrub.h  |    4 +++-
 2 files changed, 25 insertions(+), 3 deletions(-)


diff --git a/scrub/phase5.c b/scrub/phase5.c
index 3b8daf39ae6..6fd3c698270 100644
--- a/scrub/phase5.c
+++ b/scrub/phase5.c
@@ -743,6 +743,7 @@ static int
 queue_metapath_scan(
 	struct workqueue	*wq,
 	bool			*abortedp,
+	xfs_rgnumber_t		rgno,
 	uint64_t		type)
 {
 	struct fs_scan_item	*item;
@@ -755,7 +756,7 @@ queue_metapath_scan(
 		str_liberror(ctx, ret, _("setting up metapath scan"));
 		return ret;
 	}
-	scrub_item_init_metapath(&item->sri, type);
+	scrub_item_init_metapath(&item->sri, rgno, type);
 	scrub_item_schedule(&item->sri, XFS_SCRUB_TYPE_METAPATH);
 	item->abortedp = abortedp;
 
@@ -778,6 +779,7 @@ run_kernel_metadir_path_scrubbers(
 	const struct xfrog_scrub_descr	*sc;
 	uint64_t		type;
 	unsigned int		nr_threads = scrub_nproc_workqueue(ctx);
+	xfs_rgnumber_t		rgno;
 	bool			aborted = false;
 	int			ret, ret2;
 
@@ -797,7 +799,7 @@ run_kernel_metadir_path_scrubbers(
 		if (sc->group != XFROG_SCRUB_GROUP_FS)
 			continue;
 
-		ret = queue_metapath_scan(&wq, &aborted, type);
+		ret = queue_metapath_scan(&wq, &aborted, 0, type);
 		if (ret) {
 			str_liberror(ctx, ret,
  _("queueing metapath scrub work"));
@@ -805,6 +807,24 @@ run_kernel_metadir_path_scrubbers(
 		}
 	}
 
+	/* Scan all rtgroup metadata files */
+	for (rgno = 0;
+	     rgno < ctx->mnt.fsgeom.rgcount && !aborted;
+	     rgno++) {
+		for (type = 0; type < XFS_SCRUB_METAPATH_NR; type++) {
+			sc = &xfrog_metapaths[type];
+			if (sc->group != XFROG_SCRUB_GROUP_RTGROUP)
+				continue;
+
+			ret = queue_metapath_scan(&wq, &aborted, rgno, type);
+			if (ret) {
+				str_liberror(ctx, ret,
+  _("queueing metapath scrub work"));
+				goto wait;
+			}
+		}
+	}
+
 wait:
 	ret2 = -workqueue_terminate(&wq);
 	if (ret2) {
diff --git a/scrub/scrub.h b/scrub/scrub.h
index bb94a11dcfc..24b5ad629c5 100644
--- a/scrub/scrub.h
+++ b/scrub/scrub.h
@@ -118,9 +118,11 @@ scrub_item_init_file(struct scrub_item *sri, const struct xfs_bulkstat *bstat)
 }
 
 static inline void
-scrub_item_init_metapath(struct scrub_item *sri, uint64_t metapath)
+scrub_item_init_metapath(struct scrub_item *sri, xfs_rgnumber_t rgno,
+		uint64_t metapath)
 {
 	memset(sri, 0, sizeof(*sri));
+	sri->sri_agno = rgno;
 	sri->sri_ino = metapath;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/47] xfs_scrub: retest metadata across scrub groups after a repair
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-27 13:18   ` [PATCH 29/47] xfs_scrub: check rtrmapbt metadata directory connections Darrick J. Wong
@ 2023-12-27 13:18   ` Darrick J. Wong
  2023-12-27 13:18   ` [PATCH 31/47] xfs_spaceman: report health status of the realtime rmap btree Darrick J. Wong
                     ` (16 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:18 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Certain types of metadata have dependencies that cross scrub groups.
For example, after a repair the part of realtime bitmap corresponding to
a realtime group, we potentially need to rebuild the realtime summary to
reflect the new bitmap contents.  The rtsummary is a separate scrub group
(metafiles) from the rgbitmap (rtgroup), which means that the rtsummary
repairs must be tracked by a separate scrub_item.

Create the necessary dependency table and code to make these kinds of
cross-group validations possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase4.c |   54 +++++++++++++++++++
 scrub/repair.c |  158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrub/repair.h |    5 ++
 3 files changed, 216 insertions(+), 1 deletion(-)


diff --git a/scrub/phase4.c b/scrub/phase4.c
index 88cb53aeac9..c58e4aaabda 100644
--- a/scrub/phase4.c
+++ b/scrub/phase4.c
@@ -42,6 +42,51 @@ struct repair_list_schedule {
 	bool				made_progress;
 };
 
+/*
+ * After a successful repair, schedule any additional revalidations needed in
+ * other scrub groups.
+ */
+static int
+revalidate_across_groups(
+	struct scrub_ctx		*ctx,
+	const struct action_item	*old_aitem,
+	struct repair_list_schedule	*rls)
+{
+	struct action_list		alist;
+	int				error;
+
+	action_list_init(&alist);
+
+	error = action_item_schedule_revalidation(ctx, old_aitem, &alist);
+	if (error) {
+		rls->aborted = true;
+		return error;
+	}
+
+	if (action_list_empty(&alist))
+		return 0;
+
+	pthread_mutex_unlock(&rls->lock);
+	error = action_list_revalidate(ctx, &alist);
+	pthread_mutex_lock(&rls->lock);
+
+	/*
+	 * Action items attached to @alist after the revalidation are either
+	 * the result of finding new inconsistencies or an incomplete list
+	 * after an operational error.  In the first case we need these new
+	 * items to be processed; in the second case, we're going to exit the
+	 * process.  Either way, pass the items back to the caller.
+	 */
+	action_list_merge(&rls->requeue_list, &alist);
+
+	if (error) {
+		rls->aborted = true;
+		return error;
+	}
+
+	return 0;
+}
+
 /* Try to repair as many things on our list as we can. */
 static void
 repair_list_worker(
@@ -89,9 +134,16 @@ repair_list_worker(
 			action_list_add(&rls->requeue_list, aitem);
 			break;
 		case TR_REPAIRED:
+			ret = revalidate_across_groups(ctx, aitem, rls);
+			if (ret) {
+				free(aitem);
+				break;
+			}
+
 			/*
 			 * All repairs for this item completed.  Free the item,
-			 * and remember that progress was made.
+			 * and remember that progress was made, even if group
+			 * revalidation uncovered more issues.
 			 */
 			rls->made_progress = true;
 			free(aitem);
diff --git a/scrub/repair.c b/scrub/repair.c
index fee03f97701..72533ab5b02 100644
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -43,6 +43,15 @@ static const unsigned int repair_deps[XFS_SCRUB_TYPE_NR] = {
 					  DEP(XFS_SCRUB_TYPE_PQUOTA),
 	[XFS_SCRUB_TYPE_RTSUM]		= DEP(XFS_SCRUB_TYPE_RTBITMAP),
 };
+
+/*
+ * Data dependencies that cross scrub groups.  When we repair a metadata object
+ * of the given type (e.g. rtgroup bitmaps), we want to trigger a revalidation
+ * of the specified objects (e.g. rt summary file).
+ */
+static const unsigned int cross_group_recheck[XFS_SCRUB_TYPE_NR] = {
+	[XFS_SCRUB_TYPE_RGBITMAP]	= DEP(XFS_SCRUB_TYPE_RTSUM),
+};
 #undef DEP
 
 /*
@@ -631,6 +640,16 @@ action_list_add(
 	list_add_tail(&aitem->list, &alist->list);
 }
 
+/* Move an action item off of a list onto alist. */
+static void
+action_list_move(
+	struct action_list		*alist,
+	struct action_item		*aitem)
+{
+	list_del_init(&aitem->list);
+	action_list_add(alist, aitem);
+}
+
 /*
  * Try to repair a filesystem object and let the caller know what it should do
  * with the action item.  The caller must be able to requeue action items, so
@@ -894,3 +913,142 @@ repair_item_to_action_item(
 	*aitemp = aitem;
 	return 0;
 }
+
+static int
+schedule_cross_group_recheck(
+	struct scrub_ctx	*ctx,
+	unsigned int		recheck_mask,
+	struct action_list	*new_items)
+{
+	unsigned int		scrub_type;
+
+	foreach_scrub_type(scrub_type) {
+		struct action_item	*aitem;
+
+		if (!(recheck_mask & (1U << scrub_type)))
+			continue;
+
+		switch (xfrog_scrubbers[scrub_type].group) {
+		case XFROG_SCRUB_GROUP_FS:
+			/*
+			 * XXX gcc fortify gets confused on the memset in
+			 * scrub_item_init_fs if we hoist this allocation to a
+			 * helper function.
+			 */
+			aitem = malloc(sizeof(struct action_item));
+			if (!aitem) {
+				int	error = errno;
+
+				str_liberror(ctx, error,
+						_("creating repair revalidation action item"));
+				return error;
+			}
+
+			INIT_LIST_HEAD(&aitem->list);
+			aitem->sri.sri_revalidate = true;
+
+			scrub_item_init_fs(&aitem->sri);
+			scrub_item_schedule(&aitem->sri, scrub_type);
+			action_list_add(new_items, aitem);
+			break;
+		default:
+			/* We don't support any other groups yet. */
+			assert(false);
+			continue;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * After a successful repair, schedule revalidation of metadata outside of this
+ * scrub item's group.
+ */
+int
+action_item_schedule_revalidation(
+	struct scrub_ctx		*ctx,
+	const struct action_item	*old_aitem,
+	struct action_list		*new_repairs)
+{
+	struct action_list		new_items;
+	struct action_item		*aitem, *n;
+	unsigned int			scrub_type;
+	int				error = 0;
+
+	/* Find new scrub items to revalidate */
+	action_list_init(&new_items);
+	foreach_scrub_type(scrub_type) {
+		unsigned int		mask;
+
+		if (!(old_aitem->sri.sri_selected & (1ULL << scrub_type)))
+			continue;
+		mask = cross_group_recheck[scrub_type];
+		if (!mask)
+			continue;
+
+		error = schedule_cross_group_recheck(ctx, mask, &new_items);
+		if (error)
+			goto bad;
+	}
+	if (action_list_empty(&new_items))
+		return 0;
+
+	/* Scrub them all, and move corrupted items to the caller's list */
+	list_for_each_entry_safe(aitem, n, &new_items.list, list) {
+		unsigned int	bad;
+
+		error = scrub_item_check(ctx, &aitem->sri);
+		if (error)
+			 goto bad;
+
+		bad = repair_item_count_needsrepair(&aitem->sri);
+		if (bad > 0) {
+			/*
+			 * Uhoh, we found something else broken.  Queue it for
+			 * more repairs.
+			 */
+			aitem->sri.sri_revalidate = false;
+			action_list_move(new_repairs, aitem);
+		}
+	}
+
+bad:
+	/* Delete anything that's still on the list. */
+	list_for_each_entry_safe(aitem, n, &new_items.list, list) {
+		list_del(&aitem->list);
+		free(aitem);
+	}
+
+	return error;
+}
+
+/*
+ * Revalidate all items scheduled for a recheck, and drop the ones that are
+ * clean.
+ */
+int
+action_list_revalidate(
+	struct scrub_ctx	*ctx,
+	struct action_list	*alist)
+{
+	struct action_item	*aitem, *n;
+	int			error;
+
+	list_for_each_entry_safe(aitem, n, &alist->list, list) {
+		error = scrub_item_check(ctx, &aitem->sri);
+		if (error)
+			return error;
+
+		if (repair_item_count_needsrepair(&aitem->sri) > 0) {
+			aitem->sri.sri_revalidate = false;
+			continue;
+		}
+
+		/* Metadata are clean, delete from list. */
+		list_del(&aitem->list);
+		free(aitem);
+	}
+
+	return 0;
+}
diff --git a/scrub/repair.h b/scrub/repair.h
index ec4aa381a82..96f621f124d 100644
--- a/scrub/repair.h
+++ b/scrub/repair.h
@@ -50,6 +50,11 @@ enum tryrepair_outcome {
 int action_item_try_repair(struct scrub_ctx *ctx, struct action_item *aitem,
 		enum tryrepair_outcome *outcome);
 
+int action_item_schedule_revalidation(struct scrub_ctx *ctx,
+		const struct action_item *old_aitem,
+		struct action_list *new_items);
+int action_list_revalidate(struct scrub_ctx *sc, struct action_list *alist);
+
 void repair_item_mustfix(struct scrub_item *sri, struct scrub_item *fix_now);
 
 /* Primary metadata is corrupt */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/47] xfs_spaceman: report health status of the realtime rmap btree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-27 13:18   ` [PATCH 30/47] xfs_scrub: retest metadata across scrub groups after a repair Darrick J. Wong
@ 2023-12-27 13:18   ` Darrick J. Wong
  2023-12-27 13:18   ` [PATCH 32/47] libxfs: dirty buffers should be marked uptodate too Darrick J. Wong
                     ` (15 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:18 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add reporting of the rt rmap btree health to spaceman.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 spaceman/health.c |   10 ++++++++++
 1 file changed, 10 insertions(+)


diff --git a/spaceman/health.c b/spaceman/health.c
index d9a18fcc3b4..fac81f4eda5 100644
--- a/spaceman/health.c
+++ b/spaceman/health.c
@@ -41,6 +41,11 @@ static bool has_reflink(const struct xfs_fsop_geom *g)
 	return g->flags & XFS_FSOP_GEOM_FLAGS_REFLINK;
 }
 
+static bool has_rtrmapbt(const struct xfs_fsop_geom *g)
+{
+	return g->rtblocks > 0 && (g->flags & XFS_FSOP_GEOM_FLAGS_RMAPBT);
+}
+
 struct flag_map {
 	unsigned int		mask;
 	bool			(*has_fn)(const struct xfs_fsop_geom *g);
@@ -145,6 +150,11 @@ static const struct flag_map rtgroup_flags[] = {
 		.mask = XFS_RTGROUP_GEOM_SICK_BITMAP,
 		.descr = "realtime bitmap",
 	},
+	{
+		.mask = XFS_RTGROUP_GEOM_SICK_RMAPBT,
+		.descr = "realtime reverse mappings btree",
+		.has_fn = has_rtrmapbt,
+	},
 	{0},
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/47] libxfs: dirty buffers should be marked uptodate too
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-27 13:18   ` [PATCH 31/47] xfs_spaceman: report health status of the realtime rmap btree Darrick J. Wong
@ 2023-12-27 13:18   ` Darrick J. Wong
  2023-12-27 13:19   ` [PATCH 33/47] xfs_repair: flag suspect long-format btree blocks Darrick J. Wong
                     ` (14 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:18 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

I started fuzz-testing the realtime rmap feature with a very large
number of realtime allocation groups.  There were so many rt groups that
repair had to rebuild /realtime in the metadata directory tree, and that
directory was big enough to spur the creation of a block format
directory.

Unfortunately, repair then walks both directory trees to look for
unconnceted files.  This part of phase 6 emits CRC errors on the newly
created buffers for the /realtime directory, declares the directory to
be garbage, and moves all the rt rmap inodes to /lost+found, resulting
in a corrupt fs.

Poking around in gdb, I noticed that the buffer contents were indeed
zero, and that UPTODATE was not set.  This was very strange, until I
added a watch on bp->b_flags to watch for accesses.  It turns out that
xfs_repair's prefetch code will _get a buffer and zero the contents if
UPTODATE is not set.

The directory tree code in libxfs will also _get a buffer, initialize
it, and log it to the coordinating transaction, which in this case is
the transactions used to reconnect the rmap btree inodes to /realtime.
At no point does any of that code ever set UPTODATE on the buffer, which
is why prefetch zaps the contents.

Hence change both buffer dirtying functions to set UPTODATE, since a
dirty buffer is by definition at least as recent as whatever's on disk.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/rdwr.c  |    2 +-
 libxfs/trans.c |    1 +
 2 files changed, 2 insertions(+), 1 deletion(-)


diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 17abced06c9..a3b30510926 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -959,7 +959,7 @@ libxfs_buf_mark_dirty(
 	 */
 	bp->b_error = 0;
 	bp->b_flags &= ~LIBXFS_B_STALE;
-	bp->b_flags |= LIBXFS_B_DIRTY;
+	bp->b_flags |= LIBXFS_B_DIRTY | LIBXFS_B_UPTODATE;
 }
 
 /* Prepare a buffer to be sent to the MRU list. */
diff --git a/libxfs/trans.c b/libxfs/trans.c
index aab9923d9ad..3c5d6383e8c 100644
--- a/libxfs/trans.c
+++ b/libxfs/trans.c
@@ -716,6 +716,7 @@ libxfs_trans_dirty_buf(
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 
+	bp->b_flags |= LIBXFS_B_UPTODATE;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/47] xfs_repair: flag suspect long-format btree blocks
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-27 13:18   ` [PATCH 32/47] libxfs: dirty buffers should be marked uptodate too Darrick J. Wong
@ 2023-12-27 13:19   ` Darrick J. Wong
  2023-12-27 13:19   ` [PATCH 34/47] xfs_repair: use realtime rmap btree data to check block types Darrick J. Wong
                     ` (13 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:19 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass a "suspect" counter through scan_lbtree just like we do for
short-format btree blocks, and increment its value when we encounter
blocks with bad CRCs or outright corruption.  This makes it so that
repair actually catches bmbt blocks with bad crcs or other verifier
errors.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |    2 +-
 repair/scan.c   |   15 ++++++++++++---
 repair/scan.h   |    3 +++
 3 files changed, 16 insertions(+), 4 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index 31b3cd74139..a0071d5de88 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -872,7 +872,7 @@ _("bad bmap btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
 
 		if (scan_lbtree(get_unaligned_be64(&pp[i]), level, scan_bmapbt,
 				type, whichfork, lino, tot, nex, blkmapp,
-				&cursor, 1, check_dups, magic,
+				&cursor, 0, 1, check_dups, magic,
 				(void *)zap_metadata, &xfs_bmbt_buf_ops))
 			return(1);
 		/*
diff --git a/repair/scan.c b/repair/scan.c
index 1cd4d0ad2e1..2f8a3348ae1 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -136,6 +136,7 @@ scan_lbtree(
 				xfs_extnum_t		*nex,
 				blkmap_t		**blkmapp,
 				bmap_cursor_t		*bm_cursor,
+				int			suspect,
 				int			isroot,
 				int			check_dups,
 				int			*dirty,
@@ -148,6 +149,7 @@ scan_lbtree(
 	xfs_extnum_t	*nex,
 	blkmap_t	**blkmapp,
 	bmap_cursor_t	*bm_cursor,
+	int		suspect,
 	int		isroot,
 	int		check_dups,
 	uint64_t	magic,
@@ -167,6 +169,12 @@ scan_lbtree(
 			XFS_FSB_TO_AGBNO(mp, root));
 		return(1);
 	}
+	if (bp->b_error == -EFSBADCRC || bp->b_error == -EFSCORRUPTED) {
+		do_warn(_("btree block %d/%d is suspect, error %d\n"),
+			XFS_FSB_TO_AGNO(mp, root),
+			XFS_FSB_TO_AGBNO(mp, root), bp->b_error);
+		suspect++;
+	}
 
 	/*
 	 * only check for bad CRC here - caller will determine if there
@@ -182,7 +190,7 @@ scan_lbtree(
 
 	err = (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1,
 			type, whichfork, root, ino, tot, nex, blkmapp,
-			bm_cursor, isroot, check_dups, &dirty,
+			bm_cursor, suspect, isroot, check_dups, &dirty,
 			magic, priv);
 
 	ASSERT(dirty == 0 || (dirty && !no_modify));
@@ -209,6 +217,7 @@ scan_bmapbt(
 	xfs_extnum_t		*nex,
 	blkmap_t		**blkmapp,
 	bmap_cursor_t		*bm_cursor,
+	int			suspect,
 	int			isroot,
 	int			check_dups,
 	int			*dirty,
@@ -516,7 +525,7 @@ _("bad bmap btree ptr 0x%llx in ino %" PRIu64 "\n"),
 
 		err = scan_lbtree(be64_to_cpu(pp[i]), level, scan_bmapbt,
 				type, whichfork, ino, tot, nex, blkmapp,
-				bm_cursor, 0, check_dups, magic, priv,
+				bm_cursor, suspect, 0, check_dups, magic, priv,
 				&xfs_bmbt_buf_ops);
 		if (err)
 			return(1);
@@ -584,7 +593,7 @@ _("bad fwd (right) sibling pointer (saw %" PRIu64 " should be NULLFSBLOCK)\n"
 				be64_to_cpu(pkey[numrecs - 1].br_startoff);
 	}
 
-	return(0);
+	return suspect > 0 ? 1 : 0;
 }
 
 static void
diff --git a/repair/scan.h b/repair/scan.h
index 4da788becbe..aeaf9f1a7f4 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -23,6 +23,7 @@ int scan_lbtree(
 				xfs_extnum_t		*nex,
 				struct blkmap		**blkmapp,
 				bmap_cursor_t		*bm_cursor,
+				int			suspect,
 				int			isroot,
 				int			check_dups,
 				int			*dirty,
@@ -35,6 +36,7 @@ int scan_lbtree(
 	xfs_extnum_t	*nex,
 	struct blkmap	**blkmapp,
 	bmap_cursor_t	*bm_cursor,
+	int		suspect,
 	int		isroot,
 	int		check_dups,
 	uint64_t	magic,
@@ -52,6 +54,7 @@ int scan_bmapbt(
 	xfs_extnum_t		*nex,
 	struct blkmap		**blkmapp,
 	bmap_cursor_t		*bm_cursor,
+	int			suspect,
 	int			isroot,
 	int			check_dups,
 	int			*dirty,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/47] xfs_repair: use realtime rmap btree data to check block types
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-27 13:19   ` [PATCH 33/47] xfs_repair: flag suspect long-format btree blocks Darrick J. Wong
@ 2023-12-27 13:19   ` Darrick J. Wong
  2023-12-27 13:19   ` [PATCH 35/47] xfs_repair: create a new set of incore rmap information for rt groups Darrick J. Wong
                     ` (12 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:19 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the realtime rmap btree to pre-populate the block type information
so that when repair iterates the primary metadata, we can confirm the
block type.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |  163 +++++++++++++++++++++++
 repair/scan.c   |  390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 repair/scan.h   |   34 +++++
 3 files changed, 577 insertions(+), 10 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index a0071d5de88..23cec00ad9e 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -777,6 +777,153 @@ get_agino_buf(
  * first, one utility routine for each type of inode
  */
 
+/*
+ * return 1 if inode should be cleared, 0 otherwise
+ */
+static int
+process_rtrmap(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		ino,
+	struct xfs_dinode	*dip,
+	int			type,
+	int			*dirty,
+	xfs_rfsblock_t		*tot,
+	uint64_t		*nex,
+	blkmap_t		**blkmapp,
+	int			check_dups)
+{
+	struct xfs_rmap_irec	oldkey;
+	struct xfs_rmap_irec	key;
+	struct rmap_priv	priv;
+	struct xfs_rtrmap_root	*dib;
+	xfs_rtrmap_ptr_t	*pp;
+	struct xfs_rmap_key	*kp;
+	struct xfs_rmap_rec	*rp;
+	char			*forkname = get_forkname(XFS_DATA_FORK);
+	xfs_ino_t		lino;
+	xfs_fsblock_t		bno;
+	size_t			droot_sz;
+	int			i;
+	int			level;
+	int			numrecs;
+	int			dmxr;
+	int			suspect = 0;
+	int			error;
+
+	/* We rebuild the rtrmapbt, so no need to process blocks again. */
+	if (check_dups) {
+		*tot = be64_to_cpu(dip->di_nblocks);
+		return 0;
+	}
+
+	lino = XFS_AGINO_TO_INO(mp, agno, ino);
+
+	/* This rmap btree inode must be a metadata inode. */
+	if (!(dip->di_flags2 & be64_to_cpu(XFS_DIFLAG2_METADIR))) {
+		do_warn(
+_("rtrmap inode %" PRIu64 " not flagged as metadata\n"),
+			lino);
+		return 1;
+	}
+
+	memset(&priv.high_key, 0xFF, sizeof(priv.high_key));
+	priv.high_key.rm_blockcount = 0;
+	priv.agcnts = NULL;
+	priv.last_rec.rm_owner = XFS_RMAP_OWN_UNKNOWN;
+
+	dib = (struct xfs_rtrmap_root *)XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	*tot = 0;
+	*nex = 0;
+
+	level = be16_to_cpu(dib->bb_level);
+	numrecs = be16_to_cpu(dib->bb_numrecs);
+
+	if (level > mp->m_rtrmap_maxlevels) {
+		do_warn(
+_("bad level %d in inode %" PRIu64 " rtrmap btree root block\n"),
+			level, lino);
+		return 1;
+	}
+
+	/*
+	 * use rtroot/dfork_dsize since the root block is in the data fork
+	 */
+	droot_sz = xfs_rtrmap_droot_space_calc(level, numrecs);
+	if (droot_sz > XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK)) {
+		do_warn(
+_("computed size of rtrmapbt root (%zu bytes) is greater than space in "
+	  "inode %" PRIu64 " %s fork\n"),
+				droot_sz, lino, forkname);
+		return 1;
+	}
+
+	if (level == 0) {
+		rp = xfs_rtrmap_droot_rec_addr(dib, 1);
+		error = process_rtrmap_reclist(mp, rp, numrecs,
+				&priv.last_rec, NULL, "rtrmapbt root");
+		if (error) {
+			rmap_avoid_check();
+			return 1;
+		}
+		return 0;
+	}
+
+	dmxr = libxfs_rtrmapbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK), false);
+	pp = xfs_rtrmap_droot_ptr_addr(dib, 1, dmxr);
+
+	/* check for in-order keys */
+	for (i = 0; i < numrecs; i++)  {
+		kp = xfs_rtrmap_droot_key_addr(dib, i + 1);
+
+		key.rm_flags = 0;
+		key.rm_startblock = be32_to_cpu(kp->rm_startblock);
+		key.rm_owner = be64_to_cpu(kp->rm_owner);
+		if (libxfs_rmap_irec_offset_unpack(be64_to_cpu(kp->rm_offset),
+				&key)) {
+			/* Look for impossible flags. */
+			do_warn(
+_("invalid flags in key %u of rtrmap root ino %" PRIu64 "\n"),
+				i, lino);
+			suspect++;
+			continue;
+		}
+		if (i == 0) {
+			oldkey = key;
+			continue;
+		}
+		if (rmap_diffkeys(&oldkey, &key) > 0) {
+			do_warn(
+_("out of order key %u in rtrmap root ino %" PRIu64 "\n"),
+				i, lino);
+			suspect++;
+			continue;
+		}
+		oldkey = key;
+	}
+
+	/* probe keys */
+	for (i = 0; i < numrecs; i++)  {
+		bno = get_unaligned_be64(&pp[i]);
+
+		if (!libxfs_verify_fsbno(mp, bno))  {
+			do_warn(
+_("bad rtrmap btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
+				bno, lino);
+			return 1;
+		}
+
+		if (scan_lbtree(bno, level, scan_rtrmapbt,
+				type, XFS_DATA_FORK, lino, tot, nex, blkmapp,
+				NULL, 0, 1, check_dups, XFS_RTRMAP_CRC_MAGIC,
+				&priv, &xfs_rtrmapbt_buf_ops))
+			return 1;
+	}
+
+	return suspect ? 1 : 0;
+}
+
 /*
  * return 1 if inode should be cleared, 0 otherwise
  */
@@ -1553,7 +1700,7 @@ static int
 check_dinode_mode_format(
 	struct xfs_dinode	*dinoc)
 {
-	if (dinoc->di_format >= XFS_DINODE_FMT_UUID)
+	if (dinoc->di_format == XFS_DINODE_FMT_UUID)
 		return -1;	/* FMT_UUID is not used */
 
 	switch (dinode_fmt(dinoc)) {
@@ -1568,8 +1715,13 @@ check_dinode_mode_format(
 			dinoc->di_format > XFS_DINODE_FMT_BTREE) ? -1 : 0;
 
 	case S_IFREG:
-		return (dinoc->di_format < XFS_DINODE_FMT_EXTENTS ||
-			dinoc->di_format > XFS_DINODE_FMT_BTREE) ? -1 : 0;
+		switch (dinoc->di_format) {
+		case XFS_DINODE_FMT_RMAP:
+		case XFS_DINODE_FMT_EXTENTS:
+		case XFS_DINODE_FMT_BTREE:
+			return 0;
+		}
+		return -1;
 
 	case S_IFLNK:
 		return (dinoc->di_format < XFS_DINODE_FMT_LOCAL ||
@@ -1983,6 +2135,10 @@ process_inode_data_fork(
 			totblocks, nextents, dblkmap, XFS_DATA_FORK,
 			check_dups, zap_metadata);
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		err = process_rtrmap(mp, agno, ino, dino, type, dirty,
+				totblocks, nextents, dblkmap, check_dups);
+		break;
 	case XFS_DINODE_FMT_DEV:
 		err = 0;
 		break;
@@ -2042,6 +2198,7 @@ _("would have tried to rebuild inode %"PRIu64" data fork\n"),
 				XFS_DATA_FORK, 0, zap_metadata);
 			break;
 		case XFS_DINODE_FMT_DEV:
+		case XFS_DINODE_FMT_RMAP:
 			err = 0;
 			break;
 		default:
diff --git a/repair/scan.c b/repair/scan.c
index 2f8a3348ae1..27aeb341bf3 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -959,13 +959,6 @@ _("unknown block (%d,%d-%d) mismatch on %s tree, state - %d,%" PRIx64 "\n"),
 	}
 }
 
-struct rmap_priv {
-	struct aghdr_cnts	*agcnts;
-	struct xfs_rmap_irec	high_key;
-	struct xfs_rmap_irec	last_rec;
-	xfs_agblock_t		nr_blocks;
-};
-
 static bool
 rmap_in_order(
 	xfs_agblock_t	b,
@@ -1367,6 +1360,389 @@ _("out of order key %u in %s btree block (%u/%u)\n"),
 		rmap_avoid_check();
 }
 
+int
+process_rtrmap_reclist(
+	struct xfs_mount	*mp,
+	struct xfs_rmap_rec	*rp,
+	int			numrecs,
+	struct xfs_rmap_irec	*last_rec,
+	struct xfs_rmap_irec	*high_key,
+	const char		*name)
+{
+	int			suspect = 0;
+	int			i;
+	struct xfs_rmap_irec	oldkey;
+	struct xfs_rmap_irec	key;
+
+	for (i = 0; i < numrecs; i++) {
+		xfs_rgblock_t		b, end;
+		xfs_extlen_t		len;
+		uint64_t		owner, offset;
+
+		b = be32_to_cpu(rp[i].rm_startblock);
+		len = be32_to_cpu(rp[i].rm_blockcount);
+		owner = be64_to_cpu(rp[i].rm_owner);
+		offset = be64_to_cpu(rp[i].rm_offset);
+
+		key.rm_flags = 0;
+		key.rm_startblock = b;
+		key.rm_blockcount = len;
+		key.rm_owner = owner;
+		if (libxfs_rmap_irec_offset_unpack(offset, &key)) {
+			/* Look for impossible flags. */
+			do_warn(
+_("invalid flags in record %u of %s\n"),
+				i, name);
+			suspect++;
+			continue;
+		}
+
+
+		end = key.rm_startblock + key.rm_blockcount;
+
+		/* Make sure startblock & len make sense. */
+		if (b >= mp->m_sb.sb_rgblocks) {
+			do_warn(
+_("invalid start block %llu in record %u of %s\n"),
+				(unsigned long long)b, i, name);
+			suspect++;
+			continue;
+		}
+		if (len == 0 || end - 1 >= mp->m_sb.sb_rgblocks) {
+			do_warn(
+_("invalid length %llu in record %u of %s\n"),
+				(unsigned long long)len, i, name);
+			suspect++;
+			continue;
+		}
+
+		/* We only store file data and superblocks in the rtrmap. */
+		if (XFS_RMAP_NON_INODE_OWNER(owner) &&
+		    owner != XFS_RMAP_OWN_FS) {
+			do_warn(
+_("invalid owner %lld in record %u of %s\n"),
+				(long long int)owner, i, name);
+			suspect++;
+			continue;
+		}
+
+		/* Look for impossible record field combinations. */
+		if (key.rm_flags & XFS_RMAP_KEY_FLAGS) {
+			do_warn(
+_("record %d cannot have attr fork/key flags in %s\n"),
+					i, name);
+			suspect++;
+			continue;
+		}
+
+		/* Check for out of order records. */
+		if (i == 0)
+			oldkey = key;
+		else {
+			if (rmap_diffkeys(&oldkey, &key) > 0)
+				do_warn(
+_("out-of-order record %d (%llu %"PRId64" %"PRIu64" %llu) in %s\n"),
+				i, (unsigned long long)b, owner, offset,
+				(unsigned long long)len, name);
+			else
+				oldkey = key;
+		}
+
+		/* Is this mergeable with the previous record? */
+		if (rmaps_are_mergeable(last_rec, &key)) {
+			do_warn(
+_("record %d in %s should be merged with previous record\n"),
+				i, name);
+			last_rec->rm_blockcount += key.rm_blockcount;
+		} else
+			*last_rec = key;
+
+		/* Check that we don't go past the high key. */
+		key.rm_startblock += key.rm_blockcount - 1;
+		key.rm_offset += key.rm_blockcount - 1;
+		key.rm_blockcount = 0;
+		if (high_key && rmap_diffkeys(&key, high_key) > 0) {
+			do_warn(
+_("record %d greater than high key of %s\n"),
+				i, name);
+			suspect++;
+		}
+	}
+
+	return suspect;
+}
+
+int
+scan_rtrmapbt(
+	struct xfs_btree_block	*block,
+	int			level,
+	int			type,
+	int			whichfork,
+	xfs_fsblock_t		fsbno,
+	xfs_ino_t		ino,
+	xfs_rfsblock_t		*tot,
+	uint64_t		*nex,
+	blkmap_t		**blkmapp,
+	bmap_cursor_t		*bm_cursor,
+	int			suspect,
+	int			isroot,
+	int			check_dups,
+	int			*dirty,
+	uint64_t		magic,
+	void			*priv)
+{
+	const char		*name = "rtrmap";
+	char			rootname[256];
+	int			i;
+	xfs_rtrmap_ptr_t	*pp;
+	struct xfs_rmap_rec	*rp;
+	struct rmap_priv	*rmap_priv = priv;
+	int			hdr_errors = 0;
+	int			numrecs;
+	int			state;
+	struct xfs_rmap_key	*kp;
+	struct xfs_rmap_irec	oldkey;
+	struct xfs_rmap_irec	key;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	int			error;
+
+	agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+
+	/* If anything here is bad, just bail. */
+	if (be32_to_cpu(block->bb_magic) != magic) {
+		do_warn(
+_("bad magic # %#x in inode %" PRIu64 " %s block %" PRIu64 "\n"),
+			be32_to_cpu(block->bb_magic), ino, name, fsbno);
+		return 1;
+	}
+	if (be16_to_cpu(block->bb_level) != level) {
+		do_warn(
+_("expected level %d got %d in inode %" PRIu64 ", %s block %" PRIu64 "\n"),
+			level, be16_to_cpu(block->bb_level),
+			ino, name, fsbno);
+		return(1);
+	}
+
+	/* verify owner */
+	if (be64_to_cpu(block->bb_u.l.bb_owner) != ino) {
+		do_warn(
+_("expected owner inode %" PRIu64 ", got %llu, %s block %" PRIu64 "\n"),
+			ino,
+			(unsigned long long)be64_to_cpu(block->bb_u.l.bb_owner),
+			name, fsbno);
+		return 1;
+	}
+	/* verify block number */
+	if (be64_to_cpu(block->bb_u.l.bb_blkno) !=
+	    XFS_FSB_TO_DADDR(mp, fsbno)) {
+		do_warn(
+_("expected block %" PRIu64 ", got %llu, %s block %" PRIu64 "\n"),
+			XFS_FSB_TO_DADDR(mp, fsbno),
+			(unsigned long long)be64_to_cpu(block->bb_u.l.bb_blkno),
+			name, fsbno);
+		return 1;
+	}
+	/* verify uuid */
+	if (platform_uuid_compare(&block->bb_u.l.bb_uuid,
+				  &mp->m_sb.sb_meta_uuid) != 0) {
+		do_warn(
+_("wrong FS UUID, %s block %" PRIu64 "\n"),
+			name, fsbno);
+		return 1;
+	}
+
+	/*
+	 * Check for btree blocks multiply claimed.  We're going to regenerate
+	 * the rtrmap anyway, so mark the blocks as metadata so they get freed.
+	 */
+	state = get_bmap(agno, agbno);
+	if (!(state == XR_E_UNKNOWN || state == XR_E_INUSE1))  {
+		do_warn(
+_("%s btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
+				name, state, agno, agbno, suspect);
+		suspect++;
+		goto out;
+	}
+	set_bmap(agno, agbno, XR_E_METADATA);
+
+	numrecs = be16_to_cpu(block->bb_numrecs);
+
+	/*
+	 * All realtime rmap btree blocks are freed for a fully empty
+	 * filesystem, thus they are counted towards the free data
+	 * block counter.  The root lives in an inode and is thus not
+	 * counted.
+	 */
+	(*tot)++;
+
+	if (level == 0) {
+		if (numrecs > mp->m_rtrmap_mxr[0])  {
+			numrecs = mp->m_rtrmap_mxr[0];
+			hdr_errors++;
+		}
+		if (isroot == 0 && numrecs < mp->m_rtrmap_mnr[0])  {
+			numrecs = mp->m_rtrmap_mnr[0];
+			hdr_errors++;
+		}
+
+		if (hdr_errors) {
+			do_warn(
+_("bad btree nrecs (%u, min=%u, max=%u) in bt%s block %u/%u\n"),
+				be16_to_cpu(block->bb_numrecs),
+				mp->m_rtrmap_mnr[0], mp->m_rtrmap_mxr[0],
+				name, agno, agbno);
+			suspect++;
+		}
+
+		rp = xfs_rtrmap_rec_addr(block, 1);
+		snprintf(rootname, 256, "%s btree block %u/%u", name, agno, agbno);
+		error = process_rtrmap_reclist(mp, rp, numrecs,
+				&rmap_priv->last_rec, &rmap_priv->high_key,
+				rootname);
+		if (error)
+			suspect++;
+		goto out;
+	}
+
+	/*
+	 * interior record
+	 */
+	pp = xfs_rtrmap_ptr_addr(block, 1, mp->m_rtrmap_mxr[1]);
+
+	if (numrecs > mp->m_rtrmap_mxr[1])  {
+		numrecs = mp->m_rtrmap_mxr[1];
+		hdr_errors++;
+	}
+	if (isroot == 0 && numrecs < mp->m_rtrmap_mnr[1])  {
+		numrecs = mp->m_rtrmap_mnr[1];
+		hdr_errors++;
+	}
+
+	/*
+	 * don't pass bogus tree flag down further if this block
+	 * looked ok.  bail out if two levels in a row look bad.
+	 */
+	if (hdr_errors)  {
+		do_warn(
+_("bad btree nrecs (%u, min=%u, max=%u) in bt%s block %u/%u\n"),
+			be16_to_cpu(block->bb_numrecs),
+			mp->m_rtrmap_mnr[1], mp->m_rtrmap_mxr[1],
+			name, agno, agbno);
+		if (suspect)
+			goto out;
+		suspect++;
+	} else if (suspect) {
+		suspect = 0;
+	}
+
+	/* check the node's high keys */
+	for (i = 0; !isroot && i < numrecs; i++) {
+		kp = xfs_rtrmap_high_key_addr(block, i + 1);
+
+		key.rm_flags = 0;
+		key.rm_startblock = be32_to_cpu(kp->rm_startblock);
+		key.rm_owner = be64_to_cpu(kp->rm_owner);
+		if (libxfs_rmap_irec_offset_unpack(be64_to_cpu(kp->rm_offset),
+				&key)) {
+			/* Look for impossible flags. */
+			do_warn(
+_("invalid flags in key %u of %s btree block %u/%u\n"),
+				i, name, agno, agbno);
+			suspect++;
+			continue;
+		}
+		if (rmap_diffkeys(&key, &rmap_priv->high_key) > 0) {
+			do_warn(
+_("key %d greater than high key of block (%u/%u) in %s tree\n"),
+				i, agno, agbno, name);
+			suspect++;
+		}
+	}
+
+	/* check for in-order keys */
+	for (i = 0; i < numrecs; i++)  {
+		kp = xfs_rtrmap_key_addr(block, i + 1);
+
+		key.rm_flags = 0;
+		key.rm_startblock = be32_to_cpu(kp->rm_startblock);
+		key.rm_owner = be64_to_cpu(kp->rm_owner);
+		if (libxfs_rmap_irec_offset_unpack(be64_to_cpu(kp->rm_offset),
+				&key)) {
+			/* Look for impossible flags. */
+			do_warn(
+_("invalid flags in key %u of %s btree block %u/%u\n"),
+				i, name, agno, agbno);
+			suspect++;
+			continue;
+		}
+		if (i == 0) {
+			oldkey = key;
+			continue;
+		}
+		if (rmap_diffkeys(&oldkey, &key) > 0) {
+			do_warn(
+_("out of order key %u in %s btree block (%u/%u)\n"),
+				i, name, agno, agbno);
+			suspect++;
+		}
+		oldkey = key;
+	}
+
+	for (i = 0; i < numrecs; i++)  {
+		xfs_fsblock_t		pbno = be64_to_cpu(pp[i]);
+
+		/*
+		 * XXX - put sibling detection right here.
+		 * we know our sibling chain is good.  So as we go,
+		 * we check the entry before and after each entry.
+		 * If either of the entries references a different block,
+		 * check the sibling pointer.  If there's a sibling
+		 * pointer mismatch, try and extract as much data
+		 * as possible.
+		 */
+		kp = xfs_rtrmap_high_key_addr(block, i + 1);
+		rmap_priv->high_key.rm_flags = 0;
+		rmap_priv->high_key.rm_startblock =
+				be32_to_cpu(kp->rm_startblock);
+		rmap_priv->high_key.rm_owner =
+				be64_to_cpu(kp->rm_owner);
+		if (libxfs_rmap_irec_offset_unpack(be64_to_cpu(kp->rm_offset),
+				&rmap_priv->high_key)) {
+			/* Look for impossible flags. */
+			do_warn(
+_("invalid flags in high key %u of %s btree block %u/%u\n"),
+				i, name, agno, agbno);
+			suspect++;
+			continue;
+		}
+
+		if (!libxfs_verify_fsbno(mp, pbno)) {
+			do_warn(
+_("bad %s btree ptr 0x%llx in ino %" PRIu64 "\n"),
+			       name, (unsigned long long)pbno, ino);
+			return 1;
+		}
+
+		error = scan_lbtree(pbno, level, scan_rtrmapbt,
+				type, whichfork, ino, tot, nex, blkmapp,
+				bm_cursor, suspect, 0, check_dups, magic,
+				rmap_priv, &xfs_rtrmapbt_buf_ops);
+		if (error) {
+			suspect++;
+			goto out;
+		}
+	}
+
+out:
+	if (hdr_errors || suspect) {
+		rmap_avoid_check();
+		return 1;
+	}
+	return 0;
+}
+
 struct refc_priv {
 	struct xfs_refcount_irec	last_rec;
 	xfs_agblock_t			nr_blocks;
diff --git a/repair/scan.h b/repair/scan.h
index aeaf9f1a7f4..a624c882734 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -66,4 +66,38 @@ scan_ags(
 	struct xfs_mount	*mp,
 	int			scan_threads);
 
+struct rmap_priv {
+	struct aghdr_cnts	*agcnts;
+	struct xfs_rmap_irec	high_key;
+	struct xfs_rmap_irec	last_rec;
+	xfs_agblock_t		nr_blocks;
+};
+
+int
+process_rtrmap_reclist(
+	struct xfs_mount	*mp,
+	struct xfs_rmap_rec	*rp,
+	int			numrecs,
+	struct xfs_rmap_irec	*last_rec,
+	struct xfs_rmap_irec	*high_key,
+	const char		*name);
+
+int scan_rtrmapbt(
+	struct xfs_btree_block	*block,
+	int			level,
+	int			type,
+	int			whichfork,
+	xfs_fsblock_t		bno,
+	xfs_ino_t		ino,
+	xfs_rfsblock_t		*tot,
+	uint64_t		*nex,
+	struct blkmap		**blkmapp,
+	bmap_cursor_t		*bm_cursor,
+	int			suspect,
+	int			isroot,
+	int			check_dups,
+	int			*dirty,
+	uint64_t		magic,
+	void			*priv);
+
 #endif /* _XR_SCAN_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/47] xfs_repair: create a new set of incore rmap information for rt groups
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-27 13:19   ` [PATCH 34/47] xfs_repair: use realtime rmap btree data to check block types Darrick J. Wong
@ 2023-12-27 13:19   ` Darrick J. Wong
  2023-12-27 13:19   ` [PATCH 36/47] xfs_repair: collect relatime reverse-mapping data for refcount/rmap tree rebuilding Darrick J. Wong
                     ` (11 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:19 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a parallel set of "xfs_ag_rmap" structures to cache information
about reverse mappings for the realtime groups.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    3 +
 repair/agbtree.c         |    5 +-
 repair/dinode.c          |    2 -
 repair/rmap.c            |  143 +++++++++++++++++++++++++++++++++++++---------
 repair/rmap.h            |    7 +-
 5 files changed, 126 insertions(+), 34 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 85a4a131c75..e65b4d6fea5 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -284,6 +284,7 @@
 #define xfs_rtsummary_wordcount		libxfs_rtsummary_wordcount
 
 #define xfs_rtfree_extent		libxfs_rtfree_extent
+#define xfs_rtgroup_get			libxfs_rtgroup_get
 #define xfs_rtgroup_put			libxfs_rtgroup_put
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
@@ -292,6 +293,8 @@
 #define xfs_rtrmapbt_maxlevels_ondisk	libxfs_rtrmapbt_maxlevels_ondisk
 #define xfs_rtrmapbt_init_cursor	libxfs_rtrmapbt_init_cursor
 #define xfs_rtrmapbt_maxrecs		libxfs_rtrmapbt_maxrecs
+#define xfs_rtrmapbt_mem_create		libxfs_rtrmapbt_mem_create
+#define xfs_rtrmapbt_mem_cursor		libxfs_rtrmapbt_mem_cursor
 
 #define xfs_sb_from_disk		libxfs_sb_from_disk
 #define xfs_sb_quota_from_disk		libxfs_sb_quota_from_disk
diff --git a/repair/agbtree.c b/repair/agbtree.c
index dccb15f9667..a401c80da38 100644
--- a/repair/agbtree.c
+++ b/repair/agbtree.c
@@ -645,7 +645,7 @@ init_rmapbt_cursor(
 
 	/* Compute how many blocks we'll need. */
 	error = -libxfs_btree_bload_compute_geometry(btr->cur, &btr->bload,
-			rmap_record_count(sc->mp, agno));
+			rmap_record_count(sc->mp, false, agno));
 	if (error)
 		do_error(
 _("Unable to compute rmap btree geometry, error %d.\n"), error);
@@ -662,7 +662,8 @@ build_rmap_tree(
 {
 	int			error;
 
-	error = rmap_init_mem_cursor(sc->mp, NULL, agno, &btr->rmapbt_cursor);
+	error = rmap_init_mem_cursor(sc->mp, NULL, false, agno,
+			&btr->rmapbt_cursor);
 	if (error)
 		do_error(
 _("Insufficient memory to construct rmap cursor.\n"));
diff --git a/repair/dinode.c b/repair/dinode.c
index 23cec00ad9e..41b44e6faad 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -657,7 +657,7 @@ _("illegal state %d in block map %" PRIu64 "\n"),
 			}
 		}
 		if (collect_rmaps && !zap_metadata) /* && !check_dups */
-			rmap_add_rec(mp, ino, whichfork, &irec);
+			rmap_add_rec(mp, ino, whichfork, &irec, false);
 		*tot += irec.br_blockcount;
 	}
 	error = 0;
diff --git a/repair/rmap.c b/repair/rmap.c
index 265199d2117..aa47013baec 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -26,7 +26,7 @@
 # define dbg_printf(f, a...)
 #endif
 
-/* per-AG rmap object anchor */
+/* allocation group (AG or rtgroup) rmap object anchor */
 struct xfs_ag_rmap {
 	struct xfbtree	*ar_xfbtree;		/* rmap observations */
 	struct xfs_slab	*ar_agbtree_rmaps;	/* rmaps for rebuilt ag btrees */
@@ -36,9 +36,17 @@ struct xfs_ag_rmap {
 };
 
 static struct xfs_ag_rmap *ag_rmaps;
+static struct xfs_ag_rmap *rg_rmaps;
 bool rmapbt_suspect;
 static bool refcbt_suspect;
 
+static struct xfs_ag_rmap *rmaps_for_group(bool isrt, unsigned int group)
+{
+	if (isrt)
+		return &rg_rmaps[group];
+	return &ag_rmaps[group];
+}
+
 static inline int rmap_compare(const void *a, const void *b)
 {
 	return libxfs_rmap_compare(a, b);
@@ -76,6 +84,44 @@ rmaps_destroy(
 	xfile_free_buftarg(target);
 }
 
+/* Initialize the in-memory rmap btree for collecting realtime rmap records. */
+STATIC void
+rmaps_init_rt(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_ag_rmap	*ag_rmap)
+{
+	struct xfs_buftarg	*target;
+	char			*descr;
+	unsigned long long	maxbytes;
+	int			error;
+
+	if (!xfs_has_realtime(mp))
+		return;
+
+	/*
+	 * Each rtgroup rmap btree file can consume the entire data device,
+	 * even if the metadata space reservation will be smaller than that.
+	 */
+	maxbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks);
+	descr = kasprintf("xfs_repair (%s): rtgroup %u rmap records",
+			mp->m_fsname, rgno);
+	error = -xfile_alloc_buftarg(mp, descr, maxbytes, &target);
+	kfree(descr);
+	if (error)
+		goto nomem;
+
+	error = -libxfs_rtrmapbt_mem_create(mp, rgno, target,
+			&ag_rmap->ar_xfbtree);
+	if (error)
+		goto nomem;
+
+	return;
+nomem:
+	do_error(
+_("Insufficient memory while allocating realtime reverse mapping btree."));
+}
+
 /* Initialize the in-memory rmap btree for collecting per-AG rmap records. */
 STATIC void
 rmaps_init_ag(
@@ -135,6 +181,13 @@ rmaps_init(
 
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
 		rmaps_init_ag(mp, i, &ag_rmaps[i]);
+
+	rg_rmaps = calloc(mp->m_sb.sb_rgcount, sizeof(struct xfs_ag_rmap));
+	if (!rg_rmaps)
+		do_error(_("couldn't allocate per-rtgroup reverse map roots\n"));
+
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
+		rmaps_init_rt(mp, i, &rg_rmaps[i]);
 }
 
 /*
@@ -149,6 +202,11 @@ rmaps_free(
 	if (!rmap_needs_work(mp))
 		return;
 
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
+		rmaps_destroy(mp, &rg_rmaps[i]);
+	free(rg_rmaps);
+	rg_rmaps = NULL;
+
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
 		rmaps_destroy(mp, &ag_rmaps[i]);
 	free(ag_rmaps);
@@ -184,26 +242,38 @@ int
 rmap_init_mem_cursor(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
+	bool			isrt,
 	xfs_agnumber_t		agno,
 	struct rmap_mem_cur	*rmcur)
 {
 	struct xfbtree		*xfbt;
-	struct xfs_perag	*pag;
+	struct xfs_perag	*pag = NULL;
+	struct xfs_rtgroup	*rtg = NULL;
 	int			error;
 
-	xfbt = ag_rmaps[agno].ar_xfbtree;
+	xfbt = rmaps_for_group(isrt, agno)->ar_xfbtree;
 	error = -xfbtree_head_read_buf(xfbt, tp, &rmcur->mhead_bp);
 	if (error)
 		return error;
 
-	pag = libxfs_perag_get(mp, agno);
-	rmcur->mcur = libxfs_rmapbt_mem_cursor(pag, tp, rmcur->mhead_bp, xfbt);
+	if (isrt) {
+		rtg = libxfs_rtgroup_get(mp, agno);
+		rmcur->mcur = libxfs_rtrmapbt_mem_cursor(rtg, tp,
+				rmcur->mhead_bp, xfbt);
+	} else {
+		pag = libxfs_perag_get(mp, agno);
+		rmcur->mcur = libxfs_rmapbt_mem_cursor(pag, tp,
+				rmcur->mhead_bp, xfbt);
+	}
 
 	error = -libxfs_btree_goto_left_edge(rmcur->mcur);
 	if (error)
 		rmap_free_mem_cursor(tp, rmcur, error);
 
-	libxfs_perag_put(pag);
+	if (pag)
+		libxfs_perag_put(pag);
+	if (rtg)
+		libxfs_rtgroup_put(rtg);
 	return error;
 }
 
@@ -248,6 +318,7 @@ rmap_get_mem_rec(
 static void
 rmap_add_mem_rec(
 	struct xfs_mount	*mp,
+	bool			isrt,
 	xfs_agnumber_t		agno,
 	struct xfs_rmap_irec	*rmap)
 {
@@ -256,12 +327,12 @@ rmap_add_mem_rec(
 	struct xfs_trans	*tp;
 	int			error;
 
-	xfbt = ag_rmaps[agno].ar_xfbtree;
+	xfbt = rmaps_for_group(isrt, agno)->ar_xfbtree;
 	error = -libxfs_trans_alloc_empty(mp, &tp);
 	if (error)
 		do_error(_("allocating tx for in-memory rmap update\n"));
 
-	error = rmap_init_mem_cursor(mp, tp, agno, &rmcur);
+	error = rmap_init_mem_cursor(mp, tp, isrt, agno, &rmcur);
 	if (error)
 		do_error(_("reading in-memory rmap btree head\n"));
 
@@ -286,7 +357,8 @@ rmap_add_rec(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino,
 	int			whichfork,
-	struct xfs_bmbt_irec	*irec)
+	struct xfs_bmbt_irec	*irec,
+	bool			isrt)
 {
 	struct xfs_rmap_irec	rmap;
 	xfs_agnumber_t		agno;
@@ -295,11 +367,19 @@ rmap_add_rec(
 	if (!rmap_needs_work(mp))
 		return;
 
-	agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
-	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agno < mp->m_sb.sb_agcount);
-	ASSERT(agbno + irec->br_blockcount <= mp->m_sb.sb_agblocks);
+	if (isrt) {
+		xfs_rgnumber_t	rgno;
+
+		agbno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
+		agno = rgno;
+		ASSERT(agbno + irec->br_blockcount <= mp->m_sb.sb_rblocks);
+	} else {
+		agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
+		agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
+		ASSERT(agno != NULLAGNUMBER);
+		ASSERT(agno < mp->m_sb.sb_agcount);
+		ASSERT(agbno + irec->br_blockcount <= mp->m_sb.sb_agblocks);
+	}
 	ASSERT(ino != NULLFSINO);
 	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
 
@@ -313,7 +393,7 @@ rmap_add_rec(
 	if (irec->br_state == XFS_EXT_UNWRITTEN)
 		rmap.rm_flags |= XFS_RMAP_UNWRITTEN;
 
-	rmap_add_mem_rec(mp, agno, &rmap);
+	rmap_add_mem_rec(mp, isrt, agno, &rmap);
 }
 
 /* add a raw rmap; these will be merged later */
@@ -340,7 +420,7 @@ __rmap_add_raw_rec(
 	rmap.rm_startblock = agbno;
 	rmap.rm_blockcount = len;
 
-	rmap_add_mem_rec(mp, agno, &rmap);
+	rmap_add_mem_rec(mp, false, agno, &rmap);
 }
 
 /*
@@ -409,6 +489,7 @@ rmap_add_agbtree_mapping(
 		.rm_blockcount	= len,
 	};
 	struct xfs_perag	*pag;
+	struct xfs_ag_rmap	*x;
 
 	if (!rmap_needs_work(mp))
 		return 0;
@@ -417,7 +498,8 @@ rmap_add_agbtree_mapping(
 	assert(libxfs_verify_agbext(pag, agbno, len));
 	libxfs_perag_put(pag);
 
-	return slab_add(ag_rmaps[agno].ar_agbtree_rmaps, &rmap);
+	x = rmaps_for_group(false, agno);
+	return slab_add(x->ar_agbtree_rmaps, &rmap);
 }
 
 static int
@@ -533,7 +615,7 @@ rmap_commit_agbtree_mappings(
 	struct xfs_buf		*agflbp = NULL;
 	struct xfs_trans	*tp;
 	__be32			*agfl_bno, *b;
-	struct xfs_ag_rmap	*ag_rmap = &ag_rmaps[agno];
+	struct xfs_ag_rmap	*ag_rmap = rmaps_for_group(false, agno);
 	struct bitmap		*own_ag_bitmap = NULL;
 	int			error = 0;
 
@@ -796,7 +878,7 @@ refcount_emit(
 	int			error;
 	struct xfs_slab		*rlslab;
 
-	rlslab = ag_rmaps[agno].ar_refcount_items;
+	rlslab = rmaps_for_group(false, agno)->ar_refcount_items;
 	ASSERT(nr_rmaps > 0);
 
 	dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
@@ -930,12 +1012,12 @@ compute_refcounts(
 
 	if (!xfs_has_reflink(mp))
 		return 0;
-	if (ag_rmaps[agno].ar_xfbtree == NULL)
+	if (rmaps_for_group(false, agno)->ar_xfbtree == NULL)
 		return 0;
 
-	nr_rmaps = rmap_record_count(mp, agno);
+	nr_rmaps = rmap_record_count(mp, false, agno);
 
-	error = rmap_init_mem_cursor(mp, NULL, agno, &rmcur);
+	error = rmap_init_mem_cursor(mp, NULL, false, agno, &rmcur);
 	if (error)
 		return error;
 
@@ -1040,16 +1122,17 @@ count_btree_records(
 uint64_t
 rmap_record_count(
 	struct xfs_mount	*mp,
+	bool			isrt,
 	xfs_agnumber_t		agno)
 {
 	struct rmap_mem_cur	rmcur;
 	uint64_t		nr = 0;
 	int			error;
 
-	if (ag_rmaps[agno].ar_xfbtree == NULL)
+	if (rmaps_for_group(isrt, agno)->ar_xfbtree == NULL)
 		return 0;
 
-	error = rmap_init_mem_cursor(mp, NULL, agno, &rmcur);
+	error = rmap_init_mem_cursor(mp, NULL, isrt, agno, &rmcur);
 	if (error)
 		do_error(_("%s while reading in-memory rmap btree\n"),
 				strerror(error));
@@ -1165,7 +1248,7 @@ rmaps_verify_btree(
 	}
 
 	/* Create cursors to rmap structures */
-	error = rmap_init_mem_cursor(mp, NULL, agno, &rm_cur);
+	error = rmap_init_mem_cursor(mp, NULL, false, agno, &rm_cur);
 	if (error) {
 		do_warn(_("Not enough memory to check reverse mappings.\n"));
 		return;
@@ -1485,7 +1568,9 @@ refcount_record_count(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno)
 {
-	return slab_count(ag_rmaps[agno].ar_refcount_items);
+	struct xfs_ag_rmap	*x = rmaps_for_group(false, agno);
+
+	return slab_count(x->ar_refcount_items);
 }
 
 /*
@@ -1496,7 +1581,9 @@ init_refcount_cursor(
 	xfs_agnumber_t		agno,
 	struct xfs_slab_cursor	**cur)
 {
-	return init_slab_cursor(ag_rmaps[agno].ar_refcount_items, NULL, cur);
+	struct xfs_ag_rmap	*x = rmaps_for_group(false, agno);
+
+	return init_slab_cursor(x->ar_refcount_items, NULL, cur);
 }
 
 /*
@@ -1697,7 +1784,7 @@ rmap_store_agflcount(
 	if (!rmap_needs_work(mp))
 		return;
 
-	ag_rmaps[agno].ar_flcount = count;
+	rmaps_for_group(false, agno)->ar_flcount = count;
 }
 
 /* Estimate the size of the ondisk rmapbt from the incore data. */
diff --git a/repair/rmap.h b/repair/rmap.h
index 50268b2f8ca..7a94ed6f90a 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -15,7 +15,7 @@ extern void rmaps_init(struct xfs_mount *);
 extern void rmaps_free(struct xfs_mount *);
 
 void rmap_add_rec(struct xfs_mount *mp, xfs_ino_t ino, int whichfork,
-		struct xfs_bmbt_irec *irec);
+		struct xfs_bmbt_irec *irec, bool realtime);
 void rmap_add_bmbt_rec(struct xfs_mount *mp, xfs_ino_t ino, int whichfork,
 		xfs_fsblock_t fsbno);
 bool rmaps_are_mergeable(struct xfs_rmap_irec *r1, struct xfs_rmap_irec *r2);
@@ -26,7 +26,8 @@ int rmap_add_agbtree_mapping(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_agblock_t agbno, xfs_extlen_t len, uint64_t owner);
 int rmap_commit_agbtree_mappings(struct xfs_mount *mp, xfs_agnumber_t agno);
 
-uint64_t rmap_record_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+uint64_t rmap_record_count(struct xfs_mount *mp, bool isrt,
+		xfs_agnumber_t agno);
 extern void rmap_avoid_check(void);
 void rmaps_verify_btree(struct xfs_mount *mp, xfs_agnumber_t agno);
 
@@ -57,7 +58,7 @@ struct rmap_mem_cur {
 };
 
 int rmap_init_mem_cursor(struct xfs_mount *mp, struct xfs_trans *tp,
-		xfs_agnumber_t agno, struct rmap_mem_cur *rmcur);
+		bool isrt, xfs_agnumber_t agno, struct rmap_mem_cur *rmcur);
 void rmap_free_mem_cursor(struct xfs_trans *tp, struct rmap_mem_cur *rmcur,
 		int error);
 int rmap_get_mem_rec(struct rmap_mem_cur *rmcur, struct xfs_rmap_irec *irec);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/47] xfs_repair: collect relatime reverse-mapping data for refcount/rmap tree rebuilding
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-27 13:19   ` [PATCH 35/47] xfs_repair: create a new set of incore rmap information for rt groups Darrick J. Wong
@ 2023-12-27 13:19   ` Darrick J. Wong
  2023-12-27 13:20   ` [PATCH 37/47] xfs_repair: refactor realtime inode check Darrick J. Wong
                     ` (10 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:19 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Collect reverse-mapping data for realtime files so that we can later
check and rebuild the reference count tree and the reverse mapping
tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/repair/dinode.c b/repair/dinode.c
index 41b44e6faad..d88bd80783c 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -350,6 +350,10 @@ _("inode %" PRIu64 " - bad rt extent overflows - start %" PRIu64 ", "
 	 */
 	*tot += irec->br_blockcount;
 
+	/* Record mapping data for the realtime rmap. */
+	if (collect_rmaps && !zap_metadata && !check_dups)
+		rmap_add_rec(mp, ino, XFS_DATA_FORK, irec, true);
+
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/47] xfs_repair: refactor realtime inode check
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-27 13:19   ` [PATCH 36/47] xfs_repair: collect relatime reverse-mapping data for refcount/rmap tree rebuilding Darrick J. Wong
@ 2023-12-27 13:20   ` Darrick J. Wong
  2023-12-27 13:20   ` [PATCH 38/47] xfs_repair: find and mark the rtrmapbt inodes Darrick J. Wong
                     ` (9 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:20 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor the realtime bitmap and summary checks into a helper function.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |   84 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 39 insertions(+), 45 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index d88bd80783c..450f19eba4f 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -1736,6 +1736,39 @@ check_dinode_mode_format(
 	return 0;	/* invalid modes are checked elsewhere */
 }
 
+static int
+process_check_rt_inode(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dinoc,
+	xfs_ino_t		lino,
+	int			*type,
+	int			*dirty,
+	int			expected_type,
+	const char		*tag)
+{
+	xfs_extnum_t		dnextents = xfs_dfork_data_extents(dinoc);
+
+	if (*type != expected_type) {
+		do_warn(
+_("%s inode %" PRIu64 " has bad type 0x%x, "),
+			tag, lino, dinode_fmt(dinoc));
+		if (!no_modify)  {
+			do_warn(_("resetting to regular file\n"));
+			change_dinode_fmt(dinoc, S_IFREG);
+			*dirty = 1;
+		} else  {
+			do_warn(_("would reset to regular file\n"));
+		}
+	}
+	if (mp->m_sb.sb_rblocks == 0 && dnextents != 0)  {
+		do_warn(
+_("bad # of extents (%" PRIu64 ") for %s inode %" PRIu64 "\n"),
+			dnextents, tag, lino);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  * If inode is a superblock inode, does type check to make sure is it valid.
  * Returns 0 if it's valid, non-zero if it needs to be cleared.
@@ -1749,8 +1782,6 @@ process_check_sb_inodes(
 	int			*type,
 	int			*dirty)
 {
-	xfs_extnum_t		dnextents;
-
 	if (lino == mp->m_sb.sb_rootino) {
 		if (*type != XR_INO_DIR)  {
 			do_warn(_("root inode %" PRIu64 " has bad type 0x%x\n"),
@@ -1792,49 +1823,12 @@ process_check_sb_inodes(
 		}
 		return 0;
 	}
-	dnextents = xfs_dfork_data_extents(dinoc);
-	if (lino == mp->m_sb.sb_rsumino) {
-		if (*type != XR_INO_RTSUM) {
-			do_warn(
-_("realtime summary inode %" PRIu64 " has bad type 0x%x, "),
-				lino, dinode_fmt(dinoc));
-			if (!no_modify)  {
-				do_warn(_("resetting to regular file\n"));
-				change_dinode_fmt(dinoc, S_IFREG);
-				*dirty = 1;
-			} else  {
-				do_warn(_("would reset to regular file\n"));
-			}
-		}
-		if (mp->m_sb.sb_rblocks == 0 && dnextents != 0)  {
-			do_warn(
-_("bad # of extents (%" PRIu64 ") for realtime summary inode %" PRIu64 "\n"),
-				dnextents, lino);
-			return 1;
-		}
-		return 0;
-	}
-	if (lino == mp->m_sb.sb_rbmino) {
-		if (*type != XR_INO_RTBITMAP) {
-			do_warn(
-_("realtime bitmap inode %" PRIu64 " has bad type 0x%x, "),
-				lino, dinode_fmt(dinoc));
-			if (!no_modify)  {
-				do_warn(_("resetting to regular file\n"));
-				change_dinode_fmt(dinoc, S_IFREG);
-				*dirty = 1;
-			} else  {
-				do_warn(_("would reset to regular file\n"));
-			}
-		}
-		if (mp->m_sb.sb_rblocks == 0 && dnextents != 0)  {
-			do_warn(
-_("bad # of extents (%" PRIu64 ") for realtime bitmap inode %" PRIu64 "\n"),
-				dnextents, lino);
-			return 1;
-		}
-		return 0;
-	}
+	if (lino == mp->m_sb.sb_rsumino)
+		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
+				XR_INO_RTSUM, _("realtime summary"));
+	if (lino == mp->m_sb.sb_rbmino)
+		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
+				XR_INO_RTBITMAP, _("realtime bitmap"));
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/47] xfs_repair: find and mark the rtrmapbt inodes
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-27 13:20   ` [PATCH 37/47] xfs_repair: refactor realtime inode check Darrick J. Wong
@ 2023-12-27 13:20   ` Darrick J. Wong
  2023-12-27 13:20   ` [PATCH 39/47] xfs_repair: check existing realtime rmapbt entries against observed rmaps Darrick J. Wong
                     ` (8 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:20 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that we find the realtime rmapbt inodes and mark them
appropriately, just in case we find a rogue inode claiming to be an
rtrmap, or garbage in the metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dino_chunks.c |   13 +++++
 repair/dinode.c      |   42 ++++++++++++++++-
 repair/dir2.c        |    4 ++
 repair/incore.h      |    1 
 repair/rmap.c        |  123 +++++++++++++++++++++++++++++++++++++++++++++++++-
 repair/rmap.h        |    5 ++
 repair/scan.c        |    8 ++-
 7 files changed, 187 insertions(+), 9 deletions(-)


diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index d132556d9dc..b7a5879bf4b 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -15,6 +15,8 @@
 #include "versions.h"
 #include "prefetch.h"
 #include "progress.h"
+#include "slab.h"
+#include "rmap.h"
 
 /*
  * validates inode block or chunk, returns # of good inodes
@@ -1012,6 +1014,17 @@ process_inode_chunk(
 	_("would clear realtime summary inode %" PRIu64 "\n"),
 						ino);
 				}
+			} else if (is_rtrmap_inode(ino)) {
+				rmap_avoid_check(mp);
+				if (!no_modify)  {
+					do_warn(
+	_("cleared realtime rmap inode %" PRIu64 "\n"),
+						ino);
+				} else  {
+					do_warn(
+	_("would clear realtime rmap inode %" PRIu64 "\n"),
+						ino);
+				}
 			} else if (!no_modify)  {
 				do_warn(_("cleared inode %" PRIu64 "\n"),
 					ino);
diff --git a/repair/dinode.c b/repair/dinode.c
index 450f19eba4f..e08b23a9454 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -153,6 +153,9 @@ clear_dinode(xfs_mount_t *mp, struct xfs_dinode *dino, xfs_ino_t ino_num)
 	clear_dinode_core(mp, dino, ino_num);
 	clear_dinode_unlinked(mp, dino);
 
+	if (is_rtrmap_inode(ino_num))
+		rmap_avoid_check(mp);
+
 	/* and clear the forks */
 	memset(XFS_DFORK_DPTR(dino), 0, XFS_LITINO(mp));
 	return;
@@ -823,13 +826,22 @@ process_rtrmap(
 
 	lino = XFS_AGINO_TO_INO(mp, agno, ino);
 
-	/* This rmap btree inode must be a metadata inode. */
+	/*
+	 * This rmap btree inode must be a metadata inode reachable via
+	 * /realtime/$rgno.rmap in the metadata directory tree.
+	 */
 	if (!(dip->di_flags2 & be64_to_cpu(XFS_DIFLAG2_METADIR))) {
 		do_warn(
 _("rtrmap inode %" PRIu64 " not flagged as metadata\n"),
 			lino);
 		return 1;
 	}
+	if (type != XR_INO_RTRMAP) {
+		do_warn(
+_("rtrmap inode %" PRIu64 " was not found in the metadata directory tree\n"),
+			lino);
+		return 1;
+	}
 
 	memset(&priv.high_key, 0xFF, sizeof(priv.high_key));
 	priv.high_key.rm_blockcount = 0;
@@ -867,7 +879,7 @@ _("computed size of rtrmapbt root (%zu bytes) is greater than space in "
 		error = process_rtrmap_reclist(mp, rp, numrecs,
 				&priv.last_rec, NULL, "rtrmapbt root");
 		if (error) {
-			rmap_avoid_check();
+			rmap_avoid_check(mp);
 			return 1;
 		}
 		return 0;
@@ -1829,6 +1841,9 @@ process_check_sb_inodes(
 	if (lino == mp->m_sb.sb_rbmino)
 		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
 				XR_INO_RTBITMAP, _("realtime bitmap"));
+	if (is_rtrmap_inode(lino))
+		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
+				XR_INO_RTRMAP, _("realtime rmap btree"));
 	return 0;
 }
 
@@ -1926,6 +1941,18 @@ _("realtime summary inode %" PRIu64 " has bad size %" PRId64 " (should be %d)\n"
 		}
 		break;
 
+	case XR_INO_RTRMAP:
+		/*
+		 * if we have no rmapbt, any inode claiming
+		 * to be a real-time file is bogus
+		 */
+		if (!xfs_has_rmapbt(mp)) {
+			do_warn(
+_("found inode %" PRIu64 " claiming to be a rtrmapbt file, but rmapbt is disabled\n"), lino);
+			return 1;
+		}
+		break;
+
 	default:
 		break;
 	}
@@ -1954,6 +1981,14 @@ _("bad attr fork offset %d in dev inode %" PRIu64 ", should be %d\n"),
 			return 1;
 		}
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp))) {
+			do_warn(
+_("metadata inode %" PRIu64 " type %d cannot have attr fork\n"),
+				lino, dino->di_format);
+			return 1;
+		}
+		fallthrough;
 	case XFS_DINODE_FMT_LOCAL:
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
@@ -3050,6 +3085,8 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 			type = XR_INO_GQUOTA;
 		else if (lino == mp->m_sb.sb_pquotino)
 			type = XR_INO_PQUOTA;
+		else if (is_rtrmap_inode(lino))
+			type = XR_INO_RTRMAP;
 		else
 			type = XR_INO_DATA;
 		break;
@@ -3155,6 +3192,7 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 		case XR_INO_UQUOTA:
 		case XR_INO_GQUOTA:
 		case XR_INO_PQUOTA:
+		case XR_INO_RTRMAP:
 			/*
 			 * This inode was recognized as being filesystem
 			 * metadata, so preserve the inode and its contents for
diff --git a/repair/dir2.c b/repair/dir2.c
index a7f5018fba2..43229b3cd9b 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -15,6 +15,8 @@
 #include "da_util.h"
 #include "prefetch.h"
 #include "progress.h"
+#include "slab.h"
+#include "rmap.h"
 
 /*
  * Known bad inode list.  These are seen when the leaf and node
@@ -154,6 +156,8 @@ is_meta_ino(
 		reason = _("realtime bitmap");
 	else if (lino == mp->m_sb.sb_rsumino)
 		reason = _("realtime summary");
+	else if (is_rtrmap_inode(lino))
+		reason = _("realtime rmap");
 	else if (lino == mp->m_sb.sb_uquotino)
 		reason = _("user quota");
 	else if (lino == mp->m_sb.sb_gquotino)
diff --git a/repair/incore.h b/repair/incore.h
index 645cc5317c8..6ee7a662930 100644
--- a/repair/incore.h
+++ b/repair/incore.h
@@ -221,6 +221,7 @@ int		count_bcnt_extents(xfs_agnumber_t);
 #define XR_INO_UQUOTA	12		/* user quota inode */
 #define XR_INO_GQUOTA	13		/* group quota inode */
 #define XR_INO_PQUOTA	14		/* project quota inode */
+#define XR_INO_RTRMAP	15		/* realtime rmap */
 
 /* inode allocation tree */
 
diff --git a/repair/rmap.c b/repair/rmap.c
index aa47013baec..b7e7fbe3f47 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -33,6 +33,12 @@ struct xfs_ag_rmap {
 	int		ar_flcount;		/* agfl entries from leftover */
 						/* agbt allocations */
 	struct xfs_slab	*ar_refcount_items;	/* refcount items, p4-5 */
+
+	/*
+	 * inumber of the rmap btree for this rtgroup.  This can be set to
+	 * NULLFSINO to signal to phase 6 to link a new inode into the metadir.
+	 */
+	xfs_ino_t	rg_rmap_ino;
 };
 
 static struct xfs_ag_rmap *ag_rmaps;
@@ -40,6 +46,9 @@ static struct xfs_ag_rmap *rg_rmaps;
 bool rmapbt_suspect;
 static bool refcbt_suspect;
 
+/* Bitmap of rt group rmap inodes reachable via /realtime/$rgno.rmap. */
+static struct bitmap	*rmap_inodes;
+
 static struct xfs_ag_rmap *rmaps_for_group(bool isrt, unsigned int group)
 {
 	if (isrt)
@@ -116,6 +125,7 @@ rmaps_init_rt(
 	if (error)
 		goto nomem;
 
+	ag_rmap->rg_rmap_ino = NULLFSINO;
 	return;
 nomem:
 	do_error(
@@ -163,6 +173,90 @@ rmaps_init_ag(
 _("Insufficient memory while allocating realtime reverse mapping btree."));
 }
 
+static inline int
+set_rtgroup_rmap_inode(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_imeta_path	*path;
+	struct xfs_ag_rmap	*ar = rmaps_for_group(true, rgno);
+	struct xfs_trans	*tp;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	error = -libxfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_trans;
+
+	if (ino == NULLFSINO || bitmap_test(rmap_inodes, ino, 1)) {
+		error = EFSCORRUPTED;
+		goto out_trans;
+	}
+
+	error = bitmap_set(rmap_inodes, ino, 1);
+	if (error)
+		goto out_trans;
+
+	ar->rg_rmap_ino = ino;
+
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
+static void
+discover_rtgroup_inodes(
+	struct xfs_mount	*mp)
+{
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	error = bitmap_alloc(&rmap_inodes);
+	if (error)
+		goto out;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		int err2 = set_rtgroup_rmap_inode(mp, rgno);
+		if (err2 && !error)
+			error = err2;
+	}
+
+out:
+	if (error == EFSCORRUPTED)
+		do_warn(
+ _("corruption in metadata directory tree while discovering rt group inodes\n"));
+	if (error)
+		do_warn(
+ _("couldn't discover rt group inodes, err %d\n"),
+				error);
+}
+
+static inline void
+free_rtmeta_inode_bitmaps(void)
+{
+	bitmap_free(&rmap_inodes);
+}
+
+bool is_rtrmap_inode(xfs_ino_t ino)
+{
+	if (!rmap_inodes)
+		return false;
+	return bitmap_test(rmap_inodes, ino, 1);
+}
+
 /*
  * Initialize per-AG reverse map data.
  */
@@ -188,6 +282,8 @@ rmaps_init(
 
 	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
 		rmaps_init_rt(mp, i, &rg_rmaps[i]);
+
+	discover_rtgroup_inodes(mp);
 }
 
 /*
@@ -202,6 +298,8 @@ rmaps_free(
 	if (!rmap_needs_work(mp))
 		return;
 
+	free_rtmeta_inode_bitmaps();
+
 	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
 		rmaps_destroy(mp, &rg_rmaps[i]);
 	free(rg_rmaps);
@@ -1148,11 +1246,22 @@ rmap_record_count(
 }
 
 /*
- * Disable the refcount btree check.
+ * Disable the rmap btree check.
  */
 void
-rmap_avoid_check(void)
+rmap_avoid_check(
+	struct xfs_mount	*mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		struct xfs_ag_rmap *ar = rmaps_for_group(true, rtg->rtg_rgno);
+
+		ar->rg_rmap_ino = NULLFSINO;
+	}
+
+	bitmap_clear(rmap_inodes, 0, XFS_MAXINUMBER);
 	rmapbt_suspect = true;
 }
 
@@ -1831,3 +1940,13 @@ estimate_refcountbt_blocks(
 	return libxfs_refcountbt_calc_size(mp,
 			slab_count(x->ar_refcount_items));
 }
+
+/* Retrieve the rtrmapbt inode number for a given rtgroup. */
+xfs_ino_t
+rtgroup_rmap_ino(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_ag_rmap	*ar = rmaps_for_group(true, rtg->rtg_rgno);
+
+	return ar->rg_rmap_ino;
+}
diff --git a/repair/rmap.h b/repair/rmap.h
index 7a94ed6f90a..dd55ba3cc29 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -28,7 +28,7 @@ int rmap_commit_agbtree_mappings(struct xfs_mount *mp, xfs_agnumber_t agno);
 
 uint64_t rmap_record_count(struct xfs_mount *mp, bool isrt,
 		xfs_agnumber_t agno);
-extern void rmap_avoid_check(void);
+extern void rmap_avoid_check(struct xfs_mount *mp);
 void rmaps_verify_btree(struct xfs_mount *mp, xfs_agnumber_t agno);
 
 extern int64_t rmap_diffkeys(struct xfs_rmap_irec *kp1,
@@ -63,4 +63,7 @@ void rmap_free_mem_cursor(struct xfs_trans *tp, struct rmap_mem_cur *rmcur,
 		int error);
 int rmap_get_mem_rec(struct rmap_mem_cur *rmcur, struct xfs_rmap_irec *irec);
 
+bool is_rtrmap_inode(xfs_ino_t ino);
+xfs_ino_t rtgroup_rmap_ino(struct xfs_rtgroup *rtg);
+
 #endif /* RMAP_H_ */
diff --git a/repair/scan.c b/repair/scan.c
index 27aeb341bf3..2f414898078 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1357,7 +1357,7 @@ _("out of order key %u in %s btree block (%u/%u)\n"),
 
 out:
 	if (suspect)
-		rmap_avoid_check();
+		rmap_avoid_check(mp);
 }
 
 int
@@ -1737,7 +1737,7 @@ _("bad %s btree ptr 0x%llx in ino %" PRIu64 "\n"),
 
 out:
 	if (hdr_errors || suspect) {
-		rmap_avoid_check();
+		rmap_avoid_check(mp);
 		return 1;
 	}
 	return 0;
@@ -2818,7 +2818,7 @@ validate_agf(
 		if (levels == 0 || levels > mp->m_rmap_maxlevels) {
 			do_warn(_("bad levels %u for rmapbt root, agno %d\n"),
 				levels, agno);
-			rmap_avoid_check();
+			rmap_avoid_check(mp);
 		}
 
 		bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
@@ -2833,7 +2833,7 @@ validate_agf(
 		} else {
 			do_warn(_("bad agbno %u for rmapbt root, agno %d\n"),
 				bno, agno);
-			rmap_avoid_check();
+			rmap_avoid_check(mp);
 		}
 	}
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/47] xfs_repair: check existing realtime rmapbt entries against observed rmaps
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-27 13:20   ` [PATCH 38/47] xfs_repair: find and mark the rtrmapbt inodes Darrick J. Wong
@ 2023-12-27 13:20   ` Darrick J. Wong
  2023-12-27 13:20   ` [PATCH 40/47] xfs_repair: always check realtime file mappings against incore info Darrick J. Wong
                     ` (7 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:20 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Once we've finished collecting reverse mapping observations from the
metadata scan, check those observations against the realtime rmap btree
(particularly if we're in -n mode) to detect rtrmapbt problems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase4.c |   12 +++
 repair/rmap.c   |  262 ++++++++++++++++++++++++++++++++++++++++++++-----------
 repair/rmap.h   |    2 
 3 files changed, 223 insertions(+), 53 deletions(-)


diff --git a/repair/phase4.c b/repair/phase4.c
index cfdea1460e5..b0cb805f30c 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -155,6 +155,16 @@ check_rmap_btrees(
 	rmaps_verify_btree(wq->wq_ctx, agno);
 }
 
+static void
+check_rtrmap_btrees(
+	struct workqueue *wq,
+	xfs_agnumber_t	agno,
+	void		*arg)
+{
+	rmap_add_fixed_rtgroup_rec(wq->wq_ctx, agno);
+	rtrmaps_verify_btree(wq->wq_ctx, agno);
+}
+
 static void
 compute_ag_refcounts(
 	struct workqueue*wq,
@@ -207,6 +217,8 @@ process_rmap_data(
 	create_work_queue(&wq, mp, platform_nproc());
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
 		queue_work(&wq, check_rmap_btrees, i, NULL);
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
+		queue_work(&wq, check_rtrmap_btrees, i, NULL);
 	destroy_work_queue(&wq);
 
 	if (!xfs_has_reflink(mp))
diff --git a/repair/rmap.c b/repair/rmap.c
index b7e7fbe3f47..5ac7188f12e 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -17,6 +17,7 @@
 #include "libxfs/xfile.h"
 #include "libxfs/xfbtree.h"
 #include "rcbag.h"
+#include "prefetch.h"
 
 #undef RMAP_DEBUG
 
@@ -682,6 +683,26 @@ rmap_add_fixed_ag_rec(
 	}
 }
 
+/* Add this realtime group's fixed metadata to the incore data. */
+void
+rmap_add_fixed_rtgroup_rec(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= 0,
+		.rm_blockcount	= mp->m_sb.sb_rextsize,
+		.rm_owner	= XFS_RMAP_OWN_FS,
+		.rm_offset	= 0,
+		.rm_flags	= 0,
+	};
+
+	if (!rmap_needs_work(mp))
+		return;
+
+	rmap_add_mem_rec(mp, true, rgno, &rmap);
+}
+
 /*
  * Copy the per-AG btree reverse-mapping data into the rmapbt.
  *
@@ -1331,62 +1352,25 @@ rmap_is_good(
 #undef NEXTP
 #undef NEXTL
 
-/*
- * Compare the observed reverse mappings against what's in the ag btree.
- */
-void
-rmaps_verify_btree(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
+static int
+rmap_compare_records(
+	struct rmap_mem_cur	*rm_cur,
+	struct xfs_btree_cur	*bt_cur,
+	unsigned int		group)
 {
-	struct rmap_mem_cur	rm_cur;
 	struct xfs_rmap_irec	rm_rec;
 	struct xfs_rmap_irec	tmp;
-	struct xfs_btree_cur	*bt_cur = NULL;
-	struct xfs_buf		*agbp = NULL;
-	struct xfs_perag	*pag = NULL;
 	int			have;
 	int			error;
 
-	if (!xfs_has_rmapbt(mp) || add_rmapbt)
-		return;
-	if (rmapbt_suspect) {
-		if (no_modify && agno == 0)
-			do_warn(_("would rebuild corrupt rmap btrees.\n"));
-		return;
-	}
-
-	/* Create cursors to rmap structures */
-	error = rmap_init_mem_cursor(mp, NULL, false, agno, &rm_cur);
-	if (error) {
-		do_warn(_("Not enough memory to check reverse mappings.\n"));
-		return;
-	}
-
-	pag = libxfs_perag_get(mp, agno);
-	error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
-	if (error) {
-		do_warn(_("Could not read AGF %u to check rmap btree.\n"),
-				agno);
-		goto err_pag;
-	}
-
-	/* Leave the per-ag data "uninitialized" since we rewrite it later */
-	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
-
-	bt_cur = libxfs_rmapbt_init_cursor(mp, NULL, agbp, pag);
-	if (!bt_cur) {
-		do_warn(_("Not enough memory to check reverse mappings.\n"));
-		goto err_agf;
-	}
-
-	while ((error = rmap_get_mem_rec(&rm_cur, &rm_rec)) == 1) {
+	while ((error = rmap_get_mem_rec(rm_cur, &rm_rec)) == 1) {
 		error = rmap_lookup(bt_cur, &rm_rec, &tmp, &have);
 		if (error) {
 			do_warn(
 _("Could not read reverse-mapping record for (%u/%u).\n"),
-					agno, rm_rec.rm_startblock);
-			goto err_cur;
+					group,
+					rm_rec.rm_startblock);
+			return error;
 		}
 
 		/*
@@ -1401,15 +1385,15 @@ _("Could not read reverse-mapping record for (%u/%u).\n"),
 			if (error) {
 				do_warn(
 _("Could not read reverse-mapping record for (%u/%u).\n"),
-						agno, rm_rec.rm_startblock);
-				goto err_cur;
+						group, rm_rec.rm_startblock);
+				return error;
 			}
 		}
 		if (!have) {
 			do_warn(
 _("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64" \
 %s%soff %"PRIu64"\n"),
-				agno, rm_rec.rm_startblock,
+				group, rm_rec.rm_startblock,
 				(rm_rec.rm_flags & XFS_RMAP_UNWRITTEN) ?
 					_("unwritten ") : "",
 				rm_rec.rm_blockcount,
@@ -1422,12 +1406,12 @@ _("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64" \
 			continue;
 		}
 
-		/* Compare each refcount observation against the btree's */
+		/* Compare each rmap observation against the btree's */
 		if (!rmap_is_good(&rm_rec, &tmp)) {
 			do_warn(
 _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
 %"PRIu64"; should be (%u/%u) %slen %u owner %"PRId64" %s%soff %"PRIu64"\n"),
-				agno, tmp.rm_startblock,
+				group, tmp.rm_startblock,
 				(tmp.rm_flags & XFS_RMAP_UNWRITTEN) ?
 					_("unwritten ") : "",
 				tmp.rm_blockcount,
@@ -1437,7 +1421,7 @@ _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
 				(tmp.rm_flags & XFS_RMAP_BMBT_BLOCK) ?
 					_("bmbt ") : "",
 				tmp.rm_offset,
-				agno, rm_rec.rm_startblock,
+				group, rm_rec.rm_startblock,
 				(rm_rec.rm_flags & XFS_RMAP_UNWRITTEN) ?
 					_("unwritten ") : "",
 				rm_rec.rm_blockcount,
@@ -1450,8 +1434,61 @@ _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
 		}
 	}
 
+	return error;
+}
+
+/*
+ * Compare the observed reverse mappings against what's in the ag btree.
+ */
+void
+rmaps_verify_btree(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct rmap_mem_cur	rm_cur;
+	struct xfs_btree_cur	*bt_cur = NULL;
+	struct xfs_buf		*agbp = NULL;
+	struct xfs_perag	*pag = NULL;
+	int			error;
+
+	if (!xfs_has_rmapbt(mp) || add_rmapbt)
+		return;
+	if (rmapbt_suspect) {
+		if (no_modify && agno == 0)
+			do_warn(_("would rebuild corrupt rmap btrees.\n"));
+		return;
+	}
+
+	/* Create cursors to rmap structures */
+	error = rmap_init_mem_cursor(mp, NULL, false, agno, &rm_cur);
+	if (error) {
+		do_warn(_("Not enough memory to check reverse mappings.\n"));
+		return;
+	}
+
+	pag = libxfs_perag_get(mp, agno);
+	error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
+	if (error) {
+		do_warn(_("Could not read AGF %u to check rmap btree.\n"),
+				agno);
+		goto err_pag;
+	}
+
+	/* Leave the per-ag data "uninitialized" since we rewrite it later */
+	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+
+	bt_cur = libxfs_rmapbt_init_cursor(mp, NULL, agbp, pag);
+	if (!bt_cur) {
+		do_warn(_("Not enough memory to check reverse mappings.\n"));
+		goto err_agf;
+	}
+
+	error = rmap_compare_records(&rm_cur, bt_cur, agno);
+	if (error)
+		goto err_cur;
+
 err_cur:
-	libxfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+	libxfs_btree_del_cursor(bt_cur, error);
 err_agf:
 	libxfs_buf_relse(agbp);
 err_pag:
@@ -1459,6 +1496,125 @@ _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
 	rmap_free_mem_cursor(NULL, &rm_cur, error);
 }
 
+/*
+ * Thread-safe version of xfs_imeta_iget.
+ *
+ * In the kernel, xfs_imeta_iget requires a transaction so that the untrusted
+ * lookup will not livelock the mount process if the inobt contains a cycle.
+ * However, the userspace buffer cache only locks buffers if it's told to.
+ * That only happens when prefetch is enabled.
+ *
+ * Depending on allocation patterns, realtime metadata inodes can share the
+ * same inode cluster buffer.  We don't want libxfs_trans_bjoin in racing iget
+ * calls to corrupt the incore buffer state, so we impose our own lock here.
+ * Evidently support orgs will sometimes use no-prefetch lockless mode as a
+ * last resort if repair gets stuck on a buffer lock elsewhere.
+ */
+static inline int
+threadsafe_imeta_iget(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp)
+{
+	static pthread_mutex_t	lock = PTHREAD_MUTEX_INITIALIZER;
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	if (do_prefetch) {
+		error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, ipp);
+	} else {
+		pthread_mutex_lock(&lock);
+		error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, ipp);
+		pthread_mutex_unlock(&lock);
+	}
+	libxfs_trans_cancel(tp);
+
+	return error;
+}
+
+/*
+ * Compare the observed reverse mappings against what's in the rtgroup btree.
+ */
+void
+rtrmaps_verify_btree(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct rmap_mem_cur	rm_cur;
+	struct xfs_btree_cur	*bt_cur = NULL;
+	struct xfs_rtgroup	*rtg = NULL;
+	struct xfs_ag_rmap	*ar = rmaps_for_group(true, rgno);
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	if (!xfs_has_rmapbt(mp) || add_rmapbt)
+		return;
+	if (rmapbt_suspect) {
+		if (no_modify && rgno == 0)
+			do_warn(_("would rebuild corrupt rmap btrees.\n"));
+		return;
+	}
+
+	/* Create cursors to rmap structures */
+	error = rmap_init_mem_cursor(mp, NULL, true, rgno, &rm_cur);
+	if (error) {
+		do_warn(_("Not enough memory to check reverse mappings.\n"));
+		return;
+	}
+
+	rtg = libxfs_rtgroup_get(mp, rgno);
+	if (!rtg) {
+		do_warn(_("Could not load rtgroup %u.\n"), rgno);
+		goto err_rcur;
+	}
+
+	error = threadsafe_imeta_iget(mp, ar->rg_rmap_ino, &ip);
+	if (error) {
+		do_warn(
+_("Could not load rtgroup %u rmap inode, error %d.\n"),
+				rgno, error);
+		goto err_rtg;
+	}
+
+	if (ip->i_df.if_format != XFS_DINODE_FMT_RMAP) {
+		do_warn(
+_("rtgroup %u rmap inode has wrong format 0x%x, expected 0x%x\n"),
+				rgno, ip->i_df.if_format,
+				XFS_DINODE_FMT_RMAP);
+		goto err_ino;
+	}
+
+	if (xfs_inode_has_attr_fork(ip) &&
+	    !(xfs_has_metadir(mp) && xfs_has_parent(mp))) {
+		do_warn(
+_("rtgroup %u rmap inode should not have extended attributes\n"), rgno);
+		goto err_ino;
+	}
+
+	bt_cur = libxfs_rtrmapbt_init_cursor(mp, NULL, rtg, ip);
+	if (!bt_cur) {
+		do_warn(_("Not enough memory to check reverse mappings.\n"));
+		goto err_ino;
+	}
+
+	error = rmap_compare_records(&rm_cur, bt_cur, rgno);
+	if (error)
+		goto err_cur;
+
+err_cur:
+	libxfs_btree_del_cursor(bt_cur, error);
+err_ino:
+	libxfs_imeta_irele(ip);
+err_rtg:
+	libxfs_rtgroup_put(rtg);
+err_rcur:
+	rmap_free_mem_cursor(NULL, &rm_cur, error);
+}
+
 /*
  * Compare the key fields of two rmap records -- positive if key1 > key2,
  * negative if key1 < key2, and zero if equal.
diff --git a/repair/rmap.h b/repair/rmap.h
index dd55ba3cc29..dcd834ef242 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -21,6 +21,7 @@ void rmap_add_bmbt_rec(struct xfs_mount *mp, xfs_ino_t ino, int whichfork,
 bool rmaps_are_mergeable(struct xfs_rmap_irec *r1, struct xfs_rmap_irec *r2);
 
 void rmap_add_fixed_ag_rec(struct xfs_mount *mp, xfs_agnumber_t agno);
+void rmap_add_fixed_rtgroup_rec(struct xfs_mount *mp, xfs_rgnumber_t rgno);
 
 int rmap_add_agbtree_mapping(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_agblock_t agbno, xfs_extlen_t len, uint64_t owner);
@@ -30,6 +31,7 @@ uint64_t rmap_record_count(struct xfs_mount *mp, bool isrt,
 		xfs_agnumber_t agno);
 extern void rmap_avoid_check(struct xfs_mount *mp);
 void rmaps_verify_btree(struct xfs_mount *mp, xfs_agnumber_t agno);
+void rtrmaps_verify_btree(struct xfs_mount *mp, xfs_rgnumber_t rgno);
 
 extern int64_t rmap_diffkeys(struct xfs_rmap_irec *kp1,
 		struct xfs_rmap_irec *kp2);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 40/47] xfs_repair: always check realtime file mappings against incore info
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (38 preceding siblings ...)
  2023-12-27 13:20   ` [PATCH 39/47] xfs_repair: check existing realtime rmapbt entries against observed rmaps Darrick J. Wong
@ 2023-12-27 13:20   ` Darrick J. Wong
  2023-12-27 13:21   ` [PATCH 41/47] xfs_repair: rebuild the realtime rmap btree Darrick J. Wong
                     ` (6 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:20 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Curiously, the xfs_repair code that processes data fork mappings of
realtime files doesn't actually compare the mappings against the incore
state map during the !check_dups phase (aka phase 3).  As a result, we
lose the opportunity to clear damaged realtime data forks before we get
to crosslinked file checking in phase 4, which results in ondisk
metadata errors calling do_error, which aborts repair.

Split the process_rt_rec_state code into two functions: one to check the
mapping, and another to update the incore state.  The first one can be
called to help us decide if we're going to zap the fork, and the second
one updates the incore state if we decide to keep the fork.  We already
do this for regular data files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 8 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index e08b23a9454..a6713a7bc6b 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -219,7 +219,7 @@ _("data fork in rt ino %" PRIu64 " claims dup rt extent,"
 	return 0;
 }
 
-static int
+static void
 process_rt_rec_state(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino,
@@ -263,11 +263,78 @@ _("data fork in rt inode %" PRIu64 " found invalid rt extent %"PRIu64" state %d
 			set_rtbmap(ext, zap_metadata ? XR_E_METADATA :
 						       XR_E_INUSE);
 			break;
+		case XR_E_BAD_STATE:
+			do_error(
+_("bad state in rt extent map %" PRIu64 "\n"),
+				ext);
 		case XR_E_METADATA:
+		case XR_E_FS_MAP:
+		case XR_E_INO:
+		case XR_E_INUSE_FS:
+			break;
+		case XR_E_INUSE:
+		case XR_E_MULT:
+			set_rtbmap(ext, XR_E_MULT);
+			break;
+		case XR_E_FREE1:
+		default:
 			do_error(
+_("illegal state %d in rt extent %" PRIu64 "\n"),
+				state, ext);
+		}
+		b += mp->m_sb.sb_rextsize;
+	} while (b < irec->br_startblock + irec->br_blockcount);
+}
+
+/*
+ * Checks the realtime file's data mapping against in-core extent info, and
+ * complains if there are discrepancies.  Returns 0 if good, 1 if bad.
+ */
+static int
+check_rt_rec_state(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	struct xfs_bmbt_irec	*irec)
+{
+	xfs_fsblock_t		b = irec->br_startblock;
+	xfs_rtblock_t		ext;
+	int			state;
+
+	do {
+		ext = (xfs_rtblock_t)b / mp->m_sb.sb_rextsize;
+		state = get_rtbmap(ext);
+
+		if ((b % mp->m_sb.sb_rextsize) != 0) {
+			/*
+			 * We are midway through a partially written extent.
+			 * If we don't find the state that gets set in the
+			 * other clause of this loop body, then we have a
+			 * partially *mapped* rt extent and should complain.
+			 */
+			if (state != XR_E_INUSE && state != XR_E_FREE) {
+				do_warn(
+_("data fork in rt inode %" PRIu64 " found invalid rt extent %"PRIu64" state %d at rt block %"PRIu64"\n"),
+					ino, ext, state, b);
+				return 1;
+			}
+
+			b = roundup(b, mp->m_sb.sb_rextsize);
+			continue;
+		}
+
+		/*
+		 * This is the start of an rt extent.  Complain if there are
+		 * conflicting states.  We'll set the state elsewhere.
+		 */
+		switch (state)  {
+		case XR_E_FREE:
+		case XR_E_UNKNOWN:
+			break;
+		case XR_E_METADATA:
+			do_warn(
 _("data fork in rt inode %" PRIu64 " found metadata file block %" PRIu64 " in rt bmap\n"),
 				ino, ext);
-			break;
+			return 1;
 		case XR_E_BAD_STATE:
 			do_error(
 _("bad state in rt extent map %" PRIu64 "\n"),
@@ -275,12 +342,12 @@ _("bad state in rt extent map %" PRIu64 "\n"),
 		case XR_E_FS_MAP:
 		case XR_E_INO:
 		case XR_E_INUSE_FS:
-			do_error(
+			do_warn(
 _("data fork in rt inode %" PRIu64 " found rt metadata extent %" PRIu64 " in rt bmap\n"),
 				ino, ext);
+			return 1;
 		case XR_E_INUSE:
 		case XR_E_MULT:
-			set_rtbmap(ext, XR_E_MULT);
 			do_warn(
 _("data fork in rt inode %" PRIu64 " claims used rt extent %" PRIu64 "\n"),
 				ino, b);
@@ -341,13 +408,18 @@ _("inode %" PRIu64 " - bad rt extent overflows - start %" PRIu64 ", "
 		return 1;
 	}
 
-	if (check_dups)
-		bad = process_rt_rec_dups(mp, ino, irec);
-	else
-		bad = process_rt_rec_state(mp, ino, zap_metadata, irec);
+	bad = check_rt_rec_state(mp, ino, irec);
 	if (bad)
 		return bad;
 
+	if (check_dups) {
+		bad = process_rt_rec_dups(mp, ino, irec);
+		if (bad)
+			return bad;
+	} else {
+		process_rt_rec_state(mp, ino, zap_metadata, irec);
+	}
+
 	/*
 	 * bump up the block counter
 	 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 41/47] xfs_repair: rebuild the realtime rmap btree
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (39 preceding siblings ...)
  2023-12-27 13:20   ` [PATCH 40/47] xfs_repair: always check realtime file mappings against incore info Darrick J. Wong
@ 2023-12-27 13:21   ` Darrick J. Wong
  2023-12-27 13:21   ` [PATCH 42/47] xfs_repair: check for global free space concerns with default btree slack levels Darrick J. Wong
                     ` (5 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:21 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rebuild the realtime rmap btree file from the reverse mapping records we
gathered from walking the inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    5 +
 repair/Makefile          |    1 
 repair/bulkload.c        |   41 +++++++
 repair/bulkload.h        |    2 
 repair/phase6.c          |  155 +++++++++++++++++++++++++++
 repair/rmap.c            |   26 +++++
 repair/rmap.h            |    3 +
 repair/rtrmap_repair.c   |  261 ++++++++++++++++++++++++++++++++++++++++++++++
 repair/xfs_repair.c      |    8 +
 9 files changed, 500 insertions(+), 2 deletions(-)
 create mode 100644 repair/rtrmap_repair.c


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index e65b4d6fea5..b5bb6c39928 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -271,6 +271,7 @@
 #define xfs_rmap_irec_offset_unpack	libxfs_rmap_irec_offset_unpack
 #define xfs_rmap_lookup_le		libxfs_rmap_lookup_le
 #define xfs_rmap_lookup_le_range	libxfs_rmap_lookup_le_range
+#define xfs_rmap_map_extent		libxfs_rmap_map_extent
 #define xfs_rmap_map_raw		libxfs_rmap_map_raw
 #define xfs_rmap_query_all		libxfs_rmap_query_all
 #define xfs_rmap_query_range		libxfs_rmap_query_range
@@ -288,6 +289,9 @@
 #define xfs_rtgroup_put			libxfs_rtgroup_put
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
+#define xfs_rtrmapbt_calc_size		libxfs_rtrmapbt_calc_size
+#define xfs_rtrmapbt_commit_staged_btree	libxfs_rtrmapbt_commit_staged_btree
+#define xfs_rtrmapbt_create		libxfs_rtrmapbt_create
 #define xfs_rtrmapbt_create_path	libxfs_rtrmapbt_create_path
 #define xfs_rtrmapbt_droot_maxrecs	libxfs_rtrmapbt_droot_maxrecs
 #define xfs_rtrmapbt_maxlevels_ondisk	libxfs_rtrmapbt_maxlevels_ondisk
@@ -295,6 +299,7 @@
 #define xfs_rtrmapbt_maxrecs		libxfs_rtrmapbt_maxrecs
 #define xfs_rtrmapbt_mem_create		libxfs_rtrmapbt_mem_create
 #define xfs_rtrmapbt_mem_cursor		libxfs_rtrmapbt_mem_cursor
+#define xfs_rtrmapbt_stage_cursor	libxfs_rtrmapbt_stage_cursor
 
 #define xfs_sb_from_disk		libxfs_sb_from_disk
 #define xfs_sb_quota_from_disk		libxfs_sb_quota_from_disk
diff --git a/repair/Makefile b/repair/Makefile
index 1f72c811056..5bec8154829 100644
--- a/repair/Makefile
+++ b/repair/Makefile
@@ -75,6 +75,7 @@ CFILES = \
 	rcbag.c \
 	rmap.c \
 	rt.c \
+	rtrmap_repair.c \
 	sb.c \
 	scan.c \
 	slab.c \
diff --git a/repair/bulkload.c b/repair/bulkload.c
index e9c52afd23c..819639ae343 100644
--- a/repair/bulkload.c
+++ b/repair/bulkload.c
@@ -364,3 +364,44 @@ bulkload_estimate_ag_slack(
 	if (bload->node_slack < 0)
 		bload->node_slack = 2;
 }
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us.  However, there are some
+ * exceptions to this rule:
+ *
+ * (1) If someone turned one of the debug knobs.
+ * (2) The FS has less than ~9% space free.
+ *
+ * Note that we actually use 3/32 for the comparison to avoid division.
+ */
+void
+bulkload_estimate_inode_slack(
+	struct xfs_mount	*mp,
+	struct xfs_btree_bload	*bload,
+	unsigned long long	free)
+{
+	/*
+	 * The global values are set to -1 (i.e. take the bload defaults)
+	 * unless someone has set them otherwise, so we just pull the values
+	 * here.
+	 */
+	bload->leaf_slack = bload_leaf_slack;
+	bload->node_slack = bload_node_slack;
+
+	/* No further changes if there's more than 3/32ths space left. */
+	if (free >= ((mp->m_sb.sb_dblocks * 3) >> 5))
+		return;
+
+	/*
+	 * We're low on space; load the btrees as tightly as possible.  Leave
+	 * a couple of open slots in each btree block so that we don't end up
+	 * splitting the btrees like crazy right after mount.
+	 */
+	if (bload->leaf_slack < 0)
+		bload->leaf_slack = 2;
+	if (bload->node_slack < 0)
+		bload->node_slack = 2;
+}
diff --git a/repair/bulkload.h b/repair/bulkload.h
index a88aafaa678..842121b1519 100644
--- a/repair/bulkload.h
+++ b/repair/bulkload.h
@@ -78,5 +78,7 @@ void bulkload_cancel(struct bulkload *bkl);
 int bulkload_commit(struct bulkload *bkl);
 void bulkload_estimate_ag_slack(struct repair_ctx *sc,
 		struct xfs_btree_bload *bload, unsigned int free);
+void bulkload_estimate_inode_slack(struct xfs_mount *mp,
+		struct xfs_btree_bload *bload, unsigned long long free);
 
 #endif /* __XFS_REPAIR_BULKLOAD_H__ */
diff --git a/repair/phase6.c b/repair/phase6.c
index 63a2768d9c6..4c387557c31 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -19,6 +19,8 @@
 #include "progress.h"
 #include "versions.h"
 #include "repair/pptr.h"
+#include "slab.h"
+#include "rmap.h"
 
 static xfs_ino_t		orphanage_ino;
 
@@ -1072,6 +1074,136 @@ mk_rsumino(
 	libxfs_irele(ip);
 }
 
+static void
+ensure_rtgroup_rmapbt(
+	struct xfs_rtgroup	*rtg,
+	xfs_filblks_t		est_fdblocks)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return;
+
+	ino = rtgroup_rmap_ino(rtg);
+	if (no_modify) {
+		if (ino == NULLFSINO)
+			do_warn(_("would reset rtgroup %u rmap btree\n"),
+					rtg->rtg_rgno);
+		return;
+	}
+
+	if (ino == NULLFSINO)
+		do_warn(_("resetting rtgroup %u rmap btree\n"),
+				rtg->rtg_rgno);
+
+	error = -libxfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		do_error(
+ _("Couldn't create rtgroup %u rmap file path, err %d\n"),
+				rtg->rtg_rgno, error);
+
+	error = ensure_imeta_dirpath(mp, path);
+	if (error)
+		do_error(
+ _("Couldn't create rtgroup %u metadata directory, error %d\n"),
+				rtg->rtg_rgno, error);
+
+	if (ino != NULLFSINO) {
+		struct xfs_trans	*tp;
+
+		/*
+		 * We're still hanging on to our old inode pointer, so grab it
+		 * and reconnect it to the metadata directory tree.  If it
+		 * can't be grabbed, create a new rtrmap file.
+		 */
+		error = -libxfs_trans_alloc_empty(mp, &tp);
+		if (error)
+			do_error(
+ _("Couldn't allocate transaction to iget rtgroup %u rmap inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno, (unsigned long long)ino,
+					error);
+		error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+		libxfs_trans_cancel(tp);
+		if (error) {
+			do_warn(
+ _("Couldn't iget rtgroup %u rmap inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno, (unsigned long long)ino,
+					error);
+			goto zap;
+		}
+
+		/*
+		 * Since we're reattaching this file to the metadata directory
+		 * tree, try to remove all the parent pointers that might be
+		 * attached.
+		 */
+		try_erase_parent_ptrs(ip);
+
+		error = -libxfs_imeta_start_link(mp, path, ip, &upd);
+		if (error)
+			do_error(
+ _("Couldn't grab resources to reconnect rtgroup %u rmapbt, error %d\n"),
+					rtg->rtg_rgno, error);
+
+		error = -libxfs_imeta_link(&upd);
+		if (error)
+			do_error(
+ _("Failed to link rtgroup %u rmapbt inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno,
+					(unsigned long long)ip->i_ino,
+					error);
+
+		/* Reset the link count to something sane. */
+		set_nlink(VFS_I(ip), 1);
+		ip->i_df.if_format = XFS_DINODE_FMT_RMAP;
+		libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE);
+	} else {
+zap:
+		/*
+		 * The rtrmap inode was bad or gone, so just make a new one
+		 * and give our reference to the rtgroup structure.
+		 */
+		error = -libxfs_imeta_start_create(mp, path, &upd);
+		if (error)
+			do_error(
+ _("Couldn't grab resources to recreate rtgroup %u rmapbt, error %d\n"),
+					rtg->rtg_rgno, error);
+
+		error = -libxfs_rtrmapbt_create(&upd, &ip);
+		if (error)
+			do_error(
+ _("Couldn't create rtgroup %u rmap inode, error %d\n"),
+					rtg->rtg_rgno, error);
+	}
+
+	/* Mark the inode in use. */
+	mark_ino_inuse(mp, ip->i_ino, S_IFREG, upd.dp->i_ino);
+	mark_ino_metadata(mp, ip->i_ino);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		do_error(
+ _("Couldn't commit new rtgroup %u rmap inode %llu, error %d\n"),
+				rtg->rtg_rgno,
+				(unsigned long long)ip->i_ino,
+				error);
+
+	/* Copy our incore rmap data to the ondisk rmap inode. */
+	error = populate_rtgroup_rmapbt(rtg, ip, est_fdblocks);
+	if (error)
+		do_error(
+ _("rtgroup %u rmap btree could not be rebuilt, error %d\n"),
+				rtg->rtg_rgno, error);
+
+	libxfs_imeta_free_path(path);
+	libxfs_imeta_irele(ip);
+}
+
 /* Initialize a root directory. */
 static int
 init_fs_root_dir(
@@ -3838,6 +3970,27 @@ traverse_ags(
 	do_inode_prefetch(mp, ag_stride, traverse_function, false, true);
 }
 
+static void
+reset_rt_metadata_inodes(
+	struct xfs_mount	*mp)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_filblks_t		metadata_blocks = 0;
+	xfs_filblks_t		est_fdblocks = 0;
+	xfs_rgnumber_t		rgno;
+
+	/* Estimate how much free space will be left after building btrees */
+	for_each_rtgroup(mp, rgno, rtg) {
+		metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+	}
+	if (mp->m_sb.sb_fdblocks > metadata_blocks)
+		est_fdblocks = mp->m_sb.sb_fdblocks - metadata_blocks;
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		ensure_rtgroup_rmapbt(rtg, est_fdblocks);
+	}
+}
+
 void
 phase6(xfs_mount_t *mp)
 {
@@ -3903,6 +4056,8 @@ phase6(xfs_mount_t *mp)
 		}
 	}
 
+	reset_rt_metadata_inodes(mp);
+
 	if (!no_modify)  {
 		do_log(
 _("        - resetting contents of realtime bitmap and summary inodes\n"));
diff --git a/repair/rmap.c b/repair/rmap.c
index 5ac7188f12e..1312a0dde34 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -2106,3 +2106,29 @@ rtgroup_rmap_ino(
 
 	return ar->rg_rmap_ino;
 }
+
+/* Estimate the size of the ondisk rtrmapbt from the incore tree. */
+xfs_filblks_t
+estimate_rtrmapbt_blocks(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_ag_rmap	*x;
+	unsigned long long	nr_recs;
+
+	if (!rmap_needs_work(mp) || !xfs_has_rtrmapbt(mp))
+		return 0;
+
+	x = &rg_rmaps[rtg->rtg_rgno];
+	if (!x->ar_xfbtree)
+		return 0;
+
+	/*
+	 * Overestimate the amount of space needed by pretending that every
+	 * byte in the incore tree is used to store rtrmapbt records.  This
+	 * means we can use SEEK_DATA/HOLE on the xfile, which is faster than
+	 * walking the entire btree.
+	 */
+	nr_recs = xfbtree_bytes(x->ar_xfbtree) / sizeof(struct xfs_rmap_rec);
+	return libxfs_rtrmapbt_calc_size(mp, nr_recs);
+}
diff --git a/repair/rmap.h b/repair/rmap.h
index dcd834ef242..1f99606a455 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -67,5 +67,8 @@ int rmap_get_mem_rec(struct rmap_mem_cur *rmcur, struct xfs_rmap_irec *irec);
 
 bool is_rtrmap_inode(xfs_ino_t ino);
 xfs_ino_t rtgroup_rmap_ino(struct xfs_rtgroup *rtg);
+int populate_rtgroup_rmapbt(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		xfs_filblks_t fdblocks);
+xfs_filblks_t estimate_rtrmapbt_blocks(struct xfs_rtgroup *rtg);
 
 #endif /* RMAP_H_ */
diff --git a/repair/rtrmap_repair.c b/repair/rtrmap_repair.c
new file mode 100644
index 00000000000..1c9df17d205
--- /dev/null
+++ b/repair/rtrmap_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2019-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include <libxfs.h>
+#include "btree.h"
+#include "err_protos.h"
+#include "libxlog.h"
+#include "incore.h"
+#include "globals.h"
+#include "dinode.h"
+#include "slab.h"
+#include "rmap.h"
+#include "bulkload.h"
+
+/* Ported routines from fs/xfs/scrub/rtrmap_repair.c */
+
+/*
+ * Realtime Reverse Mapping (RTRMAPBT) Repair
+ * ==========================================
+ *
+ * Gather all the rmap records for the inode and fork we're fixing, reset the
+ * incore fork, then recreate the btree.
+ */
+struct xrep_rtrmap {
+	struct rmap_mem_cur	btree_cursor;
+
+	/* New fork. */
+	struct bulkload		new_fork_info;
+	struct xfs_btree_bload	rtrmap_bload;
+
+	struct repair_ctx	*sc;
+	struct xfs_rtgroup	*rtg;
+
+	/* Estimated free space after building all rt btrees */
+	xfs_filblks_t		est_fdblocks;
+};
+
+/* Retrieve rtrmapbt data for bulk load. */
+STATIC int
+xrep_rtrmap_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xrep_rtrmap	*rr = priv;
+	union xfs_btree_rec	*block_rec;
+	unsigned int		loaded;
+	int			ret;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		ret = rmap_get_mem_rec(&rr->btree_cursor, &cur->bc_rec.r);
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			do_error(
+ _("ran out of records while rebuilding rt rmap btree\n"));
+
+		block_rec = libxfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rtrmap	*rr = priv;
+
+	return bulkload_claim_block(cur, &rr->new_fork_info, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrmap_iroot_size(
+	struct xfs_btree_cur	*cur,
+	unsigned int		level,
+	unsigned int		nr_this_level,
+	void			*priv)
+{
+	return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
+}
+
+/* Reserve new btree blocks and bulk load all the rtrmap records. */
+STATIC int
+xrep_rtrmap_btree_load(
+	struct xrep_rtrmap	*rr,
+	struct xfs_btree_cur	*rtrmap_cur)
+{
+	struct repair_ctx	*sc = rr->sc;
+	int			error;
+
+	rr->rtrmap_bload.get_records = xrep_rtrmap_get_records;
+	rr->rtrmap_bload.claim_block = xrep_rtrmap_claim_block;
+	rr->rtrmap_bload.iroot_size = xrep_rtrmap_iroot_size;
+	bulkload_estimate_inode_slack(sc->mp, &rr->rtrmap_bload,
+			rr->est_fdblocks);
+
+	/* Compute how many blocks we'll need. */
+	error = -libxfs_btree_bload_compute_geometry(rtrmap_cur,
+			&rr->rtrmap_bload,
+			rmap_record_count(sc->mp, true, rr->rtg->rtg_rgno));
+	if (error)
+		return error;
+
+	/*
+	 * Guess how many blocks we're going to need to rebuild an entire rtrmap
+	 * from the number of extents we found, and pump up our transaction to
+	 * have sufficient block reservation.
+	 */
+	error = -libxfs_trans_reserve_more(sc->tp, rr->rtrmap_bload.nr_blocks,
+			0);
+	if (error)
+		return error;
+
+	/*
+	 * Reserve the space we'll need for the new btree.  Drop the cursor
+	 * while we do this because that can roll the transaction and cursors
+	 * can't handle that.
+	 */
+	error = bulkload_alloc_file_blocks(&rr->new_fork_info,
+			rr->rtrmap_bload.nr_blocks);
+	if (error)
+		return error;
+
+	/* Add all observed rtrmap records. */
+	error = rmap_init_mem_cursor(rr->sc->mp, sc->tp, true,
+			rr->rtg->rtg_rgno, &rr->btree_cursor);
+	if (error)
+		return error;
+	error = -libxfs_btree_bload(rtrmap_cur, &rr->rtrmap_bload, rr);
+	rmap_free_mem_cursor(sc->tp, &rr->btree_cursor, error);
+	return error;
+}
+
+/* Update the inode counters. */
+STATIC int
+xrep_rtrmap_reset_counters(
+	struct xrep_rtrmap	*rr)
+{
+	struct repair_ctx	*sc = rr->sc;
+
+	/*
+	 * Update the inode block counts to reflect the btree we just
+	 * generated.
+	 */
+	sc->ip->i_nblocks = rr->new_fork_info.ifake.if_blocks;
+	libxfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+	/* Quotas don't exist so we're done. */
+	return 0;
+}
+
+/*
+ * Use the collected rmap information to stage a new rt rmap btree.  If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed.
+ */
+static int
+xrep_rtrmap_build_new_tree(
+	struct xrep_rtrmap	*rr)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_btree_cur	*cur;
+	struct repair_ctx	*sc = rr->sc;
+	struct xbtree_ifakeroot	*ifake = &rr->new_fork_info.ifake;
+	int			error;
+
+	/*
+	 * Prepare to construct the new fork by initializing the new btree
+	 * structure and creating a fake ifork in the ifakeroot structure.
+	 */
+	libxfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+	bulkload_init_inode(&rr->new_fork_info, sc, XFS_DATA_FORK, &oinfo);
+	cur = libxfs_rtrmapbt_stage_cursor(sc->mp, rr->rtg, sc->ip, ifake);
+
+	/*
+	 * Figure out the size and format of the new fork, then fill it with
+	 * all the rtrmap records we've found.  Join the inode to the
+	 * transaction so that we can roll the transaction while holding the
+	 * inode locked.
+	 */
+	libxfs_trans_ijoin(sc->tp, sc->ip, 0);
+	ifake->if_fork->if_format = XFS_DINODE_FMT_RMAP;
+	error = xrep_rtrmap_btree_load(rr, cur);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Install the new fork in the inode.  After this point the old mapping
+	 * data are no longer accessible and the new tree is live.  We delete
+	 * the cursor immediately after committing the staged root because the
+	 * staged fork might be in extents format.
+	 */
+	libxfs_rtrmapbt_commit_staged_btree(cur, sc->tp);
+	libxfs_btree_del_cursor(cur, 0);
+
+	/* Reset the inode counters now that we've changed the fork. */
+	error = xrep_rtrmap_reset_counters(rr);
+	if (error)
+		goto err_newbt;
+
+	/* Dispose of any unused blocks and the accounting infomation. */
+	error = bulkload_commit(&rr->new_fork_info);
+	if (error)
+		return error;
+
+	return -libxfs_trans_roll_inode(&sc->tp, sc->ip);
+err_cur:
+	if (cur)
+		libxfs_btree_del_cursor(cur, error);
+err_newbt:
+	bulkload_cancel(&rr->new_fork_info);
+	return error;
+}
+
+/* Store the realtime reverse-mappings in the rtrmapbt. */
+int
+populate_rtgroup_rmapbt(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	xfs_filblks_t		est_fdblocks)
+{
+	struct repair_ctx	sc = {
+		.mp		= rtg->rtg_mount,
+		.ip		= ip,
+	};
+	struct xrep_rtrmap	rr = {
+		.sc		= &sc,
+		.rtg		= rtg,
+		.est_fdblocks	= est_fdblocks,
+	};
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+			&sc.tp);
+	if (error)
+		return error;
+
+	error = xrep_rtrmap_build_new_tree(&rr);
+	if (error)
+		goto out_cancel;
+
+	return -libxfs_trans_commit(sc.tp);
+
+out_cancel:
+	libxfs_trans_cancel(sc.tp);
+	return error;
+}
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index e3701f91470..88d23dbc8ec 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -1425,13 +1425,17 @@ main(int argc, char **argv)
 	rcbagbt_destroy_cur_cache();
 
 	/*
-	 * Done with the block usage maps, toss them...
+	 * Done with the block usage maps, toss them.  Realtime metadata aren't
+	 * rebuilt until phase 6, so we have to keep them around.
 	 */
-	rmaps_free(mp);
+	if (mp->m_sb.sb_rblocks == 0)
+		rmaps_free(mp);
 	free_bmaps(mp);
 
 	if (!bad_ino_btree)  {
 		phase6(mp);
+		if (mp->m_sb.sb_rblocks != 0)
+			rmaps_free(mp);
 		phase_end(mp, 6);
 
 		phase7(mp, phase2_threads);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 42/47] xfs_repair: check for global free space concerns with default btree slack levels
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (40 preceding siblings ...)
  2023-12-27 13:21   ` [PATCH 41/47] xfs_repair: rebuild the realtime rmap btree Darrick J. Wong
@ 2023-12-27 13:21   ` Darrick J. Wong
  2023-12-27 13:21   ` [PATCH 43/47] xfs_repair: rebuild the bmap btree for realtime files Darrick J. Wong
                     ` (4 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:21 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

It's possible that before repair was started, the filesystem might have
been nearly full, and its metadata btree blocks could all have been
nearly full.  If we then rebuild the btrees with blocks that are only
75% full, that expansion might be enough to run out of free space.  The
solution to this is to pack the new blocks completely full if we fear
running out of space.

Previously, we only had to check and decide that on a per-AG basis.
However, now that XFS can have filesystems with metadata btrees rooted
in inodes, we have a global free space concern because there might be
enough space in each AG to regenerate the AG btrees at 75%, but that
might not leave enough space to regenerate the inode btrees, even if we
fill those blocks to 100%.

Hence we need to precompute the worst case space usage for all btrees in
the filesystem and compare /that/ against the global free space to
decide if we're going to pack the btrees maximally to conserve space.
That decision can override the per-AG determination.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/globals.c |    6 +++
 repair/globals.h |    2 +
 repair/phase5.c  |  116 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 repair/phase6.c  |   16 +++++--
 4 files changed, 131 insertions(+), 9 deletions(-)


diff --git a/repair/globals.c b/repair/globals.c
index b121d6e2d6d..92ebe5fab8a 100644
--- a/repair/globals.c
+++ b/repair/globals.c
@@ -133,3 +133,9 @@ int		thread_count;
 
 /* If nonzero, simulate failure after this phase. */
 int		fail_after_phase;
+
+/*
+ * Do we think we're going to be so low on disk space that we need to pack
+ * all rebuilt btree blocks completely full to avoid running out of space?
+ */
+bool		need_packed_btrees;
diff --git a/repair/globals.h b/repair/globals.h
index f5dcc11f410..2e11f35a0e4 100644
--- a/repair/globals.h
+++ b/repair/globals.h
@@ -180,4 +180,6 @@ extern int		fail_after_phase;
 
 extern struct libxfs_init x;
 
+extern bool		need_packed_btrees;
+
 #endif /* _XFS_REPAIR_GLOBAL_H */
diff --git a/repair/phase5.c b/repair/phase5.c
index 74594d53a87..5e1dff0aadd 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -479,11 +479,14 @@ _("unable to rebuild AG %u.  Not enough free space in on-disk AG.\n"),
 
 	/*
 	 * Estimate the number of free blocks in this AG after rebuilding
-	 * all btrees.
+	 * all btrees, unless we already decided that we need to pack all
+	 * btree blocks maximally.
 	 */
-	total_btblocks = estimate_agbtree_blocks(pag, num_extents);
-	if (num_freeblocks > total_btblocks)
-		est_agfreeblocks = num_freeblocks - total_btblocks;
+	if (!need_packed_btrees) {
+		total_btblocks = estimate_agbtree_blocks(pag, num_extents);
+		if (num_freeblocks > total_btblocks)
+			est_agfreeblocks = num_freeblocks - total_btblocks;
+	}
 
 	init_ino_cursors(&sc, pag, est_agfreeblocks, &sb_icount_ag[agno],
 			&sb_ifree_ag[agno], &btr_ino, &btr_fino);
@@ -631,6 +634,109 @@ check_rtmetadata(
 	check_rtsummary(mp);
 }
 
+/*
+ * Estimate the amount of free space used by the perag metadata without
+ * building the incore tree.  This is only necessary if realtime btrees are
+ * enabled.
+ */
+static xfs_extlen_t
+estimate_agbtree_blocks_early(
+	struct xfs_perag	*pag,
+	unsigned int		*num_freeblocks)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	xfs_agblock_t		agbno;
+	xfs_agblock_t		ag_end;
+	xfs_extlen_t		extent_len;
+	xfs_extlen_t		blen;
+	unsigned int		num_extents = 0;
+	int			bstate;
+	bool			in_extent = false;
+
+	/* Find the number of free space extents. */
+	ag_end = libxfs_ag_block_count(mp, pag->pag_agno);
+	for (agbno = 0; agbno < ag_end; agbno += blen) {
+		bstate = get_bmap_ext(pag->pag_agno, agbno, ag_end, &blen);
+		if (bstate < XR_E_INUSE)  {
+			if (!in_extent) {
+				/*
+				 * found the start of a free extent
+				 */
+				in_extent = true;
+				num_extents++;
+				extent_len = blen;
+			} else {
+				extent_len += blen;
+			}
+		} else {
+			if (in_extent)  {
+				/*
+				 * free extent ends here
+				 */
+				in_extent = false;
+				*num_freeblocks += extent_len;
+			}
+		}
+	}
+	if (in_extent)
+		*num_freeblocks += extent_len;
+
+	return estimate_agbtree_blocks(pag, num_extents);
+}
+
+/*
+ * Decide if we need to pack every new btree block completely full to conserve
+ * disk space.  Normally we rebuild btree blocks to be 75% full, but we don't
+ * want to start rebuilding AG btrees that way only to discover that there
+ * isn't enough space left in the data volume to rebuild inode-based btrees.
+ */
+static bool
+are_packed_btrees_needed(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
+	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
+	unsigned long long	metadata_blocks = 0;
+	unsigned long long	fdblocks = 0;
+
+	/*
+	 * If we don't have inode-based metadata, we can let the AG btrees
+	 * pack as needed; there are no global space concerns here.
+	 */
+	if (!xfs_has_rtrmapbt(mp))
+		return false;
+
+	for_each_perag(mp, agno, pag) {
+		unsigned int	ag_fdblocks = 0;
+
+		metadata_blocks += estimate_agbtree_blocks_early(pag,
+								 &ag_fdblocks);
+		fdblocks += ag_fdblocks;
+	}
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+	}
+
+	/*
+	 * If we think we'll have more metadata blocks than free space, then
+	 * pack the btree blocks.
+	 */
+	if (metadata_blocks > fdblocks)
+		return true;
+
+	/*
+	 * If the amount of free space after building btrees is less than 9%
+	 * of the data volume, pack the btree blocks.
+	 */
+	fdblocks -= metadata_blocks;
+	if (fdblocks < ((mp->m_sb.sb_dblocks * 3) >> 5))
+		return true;
+	return false;
+}
+
 void
 phase5(xfs_mount_t *mp)
 {
@@ -682,6 +788,8 @@ phase5(xfs_mount_t *mp)
 	if (error)
 		do_error(_("cannot alloc lost block bitmap\n"));
 
+	need_packed_btrees = are_packed_btrees_needed(mp);
+
 	for_each_perag(mp, agno, pag)
 		phase5_func(mp, pag, lost_blocks);
 
diff --git a/repair/phase6.c b/repair/phase6.c
index 4c387557c31..ab5c22ffbb0 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -3979,12 +3979,18 @@ reset_rt_metadata_inodes(
 	xfs_filblks_t		est_fdblocks = 0;
 	xfs_rgnumber_t		rgno;
 
-	/* Estimate how much free space will be left after building btrees */
-	for_each_rtgroup(mp, rgno, rtg) {
-		metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+	/*
+	 * Estimate how much free space will be left after building btrees
+	 * unless we already decided that we needed to pack all new blocks
+	 * maximally.
+	 */
+	if (!need_packed_btrees) {
+		for_each_rtgroup(mp, rgno, rtg) {
+			metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+		}
+		if (mp->m_sb.sb_fdblocks > metadata_blocks)
+			est_fdblocks = mp->m_sb.sb_fdblocks - metadata_blocks;
 	}
-	if (mp->m_sb.sb_fdblocks > metadata_blocks)
-		est_fdblocks = mp->m_sb.sb_fdblocks - metadata_blocks;
 
 	for_each_rtgroup(mp, rgno, rtg) {
 		ensure_rtgroup_rmapbt(rtg, est_fdblocks);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 43/47] xfs_repair: rebuild the bmap btree for realtime files
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (41 preceding siblings ...)
  2023-12-27 13:21   ` [PATCH 42/47] xfs_repair: check for global free space concerns with default btree slack levels Darrick J. Wong
@ 2023-12-27 13:21   ` Darrick J. Wong
  2023-12-27 13:21   ` [PATCH 44/47] xfs_repair: reserve per-AG space while rebuilding rt metadata Darrick J. Wong
                     ` (3 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:21 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the realtime rmap btree information to rebuild an inode's data fork
when appropriate.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/bmap_repair.c |  131 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 127 insertions(+), 4 deletions(-)


diff --git a/repair/bmap_repair.c b/repair/bmap_repair.c
index dfd1405cca2..5d4da861322 100644
--- a/repair/bmap_repair.c
+++ b/repair/bmap_repair.c
@@ -212,6 +212,122 @@ xrep_bmap_scan_ag(
 	return error;
 }
 
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_rtfork_rmap(
+	struct repair_ctx		*sc,
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec)
+{
+	/* xattr extents are never stored on realtime devices */
+	if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+		return EFSCORRUPTED;
+
+	/* bmbt blocks are never stored on realtime devices */
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+		return EFSCORRUPTED;
+
+	/* Data extents for non-rt files are never stored on the rt device. */
+	if (!XFS_IS_REALTIME_INODE(sc->ip))
+		return EFSCORRUPTED;
+
+	/* Check the file offsets and physical extents. */
+	if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+		return EFSCORRUPTED;
+
+	/* Check that this fits in the rt volume. */
+	if (!xfs_verify_rgbext(cur->bc_ino.rtg, rec->rm_startblock,
+				rec->rm_blockcount))
+		return EFSCORRUPTED;
+
+	return 0;
+}
+
+/* Record realtime extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rtrmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_bmap		*rb = priv;
+	int				error = 0;
+
+	/* Skip extents which are not owned by this inode and fork. */
+	if (rec->rm_owner != rb->sc->ip->i_ino)
+		return 0;
+
+	error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec);
+	if (error)
+		return error;
+
+	/*
+	 * Record all blocks allocated to this file even if the extent isn't
+	 * for the fork we're rebuilding so that we can reset di_nblocks later.
+	 */
+	rb->nblocks += rec->rm_blockcount;
+
+	/* If this rmap isn't for the fork we want, we're done. */
+	if (rb->whichfork == XFS_DATA_FORK &&
+	    (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+	if (rb->whichfork == XFS_ATTR_FORK &&
+	    !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+
+	return xrep_bmap_from_rmap(rb, rec->rm_offset, rec->rm_startblock,
+			rec->rm_blockcount,
+			rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/*
+ * Scan the realtime reverse mappings to build the new extent map.  The rt rmap
+ * inodes must be loaded from disk explicitly here, since we have not yet
+ * validated the metadata directory tree but do not wish to throw away user
+ * data unnecessarily.
+ */
+STATIC int
+xrep_bmap_scan_rt(
+	struct xrep_bmap	*rb,
+	struct xfs_rtgroup	*rtg)
+{
+	struct repair_ctx	*sc = rb->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_btree_cur	*cur;
+	struct xfs_inode	*ip;
+	struct xfs_imeta_path	*path;
+	xfs_ino_t		ino;
+	int			error;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_imeta_lookup(sc->tp, path, &ino);
+	if (error)
+		goto out_path;
+
+	if (ino == NULLFSINO) {
+		error = EFSCORRUPTED;
+		goto out_path;
+	}
+
+	error = -libxfs_imeta_iget(sc->tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+	if (error)
+		goto out_path;
+
+	cur = libxfs_rtrmapbt_init_cursor(mp, sc->tp, rtg, ip);
+	error = -libxfs_rmap_query_all(cur, xrep_bmap_walk_rtrmap, rb);
+	if (error)
+		goto out_cur;
+out_cur:
+	libxfs_btree_del_cursor(cur, error);
+	libxfs_imeta_irele(ip);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
 /*
  * Collect block mappings for this fork of this inode and decide if we have
  * enough space to rebuild.  Caller is responsible for cleaning up the list if
@@ -222,9 +338,20 @@ xrep_bmap_find_mappings(
 	struct xrep_bmap	*rb)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	int			error;
 
+	/* Iterate the rtrmaps for extents. */
+	for_each_rtgroup(rb->sc->mp, rgno, rtg) {
+		error = xrep_bmap_scan_rt(rb, rtg);
+		if (error) {
+			libxfs_rtgroup_put(rtg);
+			return error;
+		}
+	}
+
 	/* Iterate the rmaps for extents. */
 	for_each_perag(rb->sc->mp, agno, pag) {
 		error = xrep_bmap_scan_ag(rb, pag);
@@ -572,10 +699,6 @@ xrep_bmap_check_inputs(
 		return EINVAL;
 	}
 
-	/* Don't know how to rebuild realtime data forks. */
-	if (XFS_IS_REALTIME_INODE(sc->ip))
-		return EOPNOTSUPP;
-
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 44/47] xfs_repair: reserve per-AG space while rebuilding rt metadata
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (42 preceding siblings ...)
  2023-12-27 13:21   ` [PATCH 43/47] xfs_repair: rebuild the bmap btree for realtime files Darrick J. Wong
@ 2023-12-27 13:21   ` Darrick J. Wong
  2023-12-27 13:22   ` [PATCH 45/47] xfs_repair: allow sysadmins to add realtime reverse mapping indexes Darrick J. Wong
                     ` (2 subsequent siblings)
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:21 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Realtime metadata btrees can consume quite a bit of space on a full
filesystem.  Since the metadata are just regular files, we need to
make the per-AG reservations to avoid overfilling any of the AGs while
rebuilding metadata.  This avoids the situation where a filesystem comes
straight from repair and immediately trips over not having enough space
in an AG.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h |    1 +
 repair/phase6.c  |   47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)


diff --git a/include/libxfs.h b/include/libxfs.h
index 3ab93158cf7..72f38938f69 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -94,6 +94,7 @@ struct iomap;
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_ag_resv.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/repair/phase6.c b/repair/phase6.c
index ab5c22ffbb0..fd862362f1d 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -3997,10 +3997,43 @@ reset_rt_metadata_inodes(
 	}
 }
 
+static int
+reserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			error = 0;
+	int			err2;
+
+	mp->m_finobt_nores = false;
+
+	for_each_perag(mp, agno, pag) {
+		err2 = -libxfs_ag_resv_init(pag, NULL);
+		if (err2 && !error)
+			error = err2;
+	}
+
+	return error;
+}
+
+static void
+unreserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
+	for_each_perag(mp, agno, pag)
+		libxfs_ag_resv_free(pag);
+}
+
 void
 phase6(xfs_mount_t *mp)
 {
 	ino_tree_node_t		*irec;
+	bool			reserve_perag;
+	int			error;
 	int			i;
 
 	parent_ptr_init(mp);
@@ -4040,6 +4073,17 @@ phase6(xfs_mount_t *mp)
 		do_warn(_("would reinitialize metadata root directory\n"));
 	}
 
+	reserve_perag = xfs_has_realtime(mp) && !no_modify;
+	if (reserve_perag) {
+		error = reserve_ag_blocks(mp);
+		if (error) {
+			if (error != ENOSPC)
+				do_warn(
+	_("could not reserve per-AG space to rebuild realtime metadata"));
+			reserve_perag = false;
+		}
+	}
+
 	if (need_rbmino)  {
 		if (!no_modify)  {
 			if (need_rbmino > 0)
@@ -4078,6 +4122,9 @@ _("        - resetting contents of realtime bitmap and summary inodes\n"));
 		}
 	}
 
+	if (reserve_perag)
+		unreserve_ag_blocks(mp);
+
 	reattach_metadir_quota_inodes(mp);
 
 	mark_standalone_inodes(mp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 45/47] xfs_repair: allow sysadmins to add realtime reverse mapping indexes
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (43 preceding siblings ...)
  2023-12-27 13:21   ` [PATCH 44/47] xfs_repair: reserve per-AG space while rebuilding rt metadata Darrick J. Wong
@ 2023-12-27 13:22   ` Darrick J. Wong
  2023-12-27 13:22   ` [PATCH 46/47] xfs_logprint: report realtime RUIs Darrick J. Wong
  2023-12-27 13:22   ` [PATCH 47/47] mkfs: create the realtime rmap inode Darrick J. Wong
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:22 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the sysadmin to use xfs_repair to upgrade an existing filesystem
to support the reverse mapping btree index for realtime volumes.  This
is needed for online fsck.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    4 ++
 repair/phase2.c          |  100 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 99 insertions(+), 5 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index b5bb6c39928..cfa8c474160 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -63,6 +63,7 @@
 #define xfs_btree_bload			libxfs_btree_bload
 #define xfs_btree_bload_compute_geometry libxfs_btree_bload_compute_geometry
 #define xfs_btree_calc_size		libxfs_btree_calc_size
+#define xfs_btree_compute_maxlevels	libxfs_btree_compute_maxlevels
 #define xfs_btree_decrement		libxfs_btree_decrement
 #define xfs_btree_del_cursor		libxfs_btree_del_cursor
 #define xfs_btree_delete		libxfs_btree_delete
@@ -190,6 +191,8 @@
 #define xfs_imeta_link_space_res	libxfs_imeta_link_space_res
 #define xfs_imeta_lookup		libxfs_imeta_lookup
 #define xfs_imeta_mount			libxfs_imeta_mount
+#define xfs_imeta_resv_free_inode	libxfs_imeta_resv_free_inode
+#define xfs_imeta_resv_init_inode	libxfs_imeta_resv_init_inode
 #define xfs_imeta_set_iflag		libxfs_imeta_set_iflag
 #define xfs_imeta_start_create		libxfs_imeta_start_create
 #define xfs_imeta_start_link		libxfs_imeta_start_link
@@ -289,6 +292,7 @@
 #define xfs_rtgroup_put			libxfs_rtgroup_put
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
+#define xfs_rtrmapbt_calc_reserves	libxfs_rtrmapbt_calc_reserves
 #define xfs_rtrmapbt_calc_size		libxfs_rtrmapbt_calc_size
 #define xfs_rtrmapbt_commit_staged_btree	libxfs_rtrmapbt_commit_staged_btree
 #define xfs_rtrmapbt_create		libxfs_rtrmapbt_create
diff --git a/repair/phase2.c b/repair/phase2.c
index 4cb0d7946bf..22458aee4cd 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -250,9 +250,8 @@ set_rmapbt(
 		exit(0);
 	}
 
-	if (xfs_has_realtime(mp)) {
-		printf(
-	_("Reverse mapping btree feature not supported with realtime.\n"));
+	if (xfs_has_realtime(mp) && !xfs_has_rtgroups(mp)) {
+		printf(_("Reverse mapping btree requires realtime groups.\n"));
 		exit(0);
 	}
 
@@ -265,6 +264,11 @@ set_rmapbt(
 	printf(_("Adding reverse mapping btrees to filesystem.\n"));
 	new_sb->sb_features_ro_compat |= XFS_SB_FEAT_RO_COMPAT_RMAPBT;
 	new_sb->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
+
+	/* Quota counts will be wrong once we add the rmap inodes. */
+	if (xfs_has_realtime(mp))
+		quotacheck_skip();
+
 	return true;
 }
 
@@ -450,6 +454,63 @@ check_free_space(
 	return avail > GIGABYTES(10, mp->m_sb.sb_blocklog);
 }
 
+/*
+ * Reserve space to handle rt rmap btree expansion.
+ *
+ * If the rmap inode for this group already exists, we assume that we're adding
+ * some other feature.  Note that we have not validated the metadata directory
+ * tree, so we must perform the lookup by hand and abort the upgrade if there
+ * are errors.  Otherwise, the amount of space needed to handle a new maximally
+ * sized rmap btree is added to @new_resv.
+ */
+static int
+reserve_rtrmap_inode(
+	struct xfs_rtgroup	*rtg,
+	xfs_rfsblock_t		*new_resv)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_trans	*tp;
+	struct xfs_imeta_path	*path;
+	xfs_ino_t		ino;
+	xfs_filblks_t		ask;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	ask = libxfs_rtrmapbt_calc_reserves(mp);
+
+	error = -libxfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_trans;
+
+	if (ino == NULLFSINO) {
+		*new_resv += ask;
+		goto out_trans;
+	}
+
+	error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE,
+			&rtg->rtg_rmapip);
+	if (error)
+		goto out_trans;
+
+	error = -libxfs_imeta_resv_init_inode(rtg->rtg_rmapip, ask);
+
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
 static void
 check_fs_free_space(
 	struct xfs_mount		*mp,
@@ -457,7 +518,10 @@ check_fs_free_space(
 	struct xfs_sb			*new_sb)
 {
 	struct xfs_perag		*pag;
+	struct xfs_rtgroup		*rtg;
+	xfs_rfsblock_t			new_resv = 0;
 	xfs_agnumber_t			agno;
+	xfs_rgnumber_t			rgno;
 	int				error;
 
 	/* Make sure we have enough space for per-AG reservations. */
@@ -533,6 +597,21 @@ check_fs_free_space(
 		libxfs_trans_cancel(tp);
 	}
 
+	/* Realtime metadata btree inodes */
+	for_each_rtgroup(mp, rgno, rtg) {
+		error = reserve_rtrmap_inode(rtg, &new_resv);
+		if (error == ENOSPC) {
+			printf(
+_("Not enough free space would remain for rtgroup %u rmap inode.\n"),
+					rtg->rtg_rgno);
+			exit(0);
+		}
+		if (error)
+			do_error(
+_("Error %d while checking rtgroup %u rmap inode space reservation.\n"),
+					error, rtg->rtg_rgno);
+	}
+
 	/*
 	 * If we're adding parent pointers, we need at least 25% free since
 	 * scanning the entire filesystem to guesstimate the overhead is
@@ -548,13 +627,24 @@ check_fs_free_space(
 
 	/*
 	 * Would the post-upgrade filesystem have enough free space on the data
-	 * device after making per-AG reservations?
+	 * device after making per-AG reservations and reserving rt metadata
+	 * inode blocks?
 	 */
-	if (!check_free_space(mp, mp->m_sb.sb_fdblocks, mp->m_sb.sb_dblocks)) {
+	if (new_resv > mp->m_sb.sb_fdblocks ||
+	    !check_free_space(mp, mp->m_sb.sb_fdblocks, mp->m_sb.sb_dblocks)) {
 		printf(_("Filesystem will be low on space after upgrade.\n"));
 		exit(1);
 	}
 
+	/* Unreserve the realtime metadata reservations. */
+	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_rmapip) {
+			libxfs_imeta_resv_free_inode(rtg->rtg_rmapip);
+			libxfs_imeta_irele(rtg->rtg_rmapip);
+			rtg->rtg_rmapip = NULL;
+		}
+	}
+
 	/*
 	 * Release the per-AG reservations and mark the per-AG structure as
 	 * uninitialized so that we don't trip over stale cached counters


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 46/47] xfs_logprint: report realtime RUIs
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (44 preceding siblings ...)
  2023-12-27 13:22   ` [PATCH 45/47] xfs_repair: allow sysadmins to add realtime reverse mapping indexes Darrick J. Wong
@ 2023-12-27 13:22   ` Darrick J. Wong
  2023-12-27 13:22   ` [PATCH 47/47] mkfs: create the realtime rmap inode Darrick J. Wong
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:22 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Decode the RUI format just enough to report if an RUI targets the
realtime device or not.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 logprint/log_misc.c      |    2 ++
 logprint/log_print_all.c |    8 ++++++++
 logprint/log_redo.c      |   24 +++++++++++++++++++-----
 3 files changed, 29 insertions(+), 5 deletions(-)


diff --git a/logprint/log_misc.c b/logprint/log_misc.c
index 9d63f376390..3661b595c53 100644
--- a/logprint/log_misc.c
+++ b/logprint/log_misc.c
@@ -1021,12 +1021,14 @@ xlog_print_record(
 					be32_to_cpu(op_head->oh_len));
 			break;
 		    }
+		    case XFS_LI_RUI_RT:
 		    case XFS_LI_RUI: {
 			skip = xlog_print_trans_rui(&ptr,
 					be32_to_cpu(op_head->oh_len),
 					continued);
 			break;
 		    }
+		    case XFS_LI_RUD_RT:
 		    case XFS_LI_RUD: {
 			skip = xlog_print_trans_rud(&ptr,
 					be32_to_cpu(op_head->oh_len));
diff --git a/logprint/log_print_all.c b/logprint/log_print_all.c
index d030efa9efb..e67e2c57f26 100644
--- a/logprint/log_print_all.c
+++ b/logprint/log_print_all.c
@@ -424,9 +424,11 @@ xlog_recover_print_logitem(
 	case XFS_LI_ATTRI:
 		xlog_recover_print_attri(item);
 		break;
+	case XFS_LI_RUD_RT:
 	case XFS_LI_RUD:
 		xlog_recover_print_rud(item);
 		break;
+	case XFS_LI_RUI_RT:
 	case XFS_LI_RUI:
 		xlog_recover_print_rui(item);
 		break;
@@ -500,6 +502,12 @@ xlog_recover_print_item(
 	case XFS_LI_RUI:
 		printf("RUI");
 		break;
+	case XFS_LI_RUD_RT:
+		printf("RUD_RT");
+		break;
+	case XFS_LI_RUI_RT:
+		printf("RUI_RT");
+		break;
 	case XFS_LI_CUD:
 		printf("CUD");
 		break;
diff --git a/logprint/log_redo.c b/logprint/log_redo.c
index 0cc3cd4ba28..ae6f311f19b 100644
--- a/logprint/log_redo.c
+++ b/logprint/log_redo.c
@@ -274,6 +274,7 @@ xlog_print_trans_rui(
 	uint			src_len,
 	int			continued)
 {
+	const char		*item_name = "RUI?";
 	struct xfs_rui_log_format	*src_f, *f = NULL;
 	uint			dst_len;
 	uint			nextents;
@@ -318,8 +319,14 @@ xlog_print_trans_rui(
 		goto error;
 	}
 
-	printf(_("RUI:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
-		f->rui_size, f->rui_nextents, (unsigned long long)f->rui_id);
+	switch (f->rui_type) {
+	case XFS_LI_RUI:	item_name = "RUI"; break;
+	case XFS_LI_RUI_RT:	item_name = "RUI_RT"; break;
+	}
+
+	printf(_("%s:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
+			item_name, f->rui_size, f->rui_nextents,
+			(unsigned long long)f->rui_id);
 
 	if (continued) {
 		printf(_("RUI extent data skipped (CONTINUE set, no space)\n"));
@@ -359,6 +366,7 @@ xlog_print_trans_rud(
 	char				**ptr,
 	uint				len)
 {
+	const char			*item_name = "RUD?";
 	struct xfs_rud_log_format	*f;
 	struct xfs_rud_log_format	lbuf;
 
@@ -371,11 +379,17 @@ xlog_print_trans_rud(
 	 */
 	memmove(&lbuf, *ptr, min(core_size, len));
 	f = &lbuf;
+
+	switch (f->rud_type) {
+	case XFS_LI_RUD:	item_name = "RUD"; break;
+	case XFS_LI_RUD_RT:	item_name = "RUD_RT"; break;
+	}
+
 	*ptr += len;
 	if (len >= core_size) {
-		printf(_("RUD:  #regs: %d	                 id: 0x%llx\n"),
-			f->rud_size,
-			(unsigned long long)f->rud_rui_id);
+		printf(_("%s:  #regs: %d	                 id: 0x%llx\n"),
+				item_name, f->rud_size,
+				(unsigned long long)f->rud_rui_id);
 
 		/* don't print extents as they are not used */
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 47/47] mkfs: create the realtime rmap inode
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                     ` (45 preceding siblings ...)
  2023-12-27 13:22   ` [PATCH 46/47] xfs_logprint: report realtime RUIs Darrick J. Wong
@ 2023-12-27 13:22   ` Darrick J. Wong
  46 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:22 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a realtime rmapbt inode if we format the fs with realtime
and rmap.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c   |    7 ----
 mkfs/proto.c    |   56 ++++++++++++++++++++++++++++++++++
 mkfs/xfs_mkfs.c |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 141 insertions(+), 12 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index ba0b9a87f2d..18bd2116c50 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -307,13 +307,6 @@ rtmount_init(
 		return -1;
 	}
 
-	if (xfs_has_rmapbt(mp)) {
-		fprintf(stderr,
-	_("%s: Reverse mapping btree not compatible with realtime device. Please try a newer xfsprogs.\n"),
-				progname);
-		return -1;
-	}
-
 	if (mp->m_rtdev_targp->bt_bdev == 0 && !xfs_is_debugger(mp)) {
 		fprintf(stderr, _("%s: filesystem has a realtime subvolume\n"),
 			progname);
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 5239f9ec413..d575d9c511e 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -852,6 +852,54 @@ rtsummary_create(
 	mp->m_rsumip = rsumip;
 }
 
+/* Create the realtime rmap btree inode. */
+static void
+rtrmapbt_create(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= 0,
+		.rm_blockcount	= rtg->rtg_mount->m_sb.sb_rextsize,
+		.rm_owner	= XFS_RMAP_OWN_FS,
+		.rm_offset	= 0,
+		.rm_flags	= 0,
+	};
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = -libxfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		fail( _("rtrmap inode path creation failed"), error);
+
+	error = -libxfs_imeta_ensure_dirpath(mp, path);
+	if (error)
+		fail(_("rtgroup directory allocation failed"), error);
+
+	error = -libxfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		res_failed(error);
+
+	error = -libxfs_rtrmapbt_create(&upd, &rtg->rtg_rmapip);
+	if (error)
+		fail(_("rtrmap inode creation failed"), error);
+
+	/* Adding an rmap for the rtgroup super should fit in the data fork */
+	cur = libxfs_rtrmapbt_init_cursor(mp, upd.tp, rtg, rtg->rtg_rmapip);
+	error = -libxfs_rmap_map_raw(cur, &rmap);
+	libxfs_btree_del_cursor(cur, error);
+	if (error)
+		fail(_("rtrmapbt initialization failed"), error);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		fail(_("rtrmapbt commit failed"), error);
+
+	libxfs_imeta_free_path(path);
+}
+
 /* Initialize block headers of rt free space files. */
 static int
 init_rtblock_headers(
@@ -1084,9 +1132,17 @@ static void
 rtinit(
 	struct xfs_mount	*mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
 	rtbitmap_create(mp);
 	rtsummary_create(mp);
 
+	for_each_rtgroup(mp, rgno, rtg) {
+		if (xfs_has_rtrmapbt(mp))
+			rtrmapbt_create(rtg);
+	}
+
 	rtbitmap_init(mp);
 	rtsummary_init(mp);
 	if (xfs_has_rtgroups(mp))
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 66532b8c9b6..162546cd1e8 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -2474,12 +2474,18 @@ _("reflink not supported with realtime devices\n"));
 		}
 		cli->sb_feat.reflink = false;
 
-		if (cli->sb_feat.rmapbt && cli_opt_set(&mopts, M_RMAPBT)) {
-			fprintf(stderr,
-_("rmapbt not supported with realtime devices\n"));
-			usage();
+		if (!cli->sb_feat.rtgroups && cli->sb_feat.rmapbt) {
+			if (cli_opt_set(&mopts, M_RMAPBT) &&
+			    cli_opt_set(&ropts, R_RTGROUPS)) {
+				fprintf(stderr,
+_("rmapbt not supported on realtime devices without rtgroups feature\n"));
+				usage();
+			} else if (cli_opt_set(&mopts, M_RMAPBT)) {
+				cli->sb_feat.rtgroups = true;
+			} else {
+				cli->sb_feat.rmapbt = false;
+			}
 		}
-		cli->sb_feat.rmapbt = false;
 	}
 
 	if ((cli->fsx.fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
@@ -4553,6 +4559,77 @@ cfgfile_parse(
 		cli->cfgfile);
 }
 
+static inline void
+prealloc_fail(
+	struct xfs_mount	*mp,
+	int			error,
+	xfs_filblks_t		ask,
+	const char		*tag)
+{
+	if (error == ENOSPC)
+		fprintf(stderr,
+	_("%s: cannot handle expansion of %s; need %llu free blocks, have %llu\n"),
+				progname, tag, (unsigned long long)ask,
+				(unsigned long long)mp->m_sb.sb_fdblocks);
+	else
+		fprintf(stderr,
+	_("%s: error %d while checking free space for %s\n"),
+				progname, error, tag);
+	exit(1);
+}
+
+/*
+ * Make sure there's enough space on the data device to handle realtime
+ * metadata btree expansions.
+ */
+static void
+check_rt_meta_prealloc(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
+	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
+	xfs_filblks_t		ask;
+	int			error;
+
+	/*
+	 * First create all the per-AG reservations, since they take from the
+	 * free block count.  Each AG should start with enough free space for
+	 * the per-AG reservation.
+	 */
+	mp->m_finobt_nores = false;
+
+	for_each_perag(mp, agno, pag) {
+		error = -libxfs_ag_resv_init(pag, NULL);
+		if (error && error != ENOSPC) {
+			fprintf(stderr,
+	_("%s: error %d while checking AG free space for realtime metadata\n"),
+					progname, error);
+			exit(1);
+		}
+	}
+
+	/* Realtime metadata btree inode */
+	for_each_rtgroup(mp, rgno, rtg) {
+		ask = libxfs_rtrmapbt_calc_reserves(mp);
+		error = -libxfs_imeta_resv_init_inode(rtg->rtg_rmapip, ask);
+		if (error)
+			prealloc_fail(mp, error, ask, _("realtime rmap btree"));
+	}
+
+	/* Unreserve the realtime metadata reservations. */
+	for_each_rtgroup(mp, rgno, rtg) {
+		libxfs_imeta_resv_free_inode(rtg->rtg_rmapip);
+	}
+
+	/* Unreserve the per-AG reservations. */
+	for_each_perag(mp, agno, pag)
+		libxfs_ag_resv_free(pag);
+
+	mp->m_finobt_nores = false;
+}
+
 int
 main(
 	int			argc,
@@ -4922,6 +4999,9 @@ main(
 	 */
 	check_root_ino(mp);
 
+	/* Make sure we can handle space preallocations of rt metadata btrees */
+	check_rt_meta_prealloc(mp);
+
 	/*
 	 * Re-write multiple secondary superblocks with rootinode field set
 	 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/4] libxfs: resync libxfs_alloc_file_space interface with the kernel
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
@ 2023-12-27 13:23   ` Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 2/4] mkfs: use libxfs_alloc_file_space for rtinit Darrick J. Wong
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the userspace xfs_alloc_file_space behave (more or less) like the
kernel version, at least as far as the interface goes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h |    4 +-
 libxfs/util.c    |  143 ++++++++++++++++++++++++++++++++++++------------------
 mkfs/proto.c     |    2 -
 3 files changed, 99 insertions(+), 50 deletions(-)


diff --git a/include/libxfs.h b/include/libxfs.h
index 72f38938f69..1fc8bc0e97b 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -174,8 +174,8 @@ extern int	libxfs_log_header(char *, uuid_t *, int, int, int, xfs_lsn_t,
 
 /* Shared utility routines */
 
-extern int	libxfs_alloc_file_space (struct xfs_inode *, xfs_off_t,
-				xfs_off_t, int, int);
+extern int	libxfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+					xfs_off_t len, uint32_t bmapi_flags);
 
 /* XXX: this is messy and needs fixing */
 #ifndef __LIBXFS_INTERNAL_XFS_H__
diff --git a/libxfs/util.c b/libxfs/util.c
index 76e49b8637c..aec7798a814 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -179,78 +179,127 @@ libxfs_mod_incore_sb(
  */
 int
 libxfs_alloc_file_space(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	xfs_off_t	len,
-	int		alloc_type,
-	int		attr_flags)
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	uint32_t		bmapi_flags)
 {
-	xfs_mount_t	*mp;
-	xfs_off_t	count;
-	xfs_filblks_t	datablocks;
-	xfs_filblks_t	allocated_fsb;
-	xfs_filblks_t	allocatesize_fsb;
-	xfs_bmbt_irec_t *imapp;
-	xfs_bmbt_irec_t imaps[1];
-	int		reccount;
-	uint		resblks;
-	xfs_fileoff_t	startoffset_fsb;
-	xfs_trans_t	*tp;
-	int		xfs_bmapi_flags;
-	int		error;
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_off_t		count;
+	xfs_filblks_t		allocatesize_fsb;
+	xfs_extlen_t		extsz, temp;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_fileoff_t		endoffset_fsb;
+	int			rt;
+	xfs_trans_t		*tp;
+	xfs_bmbt_irec_t		imaps[1], *imapp;
+	int			error;
 
 	if (len <= 0)
 		return -EINVAL;
 
+	rt = XFS_IS_REALTIME_INODE(ip);
+	extsz = xfs_get_extsz_hint(ip);
+
 	count = len;
-	error = 0;
 	imapp = &imaps[0];
-	reccount = 1;
-	xfs_bmapi_flags = alloc_type ? XFS_BMAPI_PREALLOC : 0;
-	mp = ip->i_mount;
-	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
-	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
+	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+	allocatesize_fsb = endoffset_fsb - startoffset_fsb;
 
-	/* allocate file space until done or until there is an error */
+	/*
+	 * Allocate file space until done or until there is an error
+	 */
 	while (allocatesize_fsb && !error) {
-		datablocks = allocatesize_fsb;
+		xfs_fileoff_t	s, e;
+		unsigned int	dblocks, rblocks, resblks;
+		int		nimaps = 1;
 
-		resblks = (uint)XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
-					0, 0, &tp);
 		/*
-		 * Check for running out of space
+		 * Determine space reservations for data/realtime.
 		 */
-		if (error) {
-			ASSERT(error == -ENOSPC);
+		if (unlikely(extsz)) {
+			s = startoffset_fsb;
+			do_div(s, extsz);
+			s *= extsz;
+			e = startoffset_fsb + allocatesize_fsb;
+			div_u64_rem(startoffset_fsb, extsz, &temp);
+			if (temp)
+				e += temp;
+			div_u64_rem(e, extsz, &temp);
+			if (temp)
+				e += extsz - temp;
+		} else {
+			s = 0;
+			e = allocatesize_fsb;
+		}
+
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
+		 * limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s),
+				(XFS_MAX_BMBT_EXTLEN * nimaps));
+		if (unlikely(rt)) {
+			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+			rblocks = resblks;
+		} else {
+			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+			rblocks = 0;
+		}
+
+		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+				dblocks, rblocks, false, &tp);
+		if (error)
 			break;
-		}
-		xfs_trans_ijoin(tp, ip, 0);
 
-		error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
-				xfs_bmapi_flags, 0, imapp, &reccount);
+		error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+				XFS_IEXT_ADD_NOSPLIT_CNT);
+		if (error == -EFBIG)
+			error = xfs_iext_count_upgrade(tp, ip,
+					XFS_IEXT_ADD_NOSPLIT_CNT);
+		if (error)
+			goto error;
 
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+				allocatesize_fsb, bmapi_flags, 0, imapp,
+				&nimaps);
 		if (error)
-			goto error0;
+			goto error;
+
+		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-		/*
-		 * Complete the transaction
-		 */
 		error = xfs_trans_commit(tp);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
 			break;
 
-		allocated_fsb = imapp->br_blockcount;
-		if (reccount == 0)
-			return -ENOSPC;
-
-		startoffset_fsb += allocated_fsb;
-		allocatesize_fsb -= allocated_fsb;
+		/*
+		 * If xfs_bmapi_write finds a delalloc extent at the requested
+		 * range, it tries to convert the entire delalloc extent to a
+		 * real allocation.
+		 * If the allocator cannot find a single free extent large
+		 * enough to cover the start block of the requested range,
+		 * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+		 * In that case we simply need to keep looping with the same
+		 * startoffset_fsb.
+		 */
+		if (nimaps) {
+			startoffset_fsb += imapp->br_blockcount;
+			allocatesize_fsb -= imapp->br_blockcount;
+		}
 	}
+
 	return error;
 
-error0:	/* Cancel bmap, cancel trans */
+error:
 	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
diff --git a/mkfs/proto.c b/mkfs/proto.c
index d575d9c511e..5b632d31215 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -205,7 +205,7 @@ rsvfile(
 	int		error;
 	xfs_trans_t	*tp;
 
-	error = -libxfs_alloc_file_space(ip, 0, llen, 1, 0);
+	error = -libxfs_alloc_file_space(ip, 0, llen, XFS_BMAPI_PREALLOC);
 
 	if (error) {
 		fail(_("error reserving space for a file"), error);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/4] mkfs: use libxfs_alloc_file_space for rtinit
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 1/4] libxfs: resync libxfs_alloc_file_space interface with the kernel Darrick J. Wong
@ 2023-12-27 13:23   ` Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 3/4] xfs_repair: use libxfs_alloc_file_space to reallocate rt metadata Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 4/4] mkfs: use file write helper to populate files Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Since xfs_bmapi_write can now zero newly allocated blocks, use it to
initialize the realtime inodes instead of open coding this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c |   80 +++++++++-------------------------------------------------
 1 file changed, 12 insertions(+), 68 deletions(-)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index 5b632d31215..83888572395 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -949,43 +949,14 @@ static void
 rtbitmap_init(
 	struct xfs_mount	*mp)
 {
-	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
-	struct xfs_trans	*tp;
-	struct xfs_bmbt_irec	*ep;
-	xfs_fileoff_t		bno;
-	uint			blocks;
-	int			i;
-	int			nmap;
 	int			error;
 
-	blocks = mp->m_sb.sb_rbmblocks +
-			XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
+	error = -libxfs_alloc_file_space(mp->m_rbmip, 0,
+			mp->m_sb.sb_rbmblocks << mp->m_sb.sb_blocklog,
+			XFS_BMAPI_ZERO);
 	if (error)
-		res_failed(error);
-
-	libxfs_trans_ijoin(tp, mp->m_rbmip, 0);
-	bno = 0;
-	while (bno < mp->m_sb.sb_rbmblocks) {
-		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, mp->m_rbmip, bno,
-				(xfs_extlen_t)(mp->m_sb.sb_rbmblocks - bno),
-				0, mp->m_sb.sb_rbmblocks, map, &nmap);
-		if (error)
-			fail(_("Allocation of the realtime bitmap failed"),
-				error);
-
-		for (i = 0, ep = map; i < nmap; i++, ep++) {
-			libxfs_device_zero(mp->m_ddev_targp,
-				XFS_FSB_TO_DADDR(mp, ep->br_startblock),
-				XFS_FSB_TO_BB(mp, ep->br_blockcount));
-			bno += ep->br_blockcount;
-		}
-	}
-
-	error = -libxfs_trans_commit(tp);
-	if (error)
-		fail(_("Block allocation of the realtime bitmap inode failed"),
+		fail(
+	_("Block allocation of the realtime bitmap inode failed"),
 				error);
 
 	if (xfs_has_rtgroups(mp)) {
@@ -1001,43 +972,13 @@ static void
 rtsummary_init(
 	struct xfs_mount	*mp)
 {
-	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
-	struct xfs_trans	*tp;
-	struct xfs_bmbt_irec	*ep;
-	xfs_fileoff_t		bno;
-	xfs_extlen_t		nsumblocks;
-	uint			blocks;
-	int			i;
-	int			nmap;
 	int			error;
 
-	nsumblocks = mp->m_rsumsize >> mp->m_sb.sb_blocklog;
-	blocks = nsumblocks + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
+	error = -libxfs_alloc_file_space(mp->m_rsumip, 0, mp->m_rsumsize,
+			XFS_BMAPI_ZERO);
 	if (error)
-		res_failed(error);
-	libxfs_trans_ijoin(tp, mp->m_rsumip, 0);
-
-	bno = 0;
-	while (bno < nsumblocks) {
-		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, mp->m_rsumip, bno,
-				(xfs_extlen_t)(nsumblocks - bno),
-				0, nsumblocks, map, &nmap);
-		if (error)
-			fail(_("Allocation of the realtime summary failed"),
-				error);
-
-		for (i = 0, ep = map; i < nmap; i++, ep++) {
-			libxfs_device_zero(mp->m_ddev_targp,
-				XFS_FSB_TO_DADDR(mp, ep->br_startblock),
-				XFS_FSB_TO_BB(mp, ep->br_blockcount));
-			bno += ep->br_blockcount;
-		}
-	}
-	error = -libxfs_trans_commit(tp);
-	if (error)
-		fail(_("Block allocation of the realtime summary inode failed"),
+		fail(
+	_("Block allocation of the realtime summary inode failed"),
 				error);
 
 	if (xfs_has_rtgroups(mp)) {
@@ -1143,6 +1084,9 @@ rtinit(
 			rtrmapbt_create(rtg);
 	}
 
+	if (mp->m_sb.sb_rbmblocks == 0)
+		return;
+
 	rtbitmap_init(mp);
 	rtsummary_init(mp);
 	if (xfs_has_rtgroups(mp))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/4] xfs_repair: use libxfs_alloc_file_space to reallocate rt metadata
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 1/4] libxfs: resync libxfs_alloc_file_space interface with the kernel Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 2/4] mkfs: use libxfs_alloc_file_space for rtinit Darrick J. Wong
@ 2023-12-27 13:23   ` Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 4/4] mkfs: use file write helper to populate files Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that libxfs_alloc_file_space can allocate and zero blocks, use it to
repair the realtime metadata instead of open-coding all this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   76 +++++++------------------------------------------------
 1 file changed, 10 insertions(+), 66 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index fd862362f1d..c9974623d12 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -736,15 +736,9 @@ mk_rbmino(
 	struct xfs_mount	*mp)
 {
 	struct xfs_imeta_update	upd = { };
-	struct xfs_trans	*tp = NULL;
 	struct xfs_inode	*ip = NULL;
-	struct xfs_bmbt_irec	*ep;
 	int			i;
-	int			nmap;
 	int			error;
-	xfs_fileoff_t		bno;
-	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
-	uint			blocks;
 
 	/* Reset the realtime bitmap inode. */
 	if (!xfs_has_metadir(mp)) {
@@ -779,36 +773,15 @@ mk_rbmino(
 	 * then allocate blocks for file and fill with zeroes (stolen
 	 * from mkfs)
 	 */
-	blocks = mp->m_sb.sb_rbmblocks +
-			XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
-	if (error)
-		res_failed(error);
-
-	libxfs_trans_ijoin(tp, ip, 0);
-	bno = 0;
-	while (bno < mp->m_sb.sb_rbmblocks) {
-		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, ip, bno,
-			  (xfs_extlen_t)(mp->m_sb.sb_rbmblocks - bno),
-			  0, mp->m_sb.sb_rbmblocks, map, &nmap);
+	if (mp->m_sb.sb_rbmblocks) {
+		error = -libxfs_alloc_file_space(ip, 0,
+				mp->m_sb.sb_rbmblocks << mp->m_sb.sb_blocklog,
+				XFS_BMAPI_ZERO);
 		if (error) {
 			do_error(
-			_("couldn't allocate realtime bitmap, error = %d\n"),
+	_("allocation of the realtime bitmap failed, error = %d\n"),
 				error);
 		}
-		for (i = 0, ep = map; i < nmap; i++, ep++) {
-			libxfs_device_zero(mp->m_ddev_targp,
-				XFS_FSB_TO_DADDR(mp, ep->br_startblock),
-				XFS_FSB_TO_BB(mp, ep->br_blockcount));
-			bno += ep->br_blockcount;
-		}
-	}
-	error = -libxfs_trans_commit(tp);
-	if (error) {
-		do_error(
-		_("allocation of the realtime bitmap failed, error = %d\n"),
-			error);
 	}
 	libxfs_irele(ip);
 }
@@ -996,16 +969,9 @@ mk_rsumino(
 	struct xfs_mount	*mp)
 {
 	struct xfs_imeta_update	upd = { };
-	struct xfs_trans	*tp = NULL;
 	struct xfs_inode	*ip = NULL;
-	struct xfs_bmbt_irec	*ep;
 	int			i;
-	int			nmap;
 	int			error;
-	int			nsumblocks;
-	xfs_fileoff_t		bno;
-	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
-	uint			blocks;
 
 	/* Reset the realtime summary inode. */
 	if (!xfs_has_metadir(mp)) {
@@ -1040,36 +1006,14 @@ mk_rsumino(
 	 * then allocate blocks for file and fill with zeroes (stolen
 	 * from mkfs)
 	 */
-	nsumblocks = mp->m_rsumsize >> mp->m_sb.sb_blocklog;
-	blocks = nsumblocks + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
-	if (error)
-		res_failed(error);
-
-	libxfs_trans_ijoin(tp, ip, 0);
-	bno = 0;
-	while (bno < nsumblocks) {
-		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, ip, bno,
-			  (xfs_extlen_t)(nsumblocks - bno),
-			  0, nsumblocks, map, &nmap);
+	if (mp->m_rsumsize) {
+		error = -libxfs_alloc_file_space(ip, 0, mp->m_rsumsize,
+				XFS_BMAPI_ZERO);
 		if (error) {
 			do_error(
-		_("couldn't allocate realtime summary inode, error = %d\n"),
-				error);
-		}
-		for (i = 0, ep = map; i < nmap; i++, ep++) {
-			libxfs_device_zero(mp->m_ddev_targp,
-				      XFS_FSB_TO_DADDR(mp, ep->br_startblock),
-				      XFS_FSB_TO_BB(mp, ep->br_blockcount));
-			bno += ep->br_blockcount;
-		}
-	}
-	error = -libxfs_trans_commit(tp);
-	if (error) {
-		do_error(
 	_("allocation of the realtime summary ino failed, error = %d\n"),
-			error);
+				error);
+		}
 	}
 	libxfs_irele(ip);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/4] mkfs: use file write helper to populate files
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:23   ` [PATCH 3/4] xfs_repair: use libxfs_alloc_file_space to reallocate rt metadata Darrick J. Wong
@ 2023-12-27 13:23   ` Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the file write helper to write files into the filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h |    2 ++
 libxfs/util.c    |   69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mkfs/proto.c     |   26 ++++----------------
 3 files changed, 76 insertions(+), 21 deletions(-)


diff --git a/include/libxfs.h b/include/libxfs.h
index 1fc8bc0e97b..46003fe641d 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -176,6 +176,8 @@ extern int	libxfs_log_header(char *, uuid_t *, int, int, int, xfs_lsn_t,
 
 extern int	libxfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
 					xfs_off_t len, uint32_t bmapi_flags);
+extern int	libxfs_file_write(struct xfs_trans *tp, struct xfs_inode *ip,
+				  void *buf, size_t len, bool logit);
 
 /* XXX: this is messy and needs fixing */
 #ifndef __LIBXFS_INTERNAL_XFS_H__
diff --git a/libxfs/util.c b/libxfs/util.c
index aec7798a814..f2be9dbf4a2 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -537,3 +537,72 @@ get_random_u32(void)
 	return ret;
 }
 #endif
+
+/*
+ * Write a buffer to a file on the data device.  We assume there are no holes
+ * and no unwritten extents.
+ */
+int
+libxfs_file_write(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	void			*buf,
+	size_t			len,
+	bool			logit)
+{
+	struct xfs_bmbt_irec	map;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_buf		*bp;
+	xfs_fileoff_t		bno = 0;
+	xfs_fileoff_t		end_bno = XFS_B_TO_FSB(mp, len);
+	size_t			count;
+	size_t			bcount;
+	int			nmap;
+	int			error = 0;
+
+	/* Write up to 1MB at a time. */
+	while (bno < end_bno) {
+		xfs_filblks_t	maplen;
+
+		maplen = min(end_bno - bno, XFS_B_TO_FSBT(mp, 1048576));
+		nmap = 1;
+		error = libxfs_bmapi_read(ip, bno, maplen, &map, &nmap, 0);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -ENOSPC;
+
+		if (map.br_startblock == HOLESTARTBLOCK ||
+		    map.br_state == XFS_EXT_UNWRITTEN)
+			return -EINVAL;
+
+		error = libxfs_trans_get_buf(tp, mp->m_dev,
+				XFS_FSB_TO_DADDR(mp, map.br_startblock),
+				XFS_FSB_TO_BB(mp, map.br_blockcount),
+				0, &bp);
+		if (error)
+			break;
+		bp->b_ops = NULL;
+
+		count = min(len, XFS_FSB_TO_B(mp, map.br_blockcount));
+		memmove(bp->b_addr, buf, count);
+		bcount = BBTOB(bp->b_length);
+		if (count < bcount)
+			memset((char *)bp->b_addr + count, 0, bcount - count);
+
+		if (tp) {
+			libxfs_trans_log_buf(tp, bp, 0, bcount - 1);
+		} else {
+			libxfs_buf_mark_dirty(bp);
+			libxfs_buf_relse(bp);
+		}
+		if (error)
+			break;
+
+		buf += count;
+		len -= count;
+		bno += map.br_blockcount;
+	}
+
+	return error;
+}
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 83888572395..436f9ac82b2 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -270,16 +270,12 @@ writefile(
 {
 	struct xfs_bmbt_irec	map;
 	struct xfs_mount	*mp;
-	struct xfs_buf		*bp;
-	xfs_daddr_t		d;
 	xfs_extlen_t		nb;
 	int			nmap;
 	int			error;
 
 	mp = ip->i_mount;
 	if (len > 0) {
-		int	bcount;
-
 		nb = XFS_B_TO_FSB(mp, len);
 		nmap = 1;
 		error = -libxfs_bmapi_write(tp, ip, 0, nb, 0, nb, &map, &nmap);
@@ -289,30 +285,18 @@ writefile(
 					progname);
 			exit(1);
 		}
-		if (error) {
+		if (error)
 			fail(_("error allocating space for a file"), error);
-		}
 		if (nmap != 1) {
 			fprintf(stderr,
 				_("%s: cannot allocate space for file\n"),
 				progname);
 			exit(1);
 		}
-		d = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-		error = -libxfs_trans_get_buf(NULL, mp->m_dev, d,
-				nb << mp->m_blkbb_log, 0, &bp);
-		if (error) {
-			fprintf(stderr,
-				_("%s: cannot allocate buffer for file\n"),
-				progname);
-			exit(1);
-		}
-		memmove(bp->b_addr, buf, len);
-		bcount = BBTOB(bp->b_length);
-		if (len < bcount)
-			memset((char *)bp->b_addr + len, 0, bcount - len);
-		libxfs_buf_mark_dirty(bp);
-		libxfs_buf_relse(bp);
+
+		error = -libxfs_file_write(tp, ip, buf, len, false);
+		if (error)
+			fail(_("error writing file"), error);
 	}
 	ip->i_disk_size = len;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/9] xfs: give refcount btree cursor error tracepoints their own class
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
@ 2023-12-27 13:24   ` Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 2/9] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert all the refcount tracepoints to use the btree error tracepoint
class.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 0e8daab9986..9bb7acbdc6f 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -210,8 +210,7 @@ xfs_refcount_update(
 
 	error = xfs_btree_update(cur, &rec);
 	if (error)
-		trace_xfs_refcount_update_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_update_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -246,8 +245,7 @@ xfs_refcount_insert(
 
 out_error:
 	if (error)
-		trace_xfs_refcount_insert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_insert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -287,8 +285,7 @@ xfs_refcount_delete(
 			&found_rec);
 out_error:
 	if (error)
-		trace_xfs_refcount_delete_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_delete_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -437,8 +434,7 @@ xfs_refcount_split_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_split_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -521,8 +517,7 @@ xfs_refcount_merge_center_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -588,8 +583,7 @@ xfs_refcount_merge_left_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -657,8 +651,7 @@ xfs_refcount_merge_right_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -752,8 +745,7 @@ xfs_refcount_find_left_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -847,8 +839,7 @@ xfs_refcount_find_right_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1253,8 +1244,7 @@ xfs_refcount_adjust_extents(
 
 	return error;
 out_error:
-	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1314,8 +1304,7 @@ xfs_refcount_adjust(
 	return 0;
 
 out_error:
-	trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			error, _RET_IP_);
+	trace_xfs_refcount_adjust_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1629,8 +1618,7 @@ xfs_refcount_find_shared(
 
 out_error:
 	if (error)
-		trace_xfs_refcount_find_shared_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1785,8 +1773,7 @@ xfs_refcount_adjust_cow_extents(
 
 	return error;
 out_error:
-	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1832,8 +1819,7 @@ xfs_refcount_adjust_cow(
 	return 0;
 
 out_error:
-	trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			error, _RET_IP_);
+	trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_);
 	return error;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/9] xfs: create specialized classes for refcount tracepoints
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 1/9] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
@ 2023-12-27 13:24   ` Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 3/9] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The only user of the "ag" tracepoint event classes is the refcount
btree, so rename them to make that obvious and make them take the btree
cursor to simplify the arguments.  This will save us a lot of trouble
later on.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 9bb7acbdc6f..67c9895efb3 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -50,7 +50,7 @@ xfs_refcount_lookup_le(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_LE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -70,7 +70,7 @@ xfs_refcount_lookup_ge(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_GE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -90,7 +90,7 @@ xfs_refcount_lookup_eq(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_LE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -1261,11 +1261,9 @@ xfs_refcount_adjust(
 	int			error;
 
 	if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
-		trace_xfs_refcount_increase(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+		trace_xfs_refcount_increase(cur, *agbno, *aglen);
 	else
-		trace_xfs_refcount_decrease(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+		trace_xfs_refcount_decrease(cur, *agbno, *aglen);
 
 	/*
 	 * Ensure that no rcextents cross the boundary of the adjustment range.
@@ -1525,8 +1523,7 @@ xfs_refcount_find_shared(
 	int				have;
 	int				error;
 
-	trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_find_shared(cur, agbno, aglen);
 
 	/* By default, skip the whole range */
 	*fbno = NULLAGBLOCK;
@@ -1613,8 +1610,7 @@ xfs_refcount_find_shared(
 	}
 
 done:
-	trace_xfs_refcount_find_shared_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, *fbno, *flen);
+	trace_xfs_refcount_find_shared_result(cur, *fbno, *flen);
 
 out_error:
 	if (error)
@@ -1832,8 +1828,7 @@ __xfs_refcount_cow_alloc(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		aglen)
 {
-	trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_cow_increase(rcur, agbno, aglen);
 
 	/* Add refcount btree reservation */
 	return xfs_refcount_adjust_cow(rcur, agbno, aglen,
@@ -1849,8 +1844,7 @@ __xfs_refcount_cow_free(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		aglen)
 {
-	trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_cow_decrease(rcur, agbno, aglen);
 
 	/* Remove refcount btree reservation */
 	return xfs_refcount_adjust_cow(rcur, agbno, aglen,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/9] xfs: prepare refcount btree tracepoints for widening
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 1/9] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 2/9] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
@ 2023-12-27 13:24   ` Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 4/9] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the rest of refcount btree tracepoints for use with realtime
reflink by making them take the btree cursor object as a parameter.
This will save us a lot of trouble later on.

Remove the xfs_refcount_recover_extent tracepoint since it's already
covered by other refcount tracepoints.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 67c9895efb3..18b04c38cdd 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -182,7 +182,7 @@ xfs_refcount_get_rec(
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, irec);
 
-	trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_get(cur, irec);
 	return 0;
 }
 
@@ -200,7 +200,7 @@ xfs_refcount_update(
 	uint32_t		start;
 	int			error;
 
-	trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_update(cur, irec);
 
 	start = xfs_refcount_encode_startblock(irec->rc_startblock,
 			irec->rc_domain);
@@ -227,7 +227,7 @@ xfs_refcount_insert(
 {
 	int				error;
 
-	trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_insert(cur, irec);
 
 	cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
 	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
@@ -272,7 +272,7 @@ xfs_refcount_delete(
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
-	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
+	trace_xfs_refcount_delete(cur, &irec);
 	error = xfs_btree_delete(cur, i);
 	if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
 		xfs_btree_mark_sick(cur);
@@ -409,8 +409,7 @@ xfs_refcount_split_extent(
 		return 0;
 
 	*shape_changed = true;
-	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			&rcext, agbno);
+	trace_xfs_refcount_split_extent(cur, &rcext, agbno);
 
 	/* Establish the right extent. */
 	tmp = rcext;
@@ -453,8 +452,7 @@ xfs_refcount_merge_center_extents(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_center_extents(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, left, center, right);
+	trace_xfs_refcount_merge_center_extents(cur, left, center, right);
 
 	ASSERT(left->rc_domain == center->rc_domain);
 	ASSERT(right->rc_domain == center->rc_domain);
@@ -535,8 +533,7 @@ xfs_refcount_merge_left_extent(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_left_extent(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, left, cleft);
+	trace_xfs_refcount_merge_left_extent(cur, left, cleft);
 
 	ASSERT(left->rc_domain == cleft->rc_domain);
 
@@ -600,8 +597,7 @@ xfs_refcount_merge_right_extent(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_right_extent(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, cright, right);
+	trace_xfs_refcount_merge_right_extent(cur, cright, right);
 
 	ASSERT(right->rc_domain == cright->rc_domain);
 
@@ -740,8 +736,7 @@ xfs_refcount_find_left_extents(
 		cleft->rc_refcount = 1;
 		cleft->rc_domain = domain;
 	}
-	trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			left, cleft, agbno);
+	trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno);
 	return error;
 
 out_error:
@@ -834,8 +829,8 @@ xfs_refcount_find_right_extents(
 		cright->rc_refcount = 1;
 		cright->rc_domain = domain;
 	}
-	trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			cright, right, agbno + aglen);
+	trace_xfs_refcount_find_right_extent(cur, cright, right,
+			agbno + aglen);
 	return error;
 
 out_error:
@@ -1138,8 +1133,7 @@ xfs_refcount_adjust_extents(
 			tmp.rc_refcount = 1 + adj;
 			tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
 
-			trace_xfs_refcount_modify_extent(cur->bc_mp,
-					cur->bc_ag.pag->pag_agno, &tmp);
+			trace_xfs_refcount_modify_extent(cur, &tmp);
 
 			/*
 			 * Either cover the hole (increment) or
@@ -1204,8 +1198,7 @@ xfs_refcount_adjust_extents(
 		if (ext.rc_refcount == MAXREFCOUNT)
 			goto skip;
 		ext.rc_refcount += adj;
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &ext);
+		trace_xfs_refcount_modify_extent(cur, &ext);
 		cur->bc_ag.refc.nr_ops++;
 		if (ext.rc_refcount > 1) {
 			error = xfs_refcount_update(cur, &ext);
@@ -1720,8 +1713,7 @@ xfs_refcount_adjust_cow_extents(
 		tmp.rc_refcount = 1;
 		tmp.rc_domain = XFS_REFC_DOMAIN_COW;
 
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &tmp);
+		trace_xfs_refcount_modify_extent(cur, &tmp);
 
 		error = xfs_refcount_insert(cur, &tmp,
 				&found_tmp);
@@ -1752,8 +1744,7 @@ xfs_refcount_adjust_cow_extents(
 		}
 
 		ext.rc_refcount = 0;
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &ext);
+		trace_xfs_refcount_modify_extent(cur, &ext);
 		error = xfs_refcount_delete(cur, &found_rec);
 		if (error)
 			goto out_error;
@@ -1987,9 +1978,6 @@ xfs_refcount_recover_cow_leftovers(
 		if (error)
 			goto out_free;
 
-		trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
-				&rr->rr_rrec);
-
 		/* Free the orphan record */
 		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
 				rr->rr_rrec.rc_startblock);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/9] xfs: clean up refcount log intent item tracepoint callsites
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:24   ` [PATCH 3/9] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
@ 2023-12-27 13:24   ` Darrick J. Wong
  2023-12-27 13:25   ` [PATCH 5/9] xfs: add a ci_entry helper Darrick J. Wong
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore refcount intent structure to the tracepoints instead of
open-coding the argument passing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   14 ++++----------
 libxfs/xfs_refcount.h |    6 ++++++
 2 files changed, 10 insertions(+), 10 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 18b04c38cdd..3ae68ea22e3 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1366,9 +1366,7 @@ xfs_refcount_finish_one(
 
 	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
 
-	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
-			ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
-			ri->ri_blockcount);
+	trace_xfs_refcount_deferred(mp, ri);
 
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
 		return -EIO;
@@ -1431,8 +1429,7 @@ xfs_refcount_finish_one(
 		return -EFSCORRUPTED;
 	}
 	if (!error && ri->ri_blockcount > 0)
-		trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno,
-				ri->ri_type, bno, ri->ri_blockcount);
+		trace_xfs_refcount_finish_one_leftover(mp, ri);
 	return error;
 }
 
@@ -1448,11 +1445,6 @@ __xfs_refcount_add(
 {
 	struct xfs_refcount_intent	*ri;
 
-	trace_xfs_refcount_defer(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, startblock),
-			type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
-			blockcount);
-
 	ri = kmem_cache_alloc(xfs_refcount_intent_cache,
 			GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
@@ -1460,6 +1452,8 @@ __xfs_refcount_add(
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
+	trace_xfs_refcount_defer(tp->t_mountp, ri);
+
 	xfs_refcount_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
 }
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index 9b56768a590..01a20621192 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -48,6 +48,12 @@ enum xfs_refcount_intent_type {
 	XFS_REFCOUNT_FREE_COW,
 };
 
+#define XFS_REFCOUNT_INTENT_STRINGS \
+	{ XFS_REFCOUNT_INCREASE,	"incr" }, \
+	{ XFS_REFCOUNT_DECREASE,	"decr" }, \
+	{ XFS_REFCOUNT_ALLOC_COW,	"alloc_cow" }, \
+	{ XFS_REFCOUNT_FREE_COW,	"free_cow" }
+
 struct xfs_refcount_intent {
 	struct list_head			ri_list;
 	struct xfs_perag			*ri_pag;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/9] xfs: add a ci_entry helper
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:24   ` [PATCH 4/9] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-27 13:25   ` Darrick J. Wong
  2023-12-27 13:25   ` [PATCH 6/9] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a helper to translate from the item list head to the
refcount_intent_item structure and use it so shorten assignments and
avoid the need for extra local variables.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index e7270d02c4b..471e4f6867d 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -485,6 +485,11 @@ const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
 
 /* Reference Counting */
 
+static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_refcount_intent, ri_list);
+}
+
 /* Sort refcount intents by AG. */
 static int
 xfs_refcount_update_diff_items(
@@ -492,11 +497,8 @@ xfs_refcount_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	const struct xfs_refcount_intent *ra;
-	const struct xfs_refcount_intent *rb;
-
-	ra = container_of(a, struct xfs_refcount_intent, ri_list);
-	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	struct xfs_refcount_intent	*ra = ci_entry(a);
+	struct xfs_refcount_intent	*rb = ci_entry(b);
 
 	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
@@ -551,10 +553,9 @@ xfs_refcount_update_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_refcount_intent	*ri;
+	struct xfs_refcount_intent	*ri = ci_entry(item);
 	int				error;
 
-	ri = container_of(item, struct xfs_refcount_intent, ri_list);
 	error = xfs_refcount_finish_one(tp, ri, state);
 
 	/* Did we run out of reservation?  Requeue what we didn't finish. */
@@ -581,9 +582,7 @@ STATIC void
 xfs_refcount_update_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_refcount_intent	*ri;
-
-	ri = container_of(item, struct xfs_refcount_intent, ri_list);
+	struct xfs_refcount_intent	*ri = ci_entry(item);
 
 	xfs_refcount_update_put_group(ri);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/9] xfs: reuse xfs_refcount_update_cancel_item
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:25   ` [PATCH 5/9] xfs: add a ci_entry helper Darrick J. Wong
@ 2023-12-27 13:25   ` Darrick J. Wong
  2023-12-27 13:25   ` [PATCH 7/9] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reuse xfs_refcount_update_cancel_item to put the AG/RTG and free the
item in a few places that currently open code the logic.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 471e4f6867d..e056c3b449b 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -545,6 +545,17 @@ xfs_refcount_update_put_group(
 	xfs_perag_intent_put(ri->ri_pag);
 }
 
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+
+	xfs_refcount_update_put_group(ri);
+	kmem_cache_free(xfs_refcount_intent_cache, ri);
+}
+
 /* Process a deferred refcount update. */
 STATIC int
 xfs_refcount_update_finish_item(
@@ -565,8 +576,7 @@ xfs_refcount_update_finish_item(
 		return -EAGAIN;
 	}
 
-	xfs_refcount_update_put_group(ri);
-	kmem_cache_free(xfs_refcount_intent_cache, ri);
+	xfs_refcount_update_cancel_item(item);
 	return error;
 }
 
@@ -577,17 +587,6 @@ xfs_refcount_update_abort_intent(
 {
 }
 
-/* Cancel a deferred refcount update. */
-STATIC void
-xfs_refcount_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_refcount_intent	*ri = ci_entry(item);
-
-	xfs_refcount_update_put_group(ri);
-	kmem_cache_free(xfs_refcount_intent_cache, ri);
-}
-
 const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.name		= "refcount",
 	.create_intent	= xfs_refcount_update_create_intent,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/9] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:25   ` [PATCH 6/9] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
@ 2023-12-27 13:25   ` Darrick J. Wong
  2023-12-27 13:25   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 9/9] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In xfs_refcount_finish_one we know the cursor is non-zero when calling
xfs_refcount_finish_one_cleanup and we pass a 0 error variable.  This
means xfs_refcount_finish_one_cleanup is just doing a
xfs_btree_del_cursor.

Open code that and move xfs_refcount_finish_one_cleanup to
fs/xfs/xfs_refcount_item.c.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c   |   17 +++++++++++++++++
 libxfs/xfs_refcount.c |   19 +------------------
 libxfs/xfs_refcount.h |    2 --
 3 files changed, 18 insertions(+), 20 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index e056c3b449b..58a18c7876d 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -587,6 +587,23 @@ xfs_refcount_update_abort_intent(
 {
 }
 
+/* Clean up after calling xfs_refcount_finish_one. */
+STATIC void
+xfs_refcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp;
+
+	if (rcur == NULL)
+		return;
+	agbp = rcur->bc_ag.agbp;
+	xfs_btree_del_cursor(rcur, error);
+	if (error)
+		xfs_trans_brelse(tp, agbp);
+}
+
 const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.name		= "refcount",
 	.create_intent	= xfs_refcount_update_create_intent,
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 3ae68ea22e3..635bbf7f99d 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1299,23 +1299,6 @@ xfs_refcount_adjust(
 	return error;
 }
 
-/* Clean up after calling xfs_refcount_finish_one. */
-void
-xfs_refcount_finish_one_cleanup(
-	struct xfs_trans	*tp,
-	struct xfs_btree_cur	*rcur,
-	int			error)
-{
-	struct xfs_buf		*agbp;
-
-	if (rcur == NULL)
-		return;
-	agbp = rcur->bc_ag.agbp;
-	xfs_btree_del_cursor(rcur, error);
-	if (error)
-		xfs_trans_brelse(tp, agbp);
-}
-
 /*
  * Set up a continuation a deferred refcount operation by updating the intent.
  * Checks to make sure we're not going to run off the end of the AG.
@@ -1379,7 +1362,7 @@ xfs_refcount_finish_one(
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		nr_ops = rcur->bc_ag.refc.nr_ops;
 		shape_changes = rcur->bc_ag.refc.shape_changes;
-		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
 	}
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index 01a20621192..c94b8f71d40 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -82,8 +82,6 @@ void xfs_refcount_increase_extent(struct xfs_trans *tp,
 void xfs_refcount_decrease_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 
-extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
-		struct xfs_btree_cur *rcur, int error);
 extern int xfs_refcount_finish_one(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/9] xfs: simplify usage of the rcur local variable in xfs_refcount_finish_one
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:25   ` [PATCH 7/9] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
@ 2023-12-27 13:25   ` Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 9/9] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Only update rcur when we know the final *pcur value.

Inspired-by: Christoph Hellwig <hch@lst.de>
[djwong: don't leave the caller with a dangling ref]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 635bbf7f99d..5cd279786ce 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1340,7 +1340,7 @@ xfs_refcount_finish_one(
 	struct xfs_btree_cur		**pcur)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_btree_cur		*rcur;
+	struct xfs_btree_cur		*rcur = *pcur;
 	struct xfs_buf			*agbp = NULL;
 	int				error = 0;
 	xfs_agblock_t			bno;
@@ -1358,7 +1358,6 @@ xfs_refcount_finish_one(
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
-	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		nr_ops = rcur->bc_ag.refc.nr_ops;
 		shape_changes = rcur->bc_ag.refc.shape_changes;
@@ -1372,11 +1371,11 @@ xfs_refcount_finish_one(
 		if (error)
 			return error;
 
-		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+		*pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
+							  ri->ri_pag);
 		rcur->bc_ag.refc.nr_ops = nr_ops;
 		rcur->bc_ag.refc.shape_changes = shape_changes;
 	}
-	*pcur = rcur;
 
 	switch (ri->ri_type) {
 	case XFS_REFCOUNT_INCREASE:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 9/9] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:25   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
@ 2023-12-27 13:26   ` Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_refcount_update_item deferred
work data to a transaction live with the CUI log item code.  This means
that the refcount code no longer has to know about the inner workings of
the CUI log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c   |   21 +++++++++------------
 libxfs/defer_item.h   |    5 +++++
 libxfs/xfs_refcount.c |    6 ++----
 libxfs/xfs_refcount.h |    3 ---
 4 files changed, 16 insertions(+), 19 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 58a18c7876d..3956a38b414 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -528,21 +528,18 @@ xfs_refcount_update_create_done(
 	return NULL;
 }
 
-/* Take an active ref to the AG containing the space we're refcounting. */
+/* Add this deferred CUI to the transaction. */
 void
-xfs_refcount_update_get_group(
-	struct xfs_mount		*mp,
+xfs_refcount_defer_add(
+	struct xfs_trans		*tp,
 	struct xfs_refcount_intent	*ri)
 {
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	trace_xfs_refcount_defer(mp, ri);
+
 	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
-}
-
-/* Release an active AG ref after finishing refcounting work. */
-static inline void
-xfs_refcount_update_put_group(
-	struct xfs_refcount_intent	*ri)
-{
-	xfs_perag_intent_put(ri->ri_pag);
+	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
 }
 
 /* Cancel a deferred refcount update. */
@@ -552,7 +549,7 @@ xfs_refcount_update_cancel_item(
 {
 	struct xfs_refcount_intent	*ri = ci_entry(item);
 
-	xfs_refcount_update_put_group(ri);
+	xfs_perag_intent_put(ri->ri_pag);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);
 }
 
diff --git a/libxfs/defer_item.h b/libxfs/defer_item.h
index 3ef31ad0aec..bbb4587b97f 100644
--- a/libxfs/defer_item.h
+++ b/libxfs/defer_item.h
@@ -24,4 +24,9 @@ struct xfs_rmap_intent;
 
 void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
 
+struct xfs_refcount_intent;
+
+void xfs_refcount_defer_add(struct xfs_trans *tp,
+		struct xfs_refcount_intent *ri);
+
 #endif /* __LIBXFS_DEFER_ITEM_H_ */
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 5cd279786ce..b094d9a41f6 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -23,6 +23,7 @@
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "defer_item.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -1434,10 +1435,7 @@ __xfs_refcount_add(
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
-	trace_xfs_refcount_defer(tp->t_mountp, ri);
-
-	xfs_refcount_update_get_group(tp->t_mountp, ri);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+	xfs_refcount_defer_add(tp, ri);
 }
 
 /*
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index c94b8f71d40..68acb0b1b4a 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -74,9 +74,6 @@ xfs_refcount_check_domain(
 	return true;
 }
 
-void xfs_refcount_update_get_group(struct xfs_mount *mp,
-		struct xfs_refcount_intent *ri);
-
 void xfs_refcount_increase_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 void xfs_refcount_decrease_extent(struct xfs_trans *tp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/42] xfs: introduce realtime refcount btree definitions
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
@ 2023-12-27 13:26   ` Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 02/42] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
                     ` (40 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add new realtime refcount btree definitions. The realtime refcount btree
will be rooted from a hidden inode, but has its own shape and therefore
needs to have most of its own separate types.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_btree.h  |    1 +
 libxfs/xfs_format.h |    6 ++++++
 libxfs/xfs_types.h  |    5 +++--
 3 files changed, 10 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index 4753a5c8476..f58240adda6 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -65,6 +65,7 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 #define	XFS_BTNUM_RCBAG	((xfs_btnum_t)XFS_BTNUM_RCBAGi)
 #define	XFS_BTNUM_RTRMAP ((xfs_btnum_t)XFS_BTNUM_RTRMAPi)
+#define	XFS_BTNUM_RTREFC ((xfs_btnum_t)XFS_BTNUM_RTREFCi)
 
 struct xfs_btree_ops;
 uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 1c1910256a9..0dc169fde2e 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1815,6 +1815,12 @@ struct xfs_refcount_key {
 /* btree pointer type */
 typedef __be32 xfs_refcount_ptr_t;
 
+/*
+ * Realtime Reference Count btree format definitions
+ *
+ * This is a btree for reference count records for realtime volumes
+ */
+#define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
 /*
  * BMAP Btree format definitions
diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h
index b3edc57dc65..4147ba288ec 100644
--- a/libxfs/xfs_types.h
+++ b/libxfs/xfs_types.h
@@ -126,7 +126,7 @@ typedef enum {
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
 	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_RCBAGi,
-	XFS_BTNUM_RTRMAPi, XFS_BTNUM_MAX
+	XFS_BTNUM_RTRMAPi, XFS_BTNUM_RTREFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 #define XFS_BTNUM_STRINGS \
@@ -138,7 +138,8 @@ typedef enum {
 	{ XFS_BTNUM_FINOi,	"finobt" }, \
 	{ XFS_BTNUM_REFCi,	"refcbt" }, \
 	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }, \
-	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }
+	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }, \
+	{ XFS_BTNUM_RTREFCi,	"rtrefcbt" }
 
 struct xfs_name {
 	const unsigned char	*name;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/42] xfs: namespace the maximum length/refcount symbols
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 01/42] xfs: introduce realtime refcount btree definitions Darrick J. Wong
@ 2023-12-27 13:26   ` Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 03/42] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
                     ` (39 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Actually namespace these variables properly, so that readers can tell
that this is an XFS symbol, and that it's for the refcount
functionality.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h   |    4 ++--
 libxfs/xfs_refcount.c |   18 +++++++++---------
 repair/rmap.c         |    4 ++--
 repair/scan.c         |    2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 0dc169fde2e..473bdc2a1ad 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1809,8 +1809,8 @@ struct xfs_refcount_key {
 	__be32		rc_startblock;	/* starting block number */
 };
 
-#define MAXREFCOUNT	((xfs_nlink_t)~0U)
-#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)
+#define XFS_REFC_REFCOUNT_MAX	((xfs_nlink_t)~0U)
+#define XFS_REFC_LEN_MAX	((xfs_extlen_t)~0U)
 
 /* btree pointer type */
 typedef __be32 xfs_refcount_ptr_t;
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index b094d9a41f6..e98d5ea6ca2 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -127,7 +127,7 @@ xfs_refcount_check_irec(
 	struct xfs_perag		*pag,
 	const struct xfs_refcount_irec	*irec)
 {
-	if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+	if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
 		return __this_address;
 
 	if (!xfs_refcount_check_domain(irec))
@@ -137,7 +137,7 @@ xfs_refcount_check_irec(
 	if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
 		return __this_address;
 
-	if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+	if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
 		return __this_address;
 
 	return NULL;
@@ -852,9 +852,9 @@ xfs_refc_merge_refcount(
 	const struct xfs_refcount_irec	*irec,
 	enum xfs_refc_adjust_op		adjust)
 {
-	/* Once a record hits MAXREFCOUNT, it is pinned there forever */
-	if (irec->rc_refcount == MAXREFCOUNT)
-		return MAXREFCOUNT;
+	/* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */
+	if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX)
+		return XFS_REFC_REFCOUNT_MAX;
 	return irec->rc_refcount + adjust;
 }
 
@@ -897,7 +897,7 @@ xfs_refc_want_merge_center(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cleft->rc_blockcount + right->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	*ulenp = ulen;
@@ -932,7 +932,7 @@ xfs_refc_want_merge_left(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cleft->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	return true;
@@ -966,7 +966,7 @@ xfs_refc_want_merge_right(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cright->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	return true;
@@ -1196,7 +1196,7 @@ xfs_refcount_adjust_extents(
 		 * Adjust the reference count and either update the tree
 		 * (incr) or free the blocks (decr).
 		 */
-		if (ext.rc_refcount == MAXREFCOUNT)
+		if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX)
 			goto skip;
 		ext.rc_refcount += adj;
 		trace_xfs_refcount_modify_extent(cur, &ext);
diff --git a/repair/rmap.c b/repair/rmap.c
index 1312a0dde34..6fd537f533f 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -1004,8 +1004,8 @@ refcount_emit(
 		agno, agbno, len, nr_rmaps);
 	rlrec.rc_startblock = agbno;
 	rlrec.rc_blockcount = len;
-	if (nr_rmaps > MAXREFCOUNT)
-		nr_rmaps = MAXREFCOUNT;
+	if (nr_rmaps > XFS_REFC_REFCOUNT_MAX)
+		nr_rmaps = XFS_REFC_REFCOUNT_MAX;
 	rlrec.rc_refcount = nr_rmaps;
 	rlrec.rc_domain = XFS_REFC_DOMAIN_SHARED;
 
diff --git a/repair/scan.c b/repair/scan.c
index 2f414898078..ba634af8bb1 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1895,7 +1895,7 @@ _("extent (%u/%u) len %u claimed, state is %d\n"),
 						break;
 					}
 				}
-			} else if (nr < 2 || nr > MAXREFCOUNT) {
+			} else if (nr < 2 || nr > XFS_REFC_REFCOUNT_MAX) {
 				do_warn(
 	_("invalid reference count %u in record %u of %s btree block %u/%u\n"),
 					nr, i, name, agno, bno);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/42] xfs: define the on-disk realtime refcount btree format
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 01/42] xfs: introduce realtime refcount btree definitions Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 02/42] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
@ 2023-12-27 13:26   ` Darrick J. Wong
  2023-12-27 13:27   ` [PATCH 04/42] xfs: realtime refcount btree transaction reservations Darrick J. Wong
                     ` (38 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Start filling out the rtrefcount btree implementation. Start with the
on-disk btree format; add everything needed to read, write and
manipulate refcount btree blocks. This prepares the way for connecting
the btree operations implementation.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h              |    1 
 include/xfs_mount.h           |    9 +
 libxfs/Makefile               |    2 
 libxfs/init.c                 |    6 +
 libxfs/xfs_btree.c            |    5 +
 libxfs/xfs_btree.h            |   11 +
 libxfs/xfs_format.h           |    3 
 libxfs/xfs_ondisk.h           |    1 
 libxfs/xfs_rtrefcount_btree.c |  310 +++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h |   71 +++++++++
 libxfs/xfs_sb.c               |    8 +
 libxfs/xfs_shared.h           |    2 
 12 files changed, 424 insertions(+), 5 deletions(-)
 create mode 100644 libxfs/xfs_rtrefcount_btree.c
 create mode 100644 libxfs/xfs_rtrefcount_btree.h


diff --git a/include/libxfs.h b/include/libxfs.h
index 46003fe641d..83cee8a6e4c 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -83,6 +83,7 @@ struct iomap;
 #include "xfs_rmap_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_rtrefcount_btree.h"
 #include "xfs_refcount.h"
 #include "xfs_btree_staging.h"
 #include "xfs_rtbitmap.h"
diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 07f9e33b8b2..916da55793b 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -96,11 +96,14 @@ typedef struct xfs_mount {
 	uint			m_rtrmap_mnr[2]; /* min rtrmap btree records */
 	uint			m_refc_mxr[2];	/* max refc btree records */
 	uint			m_refc_mnr[2];	/* min refc btree records */
+	unsigned int		m_rtrefc_mxr[2]; /* max rtrefc btree records */
+	unsigned int		m_rtrefc_mnr[2]; /* min rtrefc btree records */
 	uint			m_alloc_maxlevels; /* max alloc btree levels */
 	uint			m_bm_maxlevels[2];  /* max bmap btree levels */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
 	uint			m_rtrmap_maxlevels; /* max rtrmap btree level */
 	uint			m_refc_maxlevels; /* max refc btree levels */
+	unsigned int		m_rtrefc_maxlevels; /* max rtrefc btree level */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
 	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -242,6 +245,12 @@ static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
 	       xfs_has_rmapbt(mp);
 }
 
+static inline bool xfs_has_rtreflink(struct xfs_mount *mp)
+{
+	return xfs_has_metadir(mp) && xfs_has_realtime(mp) &&
+	       xfs_has_reflink(mp);
+}
+
 /* Kernel mount features that we don't support */
 #define __XFS_UNSUPP_FEAT(name) \
 static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
diff --git a/libxfs/Makefile b/libxfs/Makefile
index 0b3e8c896bd..6faeca34562 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -59,6 +59,7 @@ HFILES = \
 	xfs_quota_defs.h \
 	xfs_refcount.h \
 	xfs_refcount_btree.h \
+	xfs_rtrefcount_btree.h \
 	xfs_rmap.h \
 	xfs_rmap_btree.h \
 	xfs_rtbitmap.h \
@@ -118,6 +119,7 @@ CFILES = cache.c \
 	xfs_parent.c \
 	xfs_refcount.c \
 	xfs_refcount_btree.c \
+	xfs_rtrefcount_btree.c \
 	xfs_rmap.c \
 	xfs_rmap_btree.c \
 	xfs_rtbitmap.c \
diff --git a/libxfs/init.c b/libxfs/init.c
index 18bd2116c50..36b4b486145 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -650,7 +650,10 @@ static inline void
 xfs_rtbtree_compute_maxlevels(
 	struct xfs_mount	*mp)
 {
-	mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels;
+	unsigned int		levels;
+
+	levels = max(mp->m_rtrmap_maxlevels, mp->m_rtrefc_maxlevels);
+	mp->m_rtbtree_maxlevels = levels;
 }
 
 /* Compute maximum possible height of all btrees. */
@@ -668,6 +671,7 @@ libxfs_compute_all_maxlevels(
 	xfs_rmapbt_compute_maxlevels(mp);
 	xfs_rtrmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
+	xfs_rtrefcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
 	xfs_rtbtree_compute_maxlevels(mp);
diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c
index 450c48ceaf1..ebb409e4280 100644
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_bmap.h"
 #include "xfs_rmap.h"
 #include "xfs_imeta.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Btree magic numbers.
@@ -5534,6 +5535,9 @@ xfs_btree_init_cur_caches(void)
 	if (error)
 		goto err;
 	error = xfs_rtrmapbt_init_cur_cache();
+	if (error)
+		goto err;
+	error = xfs_rtrefcountbt_init_cur_cache();
 	if (error)
 		goto err;
 
@@ -5553,6 +5557,7 @@ xfs_btree_destroy_cur_caches(void)
 	xfs_rmapbt_destroy_cur_cache();
 	xfs_refcountbt_destroy_cur_cache();
 	xfs_rtrmapbt_destroy_cur_cache();
+	xfs_rtrefcountbt_destroy_cur_cache();
 }
 
 /* Move the btree cursor before the first record. */
diff --git a/libxfs/xfs_btree.h b/libxfs/xfs_btree.h
index f58240adda6..64e37a0ffb7 100644
--- a/libxfs/xfs_btree.h
+++ b/libxfs/xfs_btree.h
@@ -229,6 +229,11 @@ union xfs_btree_irec {
 	struct xfs_refcount_irec	rc;
 };
 
+struct xbtree_refc {
+	unsigned int	nr_ops;		/* # record updates */
+	unsigned int	shape_changes;	/* # of extent splits */
+};
+
 /* Per-AG btree information. */
 struct xfs_btree_cur_ag {
 	struct xfs_perag		*pag;
@@ -237,10 +242,7 @@ struct xfs_btree_cur_ag {
 		struct xbtree_afakeroot	*afake;	/* for staging cursor */
 	};
 	union {
-		struct {
-			unsigned int	nr_ops;	/* # record updates */
-			unsigned int	shape_changes;	/* # of extent splits */
-		} refc;
+		struct xbtree_refc	refc;
 		struct {
 			bool		active;	/* allocation cursor state */
 		} abt;
@@ -261,6 +263,7 @@ struct xfs_btree_cur_ino {
 
 /* For extent swap, ignore owner check in verifier */
 #define	XFS_BTCUR_BMBT_INVALID_OWNER	(1 << 1)
+	struct xbtree_refc		refc;
 };
 
 /* In-memory btree information */
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 473bdc2a1ad..c938b814c43 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1822,6 +1822,9 @@ typedef __be32 xfs_refcount_ptr_t;
  */
 #define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
+/* inode-rooted btree pointer type */
+typedef __be64 xfs_rtrefcount_ptr_t;
+
 /*
  * BMAP Btree format definitions
  *
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 102a3574fc6..242b6831256 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -79,6 +79,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
+	XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t,		8);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
new file mode 100644
index 00000000000..e0db4cbf34c
--- /dev/null
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+
+static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
+
+/*
+ * Realtime Reference Count btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device.  See the comments in xfs_refcount_btree.c for more information.
+ *
+ * This tree is basically the same as the regular refcount btree except that
+ * it's rooted in an inode.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_rtrefcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_ino.rtg, cur->bc_ino.ip);
+
+	/* Copy the flags values since init cursor doesn't get them. */
+	new->bc_ino.flags = cur->bc_ino.flags;
+
+	return new;
+}
+
+static xfs_failaddr_t
+xfs_rtrefcountbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	int			level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (!xfs_has_reflink(mp))
+		return __this_address;
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+	level = be16_to_cpu(block->bb_level);
+	if (level > mp->m_rtrefc_maxlevels)
+		return __this_address;
+
+	return xfs_btree_lblock_verify(bp, mp->m_rtrefc_mxr[level != 0]);
+}
+
+static void
+xfs_rtrefcountbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtrefcountbt_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+
+	if (bp->b_error)
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrefcountbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	fa = xfs_rtrefcountbt_verify(bp);
+	if (fa) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
+	.name			= "xfs_rtrefcountbt",
+	.magic			= { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) },
+	.verify_read		= xfs_rtrefcountbt_read_verify,
+	.verify_write		= xfs_rtrefcountbt_write_verify,
+	.verify_struct		= xfs_rtrefcountbt_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
+	.rec_len		= sizeof(struct xfs_refcount_rec),
+	.key_len		= sizeof(struct xfs_refcount_key),
+	.lru_refs		= XFS_REFC_BTREE_REF,
+	.geom_flags		= XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE |
+				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_IROOT_RECORDS,
+
+	.dup_cursor		= xfs_rtrefcountbt_dup_cursor,
+	.buf_ops		= &xfs_rtrefcountbt_buf_ops,
+};
+
+/* Initialize a new rt refcount btree cursor. */
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTREFC,
+			&xfs_rtrefcountbt_ops, mp->m_rtrefc_maxlevels,
+			xfs_rtrefcountbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
+
+	cur->bc_ino.ip = ip;
+	cur->bc_ino.allocated = 0;
+	cur->bc_ino.flags = 0;
+	cur->bc_ino.refc.nr_ops = 0;
+	cur->bc_ino.refc.shape_changes = 0;
+
+	cur->bc_ino.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+/* Allocate a new rt refcount btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	cur = xfs_rtrefcountbt_init_common(mp, tp, rtg, ip);
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+	cur->bc_ino.whichfork = XFS_DATA_FORK;
+	return cur;
+}
+
+/* Create a new rt reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	struct xbtree_ifakeroot	*ifake)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_rtrefcountbt_init_common(mp, NULL, rtg, ip);
+	cur->bc_nlevels = ifake->if_levels;
+	cur->bc_ino.forksize = ifake->if_fork_size;
+	cur->bc_ino.whichfork = -1;
+	xfs_btree_stage_ifakeroot(cur, ifake, NULL);
+	return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrefcountbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp)
+{
+	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
+	struct xfs_ifork	*ifp;
+	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	/*
+	 * Free any resources hanging off the real fork, then shallow-copy the
+	 * staging fork's contents into the real fork to transfer everything
+	 * we just built.
+	 */
+	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+	xfs_idestroy_fork(ifp);
+	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK,
+			&xfs_rtrefcountbt_ops);
+}
+
+/* Calculate number of records in a realtime refcount btree block. */
+static inline unsigned int
+xfs_rtrefcountbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_refcount_rec);
+	return blocklen / (sizeof(struct xfs_refcount_key) +
+			   sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount btree block.
+ */
+unsigned int
+xfs_rtrefcountbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= XFS_RTREFCOUNT_BLOCK_LEN;
+	return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime refcount btrees. */
+unsigned int
+xfs_rtrefcountbt_maxlevels_ondisk(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2;
+
+	/* We need at most one record for every block in an rt group. */
+	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrefcountbt_init_cur_cache(void)
+{
+	xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur",
+			xfs_btree_cur_sizeof(
+					xfs_rtrefcountbt_maxlevels_ondisk()),
+			0, 0, NULL);
+
+	if (!xfs_rtrefcountbt_cur_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_rtrefcountbt_destroy_cur_cache(void)
+{
+	kmem_cache_destroy(xfs_rtrefcountbt_cur_cache);
+	xfs_rtrefcountbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of a realtime refcount btree. */
+void
+xfs_rtrefcountbt_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	unsigned int		d_maxlevels, r_maxlevels;
+
+	if (!xfs_has_rtreflink(mp)) {
+		mp->m_rtrefc_maxlevels = 0;
+		return;
+	}
+
+	/*
+	 * The realtime refcountbt lives on the data device, which means that
+	 * its maximum height is constrained by the size of the data device and
+	 * the height required to store one refcount record for each rtextent
+	 * in an rt group.
+	 */
+	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr,
+				mp->m_sb.sb_dblocks);
+	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr,
+				xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
+
+	/* Add one level to handle the inode root level. */
+	mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
diff --git a/libxfs/xfs_rtrefcount_btree.h b/libxfs/xfs_rtrefcount_btree.h
new file mode 100644
index 00000000000..6d23ab3a9ad
--- /dev/null
+++ b/libxfs/xfs_rtrefcount_btree.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTREFCOUNT_BTREE_H__
+#define __XFS_RTREFCOUNT_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* refcounts only exist on crc enabled filesystems */
+#define XFS_RTREFCOUNT_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		struct xfs_inode *ip);
+struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		struct xbtree_ifakeroot *ifake);
+void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_trans *tp);
+unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
+		unsigned int blocklen, bool leaf);
+void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_rec_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_refcount_rec *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_refcount_key *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_ptr_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrefcount_ptr_t *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 maxrecs * sizeof(struct xfs_refcount_key) +
+		 (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
+int __init xfs_rtrefcountbt_init_cur_cache(void);
+void xfs_rtrefcountbt_destroy_cur_cache(void);
+
+#endif	/* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 891b71190d5..4e3b481327c 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -27,6 +27,7 @@
 #include "xfs_swapext.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1134,6 +1135,13 @@ xfs_sb_mount_common(
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
 	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
 
+	mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+			true);
+	mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+			false);
+	mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2;
+	mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2;
+
 	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 	mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index adb742267c9..3a7f92a9ac7 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
@@ -56,6 +57,7 @@ extern const struct xfs_btree_ops xfs_bmbt_ops;
 extern const struct xfs_btree_ops xfs_refcountbt_ops;
 extern const struct xfs_btree_ops xfs_rmapbt_ops;
 extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrefcountbt_ops;
 
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/42] xfs: realtime refcount btree transaction reservations
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:26   ` [PATCH 03/42] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
@ 2023-12-27 13:27   ` Darrick J. Wong
  2023-12-27 13:27   ` [PATCH 05/42] xfs: add realtime refcount btree operations Darrick J. Wong
                     ` (37 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that there's enough log reservation to handle mapping
and unmapping realtime extents.  We have to reserve enough space
to handle a split in the rtrefcountbt to add the record and a second
split in the regular refcountbt to record the rtrefcountbt split.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_trans_resv.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index 18efae57975..2fd942c42e7 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -90,6 +90,14 @@ xfs_refcountbt_block_count(
 	return num_ops * (2 * mp->m_refc_maxlevels - 1);
 }
 
+static unsigned int
+xfs_rtrefcountbt_block_count(
+	struct xfs_mount	*mp,
+	unsigned int		num_ops)
+{
+	return num_ops * (2 * mp->m_rtrefc_maxlevels - 1);
+}
+
 /*
  * Logging inodes is really tricksy. They are logged in memory format,
  * which means that what we write into the log doesn't directly translate into
@@ -257,10 +265,13 @@ xfs_rtalloc_block_count(
  * Compute the log reservation required to handle the refcount update
  * transaction.  Refcount updates are always done via deferred log items.
  *
- * This is calculated as:
+ * This is calculated as the max of:
  * Data device refcount updates (t1):
  *    the agfs of the ags containing the blocks: nr_ops * sector size
  *    the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ * Realtime refcount updates (t2);
+ *    the rt refcount inode
+ *    the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
  */
 static unsigned int
 xfs_calc_refcountbt_reservation(
@@ -268,12 +279,20 @@ xfs_calc_refcountbt_reservation(
 	unsigned int		nr_ops)
 {
 	unsigned int		blksz = XFS_FSB_TO_B(mp, 1);
+	unsigned int		t1, t2 = 0;
 
 	if (!xfs_has_reflink(mp))
 		return 0;
 
-	return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
-	       xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+	t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+	     xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+
+	if (xfs_has_realtime(mp))
+		t2 = xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+				     blksz);
+
+	return max(t1, t2);
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/42] xfs: add realtime refcount btree operations
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:27   ` [PATCH 04/42] xfs: realtime refcount btree transaction reservations Darrick J. Wong
@ 2023-12-27 13:27   ` Darrick J. Wong
  2023-12-27 13:27   ` [PATCH 06/42] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
                     ` (36 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement the generic btree operations needed to manipulate rtrefcount
btree blocks. This is different from the regular refcountbt in that we
allocate space from the filesystem at large, and are neither constrained
to the free space nor any particular AG.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrefcount_btree.c |  148 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)


diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index e0db4cbf34c..fb4944d570f 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -19,6 +19,7 @@
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_refcount.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_rtgroup.h"
@@ -51,6 +52,106 @@ xfs_rtrefcountbt_dup_cursor(
 	return new;
 }
 
+STATIC int
+xfs_rtrefcountbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_rtrefc_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrefcountbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0);
+	}
+
+	return cur->bc_mp->m_rtrefc_mxr[level != 0];
+}
+
+STATIC void
+xfs_rtrefcountbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_rtrefcountbt_init_high_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	__u32				x;
+
+	x = be32_to_cpu(rec->refc.rc_startblock);
+	x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+	key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+	uint32_t		start;
+
+	start = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+	rec->refc.rc_startblock = cpu_to_be32(start);
+	rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+	rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	const struct xfs_refcount_key	*kp = &key->refc;
+	const struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
+	uint32_t			start;
+
+	start = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+	return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+			be32_to_cpu(k2->refc.rc_startblock);
+}
+
 static xfs_failaddr_t
 xfs_rtrefcountbt_verify(
 	struct xfs_buf		*bp)
@@ -117,6 +218,40 @@ const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
 	.verify_struct		= xfs_rtrefcountbt_verify,
 };
 
+STATIC int
+xfs_rtrefcountbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->refc.rc_startblock) <
+	       be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_rtrefcountbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	return  be32_to_cpu(r1->refc.rc_startblock) +
+		be32_to_cpu(r1->refc.rc_blockcount) <=
+		be32_to_cpu(r2->refc.rc_startblock);
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrefcountbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+				 be32_to_cpu(key2->refc.rc_startblock));
+}
+
 const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
@@ -125,7 +260,20 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_IROOT_RECORDS,
 
 	.dup_cursor		= xfs_rtrefcountbt_dup_cursor,
+	.alloc_block		= xfs_btree_alloc_imeta_block,
+	.free_block		= xfs_btree_free_imeta_block,
+	.get_minrecs		= xfs_rtrefcountbt_get_minrecs,
+	.get_maxrecs		= xfs_rtrefcountbt_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrefcountbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrefcountbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrefcountbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_rtrefcountbt_init_ptr_from_cur,
+	.key_diff		= xfs_rtrefcountbt_key_diff,
 	.buf_ops		= &xfs_rtrefcountbt_buf_ops,
+	.diff_two_keys		= xfs_rtrefcountbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrefcountbt_keys_inorder,
+	.recs_inorder		= xfs_rtrefcountbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrefcountbt_keys_contiguous,
 };
 
 /* Initialize a new rt refcount btree cursor. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/42] xfs: prepare refcount functions to deal with rtrefcountbt
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:27   ` [PATCH 05/42] xfs: add realtime refcount btree operations Darrick J. Wong
@ 2023-12-27 13:27   ` Darrick J. Wong
  2023-12-27 13:27   ` [PATCH 07/42] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
                     ` (35 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the high-level refcount functions to deal with the new realtime
refcountbt and its slightly different conventions.  Provide the ability
to talk to either refcountbt or rtrefcountbt formats from the same high
level code.

Note that we leave the _recover_cow_leftovers functions for a separate
patch so that we can convert it all at once.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   93 ++++++++++++++++++++++++++++++++++++++++---------
 libxfs/xfs_refcount.h |    3 ++
 2 files changed, 78 insertions(+), 18 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index e98d5ea6ca2..2202d9cfb37 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -24,6 +24,7 @@
 #include "xfs_ag.h"
 #include "xfs_health.h"
 #include "defer_item.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -40,6 +41,16 @@ STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
 STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
 		xfs_agblock_t agbno, xfs_extlen_t aglen);
 
+/* Return the maximum startblock number of the refcountbt. */
+static inline xfs_agblock_t
+xrefc_max_startblock(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return cur->bc_mp->m_sb.sb_rgblocks;
+	return cur->bc_mp->m_sb.sb_agblocks;
+}
+
 /*
  * Look up the first record less than or equal to [bno, len] in the btree
  * given by cur.
@@ -143,6 +154,37 @@ xfs_refcount_check_irec(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_rtrefcount_check_irec(
+	struct xfs_rtgroup		*rtg,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
+		return __this_address;
+
+	if (!xfs_refcount_check_domain(irec))
+		return __this_address;
+
+	/* check for valid extent range, including overflow */
+	if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount))
+		return __this_address;
+
+	if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
+		return __this_address;
+
+	return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_refcount_check_btrec(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return xfs_rtrefcount_check_irec(cur->bc_ino.rtg, irec);
+	return xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+}
+
 static inline int
 xfs_refcount_complain_bad_rec(
 	struct xfs_btree_cur		*cur,
@@ -151,9 +193,15 @@ xfs_refcount_complain_bad_rec(
 {
 	struct xfs_mount		*mp = cur->bc_mp;
 
-	xfs_warn(mp,
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC) {
+		xfs_warn(mp,
+ "RT Refcount BTree record corruption in rtgroup %u detected at %pS!",
+				cur->bc_ino.rtg->rtg_rgno, fa);
+	} else {
+		xfs_warn(mp,
  "Refcount BTree record corruption in AG %d detected at %pS!",
 				cur->bc_ag.pag->pag_agno, fa);
+	}
 	xfs_warn(mp,
 		"Start block 0x%x, block count 0x%x, references 0x%x",
 		irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -179,7 +227,7 @@ xfs_refcount_get_rec(
 		return error;
 
 	xfs_refcount_btrec_to_irec(rec, irec);
-	fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+	fa = xfs_refcount_check_btrec(cur, irec);
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, irec);
 
@@ -1046,6 +1094,15 @@ xfs_refcount_merge_extents(
 	return 0;
 }
 
+static inline struct xbtree_refc *
+xrefc_btree_state(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return &cur->bc_ino.refc;
+	return &cur->bc_ag.refc;
+}
+
 /*
  * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
  * true incorrectly is a shutdown FS; the penalty for guessing false
@@ -1063,25 +1120,25 @@ xfs_refcount_still_have_space(
 	 * to handle each of the shape changes to the refcount btree.
 	 */
 	overhead = xfs_allocfree_block_count(cur->bc_mp,
-				cur->bc_ag.refc.shape_changes);
-	overhead += cur->bc_mp->m_refc_maxlevels;
+				xrefc_btree_state(cur)->shape_changes);
+	overhead += cur->bc_maxlevels;
 	overhead *= cur->bc_mp->m_sb.sb_blocksize;
 
 	/*
 	 * Only allow 2 refcount extent updates per transaction if the
 	 * refcount continue update "error" has been injected.
 	 */
-	if (cur->bc_ag.refc.nr_ops > 2 &&
+	if (xrefc_btree_state(cur)->nr_ops > 2 &&
 	    XFS_TEST_ERROR(false, cur->bc_mp,
 			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
 		return false;
 
-	if (cur->bc_ag.refc.nr_ops == 0)
+	if (xrefc_btree_state(cur)->nr_ops == 0)
 		return true;
 	else if (overhead > cur->bc_tp->t_log_res)
 		return false;
 	return  cur->bc_tp->t_log_res - overhead >
-		cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+		xrefc_btree_state(cur)->nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
 /*
@@ -1116,7 +1173,7 @@ xfs_refcount_adjust_extents(
 		if (error)
 			goto out_error;
 		if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
-			ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+			ext.rc_startblock = xrefc_max_startblock(cur);
 			ext.rc_blockcount = 0;
 			ext.rc_refcount = 0;
 			ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
@@ -1140,7 +1197,7 @@ xfs_refcount_adjust_extents(
 			 * Either cover the hole (increment) or
 			 * delete the range (decrement).
 			 */
-			cur->bc_ag.refc.nr_ops++;
+			xrefc_btree_state(cur)->nr_ops++;
 			if (tmp.rc_refcount) {
 				error = xfs_refcount_insert(cur, &tmp,
 						&found_tmp);
@@ -1200,7 +1257,7 @@ xfs_refcount_adjust_extents(
 			goto skip;
 		ext.rc_refcount += adj;
 		trace_xfs_refcount_modify_extent(cur, &ext);
-		cur->bc_ag.refc.nr_ops++;
+		xrefc_btree_state(cur)->nr_ops++;
 		if (ext.rc_refcount > 1) {
 			error = xfs_refcount_update(cur, &ext);
 			if (error)
@@ -1286,7 +1343,7 @@ xfs_refcount_adjust(
 	if (shape_changed)
 		shape_changes++;
 	if (shape_changes)
-		cur->bc_ag.refc.shape_changes++;
+		xrefc_btree_state(cur)->shape_changes++;
 
 	/* Now that we've taken care of the ends, adjust the middle extents */
 	error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1360,8 +1417,8 @@ xfs_refcount_finish_one(
 	 * the startblock, get one now.
 	 */
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
-		nr_ops = rcur->bc_ag.refc.nr_ops;
-		shape_changes = rcur->bc_ag.refc.shape_changes;
+		nr_ops = xrefc_btree_state(rcur)->nr_ops;
+		shape_changes = xrefc_btree_state(rcur)->shape_changes;
 		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
@@ -1374,8 +1431,8 @@ xfs_refcount_finish_one(
 
 		*pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
 							  ri->ri_pag);
-		rcur->bc_ag.refc.nr_ops = nr_ops;
-		rcur->bc_ag.refc.shape_changes = shape_changes;
+		xrefc_btree_state(rcur)->nr_ops = nr_ops;
+		xrefc_btree_state(rcur)->shape_changes = shape_changes;
 	}
 
 	switch (ri->ri_type) {
@@ -1666,7 +1723,7 @@ xfs_refcount_adjust_cow_extents(
 		goto out_error;
 	}
 	if (!found_rec) {
-		ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+		ext.rc_startblock = xrefc_max_startblock(cur);
 		ext.rc_blockcount = 0;
 		ext.rc_refcount = 0;
 		ext.rc_domain = XFS_REFC_DOMAIN_COW;
@@ -1877,7 +1934,7 @@ xfs_refcount_recover_extent(
 	INIT_LIST_HEAD(&rr->rr_list);
 	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
 
-	if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
+	if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL ||
 	    XFS_IS_CORRUPT(cur->bc_mp,
 			   rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
 		xfs_btree_mark_sick(cur);
@@ -2026,7 +2083,7 @@ xfs_refcount_query_range_helper(
 	xfs_failaddr_t			fa;
 
 	xfs_refcount_btrec_to_irec(rec, &irec);
-	fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+	fa = xfs_refcount_check_btrec(cur, &irec);
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, &irec);
 
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index 68acb0b1b4a..13344b402a7 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -12,6 +12,7 @@ struct xfs_perag;
 struct xfs_btree_cur;
 struct xfs_bmbt_irec;
 struct xfs_refcount_irec;
+struct xfs_rtgroup;
 
 extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
 		enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
@@ -120,6 +121,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_refcount_irec *irec);
 xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
 		const struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg,
+		const struct xfs_refcount_irec *irec);
 extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
 		struct xfs_refcount_irec *irec, int *stat);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/42] xfs: add a realtime flag to the refcount update log redo items
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:27   ` [PATCH 06/42] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
@ 2023-12-27 13:27   ` Darrick J. Wong
  2023-12-27 13:28   ` [PATCH 08/42] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
                     ` (34 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extend the refcount update (CUI) log items with a new realtime flag that
indicates that the updates apply against the realtime refcountbt.  We'll
wire up the actual refcount code later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c     |  101 ++++++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_bmap.c       |   10 +++--
 libxfs/xfs_defer.h      |    1 
 libxfs/xfs_log_format.h |    6 ++-
 libxfs/xfs_refcount.c   |   71 ++++++++++++++++++++++++---------
 libxfs/xfs_refcount.h   |   22 +++++++---
 6 files changed, 176 insertions(+), 35 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 3956a38b414..d553b42d77e 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -538,8 +538,22 @@ xfs_refcount_defer_add(
 
 	trace_xfs_refcount_defer(mp, ri);
 
-	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+	/*
+	 * Deferred refcount updates for the realtime and data sections must
+	 * use separate transactions to finish deferred work because updates to
+	 * realtime metadata files can lock AGFs to allocate btree blocks and
+	 * we don't want that mixing with the AGF locks taken to finish data
+	 * section updates.
+	 */
+	if (ri->ri_realtime) {
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp, ri->ri_startblock);
+		xfs_defer_add(tp, &ri->ri_list,
+				&xfs_rtrefcount_update_defer_type);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
+		xfs_defer_add(tp, &ri->ri_list,
+				&xfs_refcount_update_defer_type);
+	}
 }
 
 /* Cancel a deferred refcount update. */
@@ -611,6 +625,89 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.cancel_item	= xfs_refcount_update_cancel_item,
 };
 
+/* Sort refcount intents by rtgroup. */
+static int
+xfs_rtrefcount_update_diff_items(
+	void					*priv,
+	const struct list_head			*a,
+	const struct list_head			*b)
+{
+	struct xfs_refcount_intent		*ra = ci_entry(a);
+	struct xfs_refcount_intent		*rb = ci_entry(b);
+
+	return ra->ri_rtg->rtg_rgno - rb->ri_rtg->rtg_rgno;
+}
+
+static struct xfs_log_item *
+xfs_rtrefcount_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	if (sort)
+		list_sort(mp, items, xfs_rtrefcount_update_diff_items);
+	return NULL;
+}
+
+/* Cancel a deferred realtime refcount update. */
+STATIC void
+xfs_rtrefcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+
+	xfs_rtgroup_intent_put(ri->ri_rtg);
+	kmem_cache_free(xfs_refcount_intent_cache, ri);
+}
+
+/* Process a deferred realtime refcount update. */
+STATIC int
+xfs_rtrefcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+	int				error;
+
+	error = xfs_rtrefcount_finish_one(tp, ri, state);
+
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && ri->ri_blockcount > 0) {
+		ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+		       ri->ri_type == XFS_REFCOUNT_DECREASE);
+		return -EAGAIN;
+	}
+
+	xfs_rtrefcount_update_cancel_item(item);
+	return error;
+}
+
+/* Clean up after calling xfs_rtrefcount_finish_one. */
+STATIC void
+xfs_rtrefcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	if (rcur)
+		xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+	.name		= "rtrefcount",
+	.create_intent	= xfs_rtrefcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_rtrefcount_update_finish_item,
+	.finish_cleanup = xfs_rtrefcount_finish_one_cleanup,
+	.cancel_item	= xfs_rtrefcount_update_cancel_item,
+};
+
 /* Inode Block Mapping */
 
 static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e)
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index b84d1ad57f1..31129fb4884 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -4486,8 +4486,9 @@ xfs_bmapi_write(
 			 * the refcount btree for orphan recovery.
 			 */
 			if (whichfork == XFS_COW_FORK)
-				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
-						bma.length);
+				xfs_refcount_alloc_cow_extent(tp,
+						XFS_IS_REALTIME_INODE(ip),
+						bma.blkno, bma.length);
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4653,7 +4654,8 @@ xfs_bmapi_convert_delalloc(
 	*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
-		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+		xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip),
+				bma.blkno, bma.length);
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -5265,7 +5267,7 @@ xfs_bmap_del_extent_real(
 	 */
 	if (want_free) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			xfs_refcount_decrease_extent(tp, del);
+			xfs_refcount_decrease_extent(tp, isrt, del);
 		} else {
 			unsigned int	efi_flags = 0;
 
diff --git a/libxfs/xfs_defer.h b/libxfs/xfs_defer.h
index fddcb4cccbc..a351f00e6d7 100644
--- a/libxfs/xfs_defer.h
+++ b/libxfs/xfs_defer.h
@@ -68,6 +68,7 @@ struct xfs_defer_op_type {
 
 extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index ea4e88d6657..a888e3d98a3 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -252,6 +252,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
 #define	XFS_LI_RUI_RT		0x124c	/* realtime rmap update intent */
 #define	XFS_LI_RUD_RT		0x124d	/* realtime rmap update done */
+#define	XFS_LI_CUI_RT		0x124e	/* realtime refcount update intent */
+#define	XFS_LI_CUD_RT		0x124f	/* realtime refcount update done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -275,7 +277,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
 	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }, \
 	{ XFS_LI_RUI_RT,	"XFS_LI_RUI_RT" }, \
-	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }
+	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }, \
+	{ XFS_LI_CUI_RT,	"XFS_LI_CUI_RT" }, \
+	{ XFS_LI_CUD_RT,	"XFS_LI_CUD_RT" }
 
 /*
  * Inode Log Item Format definitions.
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 2202d9cfb37..d4ed0f44fa1 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1141,6 +1141,28 @@ xfs_refcount_still_have_space(
 		xrefc_btree_state(cur)->nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
+/* Schedule an extent free. */
+static int
+xrefc_free_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*rec)
+{
+	xfs_fsblock_t			fsbno;
+	unsigned int			flags = 0;
+
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC) {
+		flags |= XFS_FREE_EXTENT_REALTIME;
+		fsbno = xfs_rgbno_to_rtb(cur->bc_mp, cur->bc_ino.rtg->rtg_rgno,
+				rec->rc_startblock);
+	} else {
+		fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+				rec->rc_startblock);
+	}
+
+	return xfs_free_extent_later(cur->bc_tp, fsbno, rec->rc_blockcount,
+			NULL, XFS_AG_RESV_NONE, flags);
+}
+
 /*
  * Adjust the refcounts of middle extents.  At this point we should have
  * split extents that crossed the adjustment range; merged with adjacent
@@ -1157,7 +1179,6 @@ xfs_refcount_adjust_extents(
 	struct xfs_refcount_irec	ext, tmp;
 	int				error;
 	int				found_rec, found_tmp;
-	xfs_fsblock_t			fsbno;
 
 	/* Merging did all the work already. */
 	if (*aglen == 0)
@@ -1210,12 +1231,7 @@ xfs_refcount_adjust_extents(
 					goto out_error;
 				}
 			} else {
-				fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
-						cur->bc_ag.pag->pag_agno,
-						tmp.rc_startblock);
-				error = xfs_free_extent_later(cur->bc_tp, fsbno,
-						  tmp.rc_blockcount, NULL,
-						  XFS_AG_RESV_NONE, 0);
+				error = xrefc_free_extent(cur, &tmp);
 				if (error)
 					goto out_error;
 			}
@@ -1273,12 +1289,7 @@ xfs_refcount_adjust_extents(
 			}
 			goto advloop;
 		} else {
-			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
-					cur->bc_ag.pag->pag_agno,
-					ext.rc_startblock);
-			error = xfs_free_extent_later(cur->bc_tp, fsbno,
-					ext.rc_blockcount, NULL,
-					XFS_AG_RESV_NONE, 0);
+			error = xrefc_free_extent(cur, &ext);
 			if (error)
 				goto out_error;
 		}
@@ -1473,6 +1484,20 @@ xfs_refcount_finish_one(
 	return error;
 }
 
+/*
+ * Process one of the deferred realtime refcount operations.  We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ */
+int
+xfs_rtrefcount_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_refcount_intent	*ri,
+	struct xfs_btree_cur		**pcur)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Record a refcount intent for later processing.
  */
@@ -1480,6 +1505,7 @@ static void
 __xfs_refcount_add(
 	struct xfs_trans		*tp,
 	enum xfs_refcount_intent_type	type,
+	bool				isrt,
 	xfs_fsblock_t			startblock,
 	xfs_extlen_t			blockcount)
 {
@@ -1491,6 +1517,7 @@ __xfs_refcount_add(
 	ri->ri_type = type;
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
+	ri->ri_realtime = isrt;
 
 	xfs_refcount_defer_add(tp, ri);
 }
@@ -1501,12 +1528,13 @@ __xfs_refcount_add(
 void
 xfs_refcount_increase_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_has_reflink(tp->t_mountp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock,
 			PREV->br_blockcount);
 }
 
@@ -1516,12 +1544,13 @@ xfs_refcount_increase_extent(
 void
 xfs_refcount_decrease_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_has_reflink(tp->t_mountp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock,
 			PREV->br_blockcount);
 }
 
@@ -1877,6 +1906,7 @@ __xfs_refcount_cow_free(
 void
 xfs_refcount_alloc_cow_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
@@ -1885,16 +1915,17 @@ xfs_refcount_alloc_cow_extent(
 	if (!xfs_has_reflink(mp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
 void
 xfs_refcount_free_cow_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
@@ -1904,8 +1935,8 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
-	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+	xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len);
 }
 
 struct xfs_refcount_recovery {
@@ -2012,7 +2043,7 @@ xfs_refcount_recover_cow_leftovers(
 		/* Free the orphan record */
 		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
 				rr->rr_rrec.rc_startblock);
-		xfs_refcount_free_cow_extent(tp, fsb,
+		xfs_refcount_free_cow_extent(tp, false, fsb,
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index 13344b402a7..56e5834feb6 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -57,10 +57,14 @@ enum xfs_refcount_intent_type {
 
 struct xfs_refcount_intent {
 	struct list_head			ri_list;
-	struct xfs_perag			*ri_pag;
+	union {
+		struct xfs_perag		*ri_pag;
+		struct xfs_rtgroup		*ri_rtg;
+	};
 	enum xfs_refcount_intent_type		ri_type;
 	xfs_extlen_t				ri_blockcount;
 	xfs_fsblock_t				ri_startblock;
+	bool					ri_realtime;
 };
 
 /* Check that the refcount is appropriate for the record domain. */
@@ -75,22 +79,24 @@ xfs_refcount_check_domain(
 	return true;
 }
 
-void xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt,
 		struct xfs_bmbt_irec *irec);
-void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt,
 		struct xfs_bmbt_irec *irec);
 
-extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+int xfs_refcount_finish_one(struct xfs_trans *tp,
+		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
+int xfs_rtrefcount_finish_one(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
 
 extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
 		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
 		xfs_extlen_t *flen, bool find_end_of_shared);
 
-void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
-		xfs_extlen_t len);
-void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
-		xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
+		xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
+		xfs_fsblock_t fsb, xfs_extlen_t len);
 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/42] xfs: add realtime refcount btree inode to metadata directory
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:27   ` [PATCH 07/42] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
@ 2023-12-27 13:28   ` Darrick J. Wong
  2023-12-27 13:28   ` [PATCH 09/42] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
                     ` (33 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a metadir path to select the realtime refcount btree inode and load
it at mount time.  The rtrefcountbt inode will have a unique extent format
code, which means that we also have to update the inode validation and
flush routines to look for it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c                 |    4 ++++
 libxfs/xfs_bmap.c             |    8 ++++++--
 libxfs/xfs_format.h           |    4 +++-
 libxfs/xfs_inode_buf.c        |    8 ++++++++
 libxfs/xfs_inode_fork.c       |    9 +++++++++
 libxfs/xfs_rtgroup.h          |    3 +++
 libxfs/xfs_rtrefcount_btree.c |   33 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h |    4 ++++
 8 files changed, 70 insertions(+), 3 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index 36b4b486145..6add79121b2 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -970,6 +970,10 @@ libxfs_rtmount_destroy(xfs_mount_t *mp)
 	xfs_rgnumber_t		rgno;
 
 	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_refcountip)
+			libxfs_imeta_irele(rtg->rtg_refcountip);
+		rtg->rtg_refcountip = NULL;
+
 		if (rtg->rtg_rmapip)
 			libxfs_imeta_irele(rtg->rtg_rmapip);
 		rtg->rtg_rmapip = NULL;
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 31129fb4884..9e30b4441c1 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -5105,9 +5105,13 @@ xfs_bmap_del_extent_real(
 		 * the same order of operations as the data device, which is:
 		 * Remove the file mapping, remove the reverse mapping, and
 		 * then free the blocks.  This means that we must delay the
-		 * freeing until after we've scheduled the rmap update.
+		 * freeing until after we've scheduled the rmap update.  If
+		 * realtime reflink is enabled, use deferred refcount intent
+		 * items to decide what to do with the extent, just like we do
+		 * for the data device.
 		 */
-		if (want_free && !xfs_has_rtrmapbt(mp)) {
+		if (want_free && !xfs_has_rtrmapbt(mp) &&
+				 !xfs_has_rtreflink(mp)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
 			if (error)
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index c938b814c43..93a9b8e3b56 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1028,6 +1028,7 @@ enum xfs_dinode_fmt {
 	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
 	XFS_DINODE_FMT_UUID,		/* added long ago, but never used */
 	XFS_DINODE_FMT_RMAP,		/* reverse mapping btree */
+	XFS_DINODE_FMT_REFCOUNT,	/* reference count btree */
 };
 
 #define XFS_INODE_FORMAT_STR \
@@ -1036,7 +1037,8 @@ enum xfs_dinode_fmt {
 	{ XFS_DINODE_FMT_EXTENTS,	"extent" }, \
 	{ XFS_DINODE_FMT_BTREE,		"btree" }, \
 	{ XFS_DINODE_FMT_UUID,		"uuid" }, \
-	{ XFS_DINODE_FMT_RMAP,		"rmap" }
+	{ XFS_DINODE_FMT_RMAP,		"rmap" }, \
+	{ XFS_DINODE_FMT_REFCOUNT,	"refcount" }
 
 /*
  * Max values for extnum and aextnum.
diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index e7bf8ff7046..6f4dbe8367b 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -414,6 +414,12 @@ xfs_dinode_verify_fork(
 		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		if (!xfs_has_rtreflink(mp))
+			return __this_address;
+		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+			return __this_address;
+		break;
 	default:
 		return __this_address;
 	}
@@ -434,6 +440,7 @@ xfs_dinode_verify_forkoff(
 			return __this_address;
 		break;
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp)))
 			return __this_address;
 		fallthrough;
@@ -705,6 +712,7 @@ xfs_dinode_verify(
 	if (flags2 & XFS_DIFLAG2_METADIR) {
 		switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
 		case XFS_DINODE_FMT_RMAP:
+		case XFS_DINODE_FMT_REFCOUNT:
 			break;
 		default:
 			if (nextents + naextents == 0 && nblocks != 0)
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 127571527cf..11cfdc9b2bf 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -269,6 +269,11 @@ xfs_iformat_data_fork(
 				return -EFSCORRUPTED;
 			}
 			return xfs_iformat_rtrmap(ip, dip);
+		case XFS_DINODE_FMT_REFCOUNT:
+			if (!xfs_has_rtreflink(ip->i_mount))
+				return -EFSCORRUPTED;
+			ASSERT(0); /* to be implemented later */
+			return -EFSCORRUPTED;
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -664,6 +669,10 @@ xfs_iflush_fork(
 			xfs_iflush_rtrmap(ip, dip);
 		break;
 
+	case XFS_DINODE_FMT_REFCOUNT:
+		ASSERT(0); /* to be implemented later */
+		break;
+
 	default:
 		ASSERT(0);
 		break;
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 3522527e553..bd88a4d7281 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -25,6 +25,9 @@ struct xfs_rtgroup {
 	/* reverse mapping btree inode */
 	struct xfs_inode	*rtg_rmapip;
 
+	/* refcount btree inode */
+	struct xfs_inode	*rtg_refcountip;
+
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index fb4944d570f..425a56578c6 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -24,6 +24,7 @@
 #include "xfs_cksum.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_imeta.h"
 
 static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
 
@@ -353,6 +354,7 @@ xfs_rtrefcountbt_commit_staged_btree(
 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_REFCOUNT);
 
 	/*
 	 * Free any resources hanging off the real fork, then shallow-copy the
@@ -456,3 +458,34 @@ xfs_rtrefcountbt_compute_maxlevels(
 	/* Add one level to handle the inode root level. */
 	mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
 }
+
+#define XFS_RTREFC_NAMELEN		21
+
+/* Create the metadata directory path for an rtrefcount btree inode. */
+int
+xfs_rtrefcountbt_create_path(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*path;
+	unsigned char		*fname;
+	int			error;
+
+	error = xfs_imeta_create_file_path(mp, 2, &path);
+	if (error)
+		return error;
+
+	fname = kmalloc(XFS_RTREFC_NAMELEN, GFP_KERNEL);
+	if (!fname) {
+		xfs_imeta_free_path(path);
+		return -ENOMEM;
+	}
+
+	snprintf(fname, XFS_RTREFC_NAMELEN, "%u.refcount", rgno);
+	path->im_path[0] = "realtime";
+	path->im_path[1] = fname;
+	path->im_dynamicmask = 0x2;
+	*pathp = path;
+	return 0;
+}
diff --git a/libxfs/xfs_rtrefcount_btree.h b/libxfs/xfs_rtrefcount_btree.h
index 6d23ab3a9ad..ff49e95d1a4 100644
--- a/libxfs/xfs_rtrefcount_btree.h
+++ b/libxfs/xfs_rtrefcount_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_cur;
 struct xfs_mount;
 struct xbtree_ifakeroot;
 struct xfs_rtgroup;
+struct xfs_imeta_path;
 
 /* refcounts only exist on crc enabled filesystems */
 #define XFS_RTREFCOUNT_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
@@ -68,4 +69,7 @@ unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
 int __init xfs_rtrefcountbt_init_cur_cache(void);
 void xfs_rtrefcountbt_destroy_cur_cache(void);
 
+int xfs_rtrefcountbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_imeta_path **pathp);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/42] xfs: add metadata reservations for realtime refcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:28   ` [PATCH 08/42] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
@ 2023-12-27 13:28   ` Darrick J. Wong
  2023-12-27 13:28   ` [PATCH 10/42] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
                     ` (32 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reserve some free blocks so that we will always have enough free blocks
in the data volume to handle expansion of the realtime refcount btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrefcount_btree.c |   39 +++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h |    4 ++++
 2 files changed, 43 insertions(+)


diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index 425a56578c6..99eac508cda 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -489,3 +489,42 @@ xfs_rtrefcountbt_create_path(
 	*pathp = path;
 	return 0;
 }
+
+/* Calculate the rtrefcount btree size for some records. */
+unsigned long long
+xfs_rtrefcountbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_rtrefc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+static unsigned long long
+xfs_rtrefcountbt_max_size(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtblocks)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rtrefc_mxr[0] == 0)
+		return 0;
+
+	return xfs_rtrefcountbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ * We need enough space to hold one record for every rt extent in the rtgroup.
+ */
+xfs_filblks_t
+xfs_rtrefcountbt_calc_reserves(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	return xfs_rtrefcountbt_max_size(mp,
+			xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
+}
diff --git a/libxfs/xfs_rtrefcount_btree.h b/libxfs/xfs_rtrefcount_btree.h
index ff49e95d1a4..045f7b1f728 100644
--- a/libxfs/xfs_rtrefcount_btree.h
+++ b/libxfs/xfs_rtrefcount_btree.h
@@ -72,4 +72,8 @@ void xfs_rtrefcountbt_destroy_cur_cache(void);
 int xfs_rtrefcountbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 		struct xfs_imeta_path **pathp);
 
+xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
+unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/42] xfs: wire up a new inode fork type for the realtime refcount
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 13:28   ` [PATCH 09/42] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
@ 2023-12-27 13:28   ` Darrick J. Wong
  2023-12-27 13:29   ` [PATCH 11/42] xfs: wire up realtime refcount btree cursors Darrick J. Wong
                     ` (31 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the pieces we need to embed the root of the realtime refcount
btree in an inode's data fork, complete with new fork type and
on-disk interpretation functions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h           |    8 +
 libxfs/xfs_inode_fork.c       |    8 +
 libxfs/xfs_ondisk.h           |    1 
 libxfs/xfs_rtrefcount_btree.c |  236 +++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h |  112 +++++++++++++++++++
 5 files changed, 362 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 93a9b8e3b56..ca964befb51 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1824,6 +1824,14 @@ typedef __be32 xfs_refcount_ptr_t;
  */
 #define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
+/*
+ * rt refcount root header, on-disk form only.
+ */
+struct xfs_rtrefcount_root {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+};
+
 /* inode-rooted btree pointer type */
 typedef __be64 xfs_rtrefcount_ptr_t;
 
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index 11cfdc9b2bf..bfc06af904e 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -26,6 +26,7 @@
 #include "xfs_health.h"
 #include "xfs_symlink_remote.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 struct kmem_cache *xfs_ifork_cache;
 
@@ -272,8 +273,7 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_REFCOUNT:
 			if (!xfs_has_rtreflink(ip->i_mount))
 				return -EFSCORRUPTED;
-			ASSERT(0); /* to be implemented later */
-			return -EFSCORRUPTED;
+			return xfs_iformat_rtrefcount(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -670,7 +670,9 @@ xfs_iflush_fork(
 		break;
 
 	case XFS_DINODE_FMT_REFCOUNT:
-		ASSERT(0); /* to be implemented later */
+		ASSERT(whichfork == XFS_DATA_FORK);
+		if (iip->ili_fields & brootflag[whichfork])
+			xfs_iflush_rtrefcount(ip, dip);
 		break;
 
 	default:
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 242b6831256..3a5581ecb36 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -80,6 +80,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t,		8);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root,	4);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index 99eac508cda..530bb9d361d 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -83,6 +83,41 @@ xfs_rtrefcountbt_get_maxrecs(
 	return cur->bc_mp->m_rtrefc_mxr[level != 0];
 }
 
+/*
+ * Calculate number of records in a realtime refcount btree inode root.
+ */
+unsigned int
+xfs_rtrefcountbt_droot_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= sizeof(struct xfs_rtrefcount_root);
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_refcount_rec);
+	return blocklen / (2 * sizeof(struct xfs_refcount_key) +
+			sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork so that
+ * we can resize the in-memory buffer to match it.  After a resize to the
+ * maximum size this function returns the same value as
+ * xfs_rtrefcountbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrefcountbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrefc_mxr[level != 0];
+	return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
 STATIC void
 xfs_rtrefcountbt_init_key_from_rec(
 	union xfs_btree_key		*key,
@@ -253,6 +288,68 @@ xfs_rtrefcountbt_keys_contiguous(
 				 be32_to_cpu(key2->refc.rc_startblock));
 }
 
+/* Move the rt refcount btree root from one incore buffer to another. */
+static void
+xfs_rtrefcountbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		level,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_rtrefcount_droot_space(src_broot) <=
+			xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs && level > 0) {
+		sptr = xfs_rtrefcount_broot_ptr_addr(mp, src_broot, 1,
+				src_bytes);
+		dptr = xfs_rtrefcount_broot_ptr_addr(mp, dst_broot, 1,
+				dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys/records that come after it.
+	 */
+	memcpy(dst_broot, src_broot, XFS_RTREFCOUNT_BLOCK_LEN);
+
+	if (!numrecs)
+		return;
+
+	if (level == 0) {
+		sptr = xfs_rtrefcount_rec_addr(src_broot, 1);
+		dptr = xfs_rtrefcount_rec_addr(dst_broot, 1);
+		memcpy(dptr, sptr,
+				numrecs * sizeof(struct xfs_refcount_rec));
+	} else {
+		sptr = xfs_rtrefcount_key_addr(src_broot, 1);
+		dptr = xfs_rtrefcount_key_addr(dst_broot, 1);
+		memcpy(dptr, sptr,
+				numrecs * sizeof(struct xfs_refcount_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_rtrefcountbt_iroot_ops = {
+	.maxrecs		= xfs_rtrefcountbt_maxrecs,
+	.size			= xfs_rtrefcount_broot_space_calc,
+	.move			= xfs_rtrefcountbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
@@ -265,6 +362,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.free_block		= xfs_btree_free_imeta_block,
 	.get_minrecs		= xfs_rtrefcountbt_get_minrecs,
 	.get_maxrecs		= xfs_rtrefcountbt_get_maxrecs,
+	.get_dmaxrecs		= xfs_rtrefcountbt_get_dmaxrecs,
 	.init_key_from_rec	= xfs_rtrefcountbt_init_key_from_rec,
 	.init_high_key_from_rec	= xfs_rtrefcountbt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_rtrefcountbt_init_rec_from_cur,
@@ -275,6 +373,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.keys_inorder		= xfs_rtrefcountbt_keys_inorder,
 	.recs_inorder		= xfs_rtrefcountbt_recs_inorder,
 	.keys_contiguous	= xfs_rtrefcountbt_keys_contiguous,
+	.iroot_ops		= &xfs_rtrefcountbt_iroot_ops,
 };
 
 /* Initialize a new rt refcount btree cursor. */
@@ -528,3 +627,140 @@ xfs_rtrefcountbt_calc_reserves(
 	return xfs_rtrefcountbt_max_size(mp,
 			xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
 }
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+STATIC void
+xfs_rtrefcountbt_from_disk(
+	struct xfs_inode		*ip,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen,
+	struct xfs_btree_block		*rblock)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_refcount_key	*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key	*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec	*frp;
+	struct xfs_refcount_rec	*trp;
+	unsigned int			numrecs;
+	unsigned int			maxrecs;
+	unsigned int			rblocklen;
+
+	rblocklen = xfs_rtrefcount_broot_space(mp, dblock);
+
+	xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0,
+			ip->i_ino);
+
+	rblock->bb_level = dblock->bb_level;
+	rblock->bb_numrecs = dblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		tkp = xfs_rtrefcount_key_addr(rblock, 1);
+		fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		trp = xfs_rtrefcount_rec_addr(rblock, 1);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Load a realtime reference count btree root in from disk. */
+int
+xfs_iformat_rtrefcount(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	unsigned int		numrecs;
+	unsigned int		level;
+	int			dsize;
+
+	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+	numrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > mp->m_rtrefc_maxlevels ||
+	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize)
+		return -EFSCORRUPTED;
+
+	xfs_iroot_alloc(ip, XFS_DATA_FORK,
+			xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
+	xfs_rtrefcountbt_from_disk(ip, dfp, dsize, ifp->if_broot);
+	return 0;
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_rtrefcountbt_to_disk(
+	struct xfs_mount		*mp,
+	struct xfs_btree_block		*rblock,
+	int				rblocklen,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen)
+{
+	struct xfs_refcount_key	*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key	*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec	*frp;
+	struct xfs_refcount_rec	*trp;
+	unsigned int			maxrecs;
+	unsigned int			numrecs;
+
+	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC));
+	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_key_addr(rblock, 1);
+		tkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrefcount_rec_addr(rblock, 1);
+		trp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Flush a realtime reference count btree root out to disk. */
+void
+xfs_iflush_rtrefcount(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	ASSERT(ifp->if_broot != NULL);
+	ASSERT(ifp->if_broot_bytes > 0);
+	ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <=
+			xfs_inode_fork_size(ip, XFS_DATA_FORK));
+	xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot,
+			ifp->if_broot_bytes, dfp,
+			XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
diff --git a/libxfs/xfs_rtrefcount_btree.h b/libxfs/xfs_rtrefcount_btree.h
index 045f7b1f728..bd070e54781 100644
--- a/libxfs/xfs_rtrefcount_btree.h
+++ b/libxfs/xfs_rtrefcount_btree.h
@@ -27,6 +27,7 @@ void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
 unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
 		unsigned int blocklen, bool leaf);
 void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf);
 
 /*
  * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
@@ -76,4 +77,115 @@ xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
 unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */
+
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_droot_rec_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	return (struct xfs_refcount_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_droot_key_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	return (struct xfs_refcount_key *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_droot_ptr_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index,
+	unsigned int			maxrecs)
+{
+	return (xfs_rtrefcount_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * sizeof(struct xfs_refcount_key) +
+		 (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		index,
+	unsigned int		block_size)
+{
+	return xfs_rtrefcount_ptr_addr(bb, index,
+			xfs_rtrefcountbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = XFS_RTREFCOUNT_BLOCK_LEN;
+
+	if (level > 0)
+		return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+				     sizeof(xfs_rtrefcount_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb)
+{
+	return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrefcount_droot_space_calc(
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = sizeof(struct xfs_rtrefcount_root);
+
+	if (level > 0)
+		return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+				     sizeof(xfs_rtrefcount_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrefcount_droot_space(struct xfs_btree_block *bb)
+{
+	return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
+		struct xfs_btree_block *rblock, int rblocklen,
+		struct xfs_rtrefcount_root *dblock, int dblocklen);
+void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/42] xfs: wire up realtime refcount btree cursors
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 13:28   ` [PATCH 10/42] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
@ 2023-12-27 13:29   ` Darrick J. Wong
  2023-12-27 13:29   ` [PATCH 12/42] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
                     ` (30 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up realtime refcount btree cursors wherever they're needed
throughout the code base.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |  101 ++++++++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_rtgroup.c  |   10 +++++
 libxfs/xfs_rtgroup.h  |    5 ++
 3 files changed, 113 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index d4ed0f44fa1..3f0f0f41a69 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -25,6 +25,7 @@
 #include "xfs_health.h"
 #include "defer_item.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrefcount_btree.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -1484,6 +1485,33 @@ xfs_refcount_finish_one(
 	return error;
 }
 
+/*
+ * Set up a continuation a deferred rtrefcount operation by updating the
+ * intent.  Checks to make sure we're not going to run off the end of the
+ * rtgroup.
+ */
+static inline int
+xfs_rtrefcount_continue_op(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_intent	*ri,
+	xfs_agblock_t			new_agbno)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rtgroup		*rtg = ri->ri_rtg;
+
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno,
+					ri->ri_blockcount))) {
+		xfs_btree_mark_sick(cur);
+		return -EFSCORRUPTED;
+	}
+
+	ri->ri_startblock = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, new_agbno);
+
+	ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount));
+	ASSERT(rtg->rtg_rgno == xfs_rtb_to_rgno(mp, ri->ri_startblock));
+	return 0;
+}
+
 /*
  * Process one of the deferred realtime refcount operations.  We pass back the
  * btree cursor to maintain our lock on the btree between calls.
@@ -1494,8 +1522,77 @@ xfs_rtrefcount_finish_one(
 	struct xfs_refcount_intent	*ri,
 	struct xfs_btree_cur		**pcur)
 {
-	ASSERT(0);
-	return -EFSCORRUPTED;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur = *pcur;
+	int				error = 0;
+	xfs_rgnumber_t			rgno;
+	xfs_rgblock_t			bno;
+	unsigned long			nr_ops = 0;
+	int				shape_changes = 0;
+
+	bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock, &rgno);
+
+	trace_xfs_refcount_deferred(mp, ri);
+
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+		return -EIO;
+
+	/*
+	 * If we haven't gotten a cursor or the cursor AG doesn't match
+	 * the startblock, get one now.
+	 */
+	if (rcur != NULL && rcur->bc_ino.rtg != ri->ri_rtg) {
+		nr_ops = xrefc_btree_state(rcur)->nr_ops;
+		shape_changes = xrefc_btree_state(rcur)->shape_changes;
+		xfs_btree_del_cursor(rcur, 0);
+		rcur = NULL;
+		*pcur = NULL;
+	}
+	if (rcur == NULL) {
+		xfs_rtgroup_lock(tp, ri->ri_rtg, XFS_RTGLOCK_REFCOUNT);
+		*pcur = rcur = xfs_rtrefcountbt_init_cursor(mp, tp, ri->ri_rtg,
+						ri->ri_rtg->rtg_refcountip);
+
+		xrefc_btree_state(rcur)->nr_ops = nr_ops;
+		xrefc_btree_state(rcur)->shape_changes = shape_changes;
+	}
+
+	switch (ri->ri_type) {
+	case XFS_REFCOUNT_INCREASE:
+		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+				XFS_REFCOUNT_ADJUST_INCREASE);
+		if (error)
+			return error;
+		if (ri->ri_blockcount > 0)
+			error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+		break;
+	case XFS_REFCOUNT_DECREASE:
+		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+				XFS_REFCOUNT_ADJUST_DECREASE);
+		if (error)
+			return error;
+		if (ri->ri_blockcount > 0)
+			error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+		break;
+	case XFS_REFCOUNT_ALLOC_COW:
+		error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+		if (error)
+			return error;
+		ri->ri_blockcount = 0;
+		break;
+	case XFS_REFCOUNT_FREE_COW:
+		error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+		if (error)
+			return error;
+		ri->ri_blockcount = 0;
+		break;
+	default:
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+	if (!error && ri->ri_blockcount > 0)
+		trace_xfs_refcount_finish_one_leftover(mp, ri);
+	return error;
 }
 
 /*
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 03eb776ef8b..8d13ba6e0ea 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -558,6 +558,13 @@ xfs_rtgroup_lock(
 		if (tp)
 			xfs_trans_ijoin(tp, rtg->rtg_rmapip, XFS_ILOCK_EXCL);
 	}
+
+	if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg->rtg_refcountip) {
+		xfs_ilock(rtg->rtg_refcountip, XFS_ILOCK_EXCL);
+		if (tp)
+			xfs_trans_ijoin(tp, rtg->rtg_refcountip,
+					XFS_ILOCK_EXCL);
+	}
 }
 
 /* Unlock metadata inodes associated with this rt group. */
@@ -570,6 +577,9 @@ xfs_rtgroup_unlock(
 	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
 	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+	if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg->rtg_refcountip)
+		xfs_iunlock(rtg->rtg_refcountip, XFS_ILOCK_EXCL);
+
 	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip)
 		xfs_iunlock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
 
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index bd88a4d7281..659d0c15d2a 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -248,10 +248,13 @@ int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 #define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
 /* Lock the rt rmap inode in exclusive mode */
 #define XFS_RTGLOCK_RMAP		(1U << 2)
+/* Lock the rt refcount inode in exclusive mode */
+#define XFS_RTGLOCK_REFCOUNT		(1U << 3)
 
 #define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
 				 XFS_RTGLOCK_BITMAP_SHARED | \
-				 XFS_RTGLOCK_RMAP)
+				 XFS_RTGLOCK_RMAP | \
+				 XFS_RTGLOCK_REFCOUNT)
 
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/42] xfs: create routine to allocate and initialize a realtime refcount btree inode
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-27 13:29   ` [PATCH 11/42] xfs: wire up realtime refcount btree cursors Darrick J. Wong
@ 2023-12-27 13:29   ` Darrick J. Wong
  2023-12-27 13:29   ` [PATCH 13/42] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
                     ` (29 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a library routine to allocate and initialize an empty realtime
refcountbt inode.  We'll use this for growfs, mkfs, and repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrefcount_btree.c |   34 ++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h |    5 +++++
 2 files changed, 39 insertions(+)


diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index 530bb9d361d..fa04395eed0 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -764,3 +764,37 @@ xfs_iflush_rtrefcount(
 			ifp->if_broot_bytes, dfp,
 			XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
 }
+
+/*
+ * Create a realtime refcount btree inode.
+ *
+ * Regardless of the return value, the caller must clean up @upd.  If a new
+ * inode is returned through *ipp, the caller must finish setting up the incore
+ * inode and release it.
+ */
+int
+xfs_rtrefcountbt_create(
+	struct xfs_imeta_update	*upd,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = upd->mp;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	error = xfs_imeta_create(upd, S_IFREG, ipp);
+	if (error)
+		return error;
+
+	ifp = xfs_ifork_ptr(upd->ip, XFS_DATA_FORK);
+	ifp->if_format = XFS_DINODE_FMT_REFCOUNT;
+	ASSERT(ifp->if_broot_bytes == 0);
+	ASSERT(ifp->if_bytes == 0);
+
+	/* Initialize the empty incore btree root. */
+	xfs_iroot_alloc(upd->ip, XFS_DATA_FORK,
+			xfs_rtrefcount_broot_space_calc(mp, 0, 0));
+	xfs_btree_init_block(mp, ifp->if_broot, &xfs_rtrefcountbt_ops,
+			0, 0, upd->ip->i_ino);
+	xfs_trans_log_inode(upd->tp, upd->ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	return 0;
+}
diff --git a/libxfs/xfs_rtrefcount_btree.h b/libxfs/xfs_rtrefcount_btree.h
index bd070e54781..749c6fd02f8 100644
--- a/libxfs/xfs_rtrefcount_btree.h
+++ b/libxfs/xfs_rtrefcount_btree.h
@@ -188,4 +188,9 @@ void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
 		struct xfs_rtrefcount_root *dblock, int dblocklen);
 void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
 
+struct xfs_imeta_update;
+
+int xfs_rtrefcountbt_create(struct xfs_imeta_update *upd,
+		struct xfs_inode **ipp);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/42] xfs: update rmap to allow cow staging extents in the rt rmap
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-27 13:29   ` [PATCH 12/42] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
@ 2023-12-27 13:29   ` Darrick J. Wong
  2023-12-27 13:29   ` [PATCH 14/42] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
                     ` (28 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Don't error out on CoW staging extent records when realtime reflink is
enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rmap.c |   14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_rmap.c b/libxfs/xfs_rmap.c
index 0056dc08662..acee6a36fb9 100644
--- a/libxfs/xfs_rmap.c
+++ b/libxfs/xfs_rmap.c
@@ -275,6 +275,7 @@ xfs_rtrmap_check_irec(
 	bool				is_unwritten;
 	bool				is_bmbt;
 	bool				is_attr;
+	bool				is_cow;
 
 	if (irec->rm_blockcount == 0)
 		return __this_address;
@@ -286,6 +287,12 @@ xfs_rtrmap_check_irec(
 			return __this_address;
 		if (irec->rm_offset != 0)
 			return __this_address;
+	} else if (irec->rm_owner == XFS_RMAP_OWN_COW) {
+		if (!xfs_has_rtreflink(mp))
+			return __this_address;
+		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+					    irec->rm_blockcount))
+			return __this_address;
 	} else {
 		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
 					    irec->rm_blockcount))
@@ -302,8 +309,10 @@ xfs_rtrmap_check_irec(
 	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
 	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
 	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+	is_cow = xfs_has_rtreflink(mp) &&
+		 irec->rm_owner == XFS_RMAP_OWN_COW;
 
-	if (!is_inode && irec->rm_owner != XFS_RMAP_OWN_FS)
+	if (!is_inode && !is_cow && irec->rm_owner != XFS_RMAP_OWN_FS)
 		return __this_address;
 
 	if (!is_inode && irec->rm_offset != 0)
@@ -315,6 +324,9 @@ xfs_rtrmap_check_irec(
 	if (is_unwritten && !is_inode)
 		return __this_address;
 
+	if (is_unwritten && is_cow)
+		return __this_address;
+
 	/* Check for a valid fork offset, if applicable. */
 	if (is_inode &&
 	    !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/42] xfs: compute rtrmap btree max levels when reflink enabled
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-27 13:29   ` [PATCH 13/42] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
@ 2023-12-27 13:29   ` Darrick J. Wong
  2023-12-27 13:30   ` [PATCH 15/42] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
                     ` (27 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Compute the maximum possible height of the realtime rmap btree when
reflink is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtrmap_btree.c |   28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_rtrmap_btree.c b/libxfs/xfs_rtrmap_btree.c
index b5adfe362a7..b339526a77d 100644
--- a/libxfs/xfs_rtrmap_btree.c
+++ b/libxfs/xfs_rtrmap_btree.c
@@ -736,6 +736,7 @@ xfs_rtrmapbt_maxrecs(
 unsigned int
 xfs_rtrmapbt_maxlevels_ondisk(void)
 {
+	unsigned long long	max_dblocks;
 	unsigned int		minrecs[2];
 	unsigned int		blocklen;
 
@@ -744,8 +745,20 @@ xfs_rtrmapbt_maxlevels_ondisk(void)
 	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
 	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
 
-	/* We need at most one record for every block in an rt group. */
-	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+	/*
+	 * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
+	 *
+	 * On a reflink filesystem, each block in an rtgroup can have up to
+	 * 2^32 (per the refcount record format) owners, which means that
+	 * theoretically we could face up to 2^64 rmap records.  However, we're
+	 * likely to run out of blocks in the data device long before that
+	 * happens, which means that we must compute the max height based on
+	 * what the btree will look like if it consumes almost all the blocks
+	 * in the data device due to maximal sharing factor.
+	 */
+	max_dblocks = -1U; /* max ag count */
+	max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
+	return xfs_btree_space_to_height(minrecs, max_dblocks);
 }
 
 int __init
@@ -784,9 +797,20 @@ xfs_rtrmapbt_compute_maxlevels(
 	 * maximum height is constrained by the size of the data device and
 	 * the height required to store one rmap record for each block in an
 	 * rt group.
+	 *
+	 * On a reflink filesystem, each rt block can have up to 2^32 (per the
+	 * refcount record format) owners, which means that theoretically we
+	 * could face up to 2^64 rmap records.  This makes the computation of
+	 * maxlevels based on record count meaningless, so we only consider the
+	 * size of the data device.
 	 */
 	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
 				mp->m_sb.sb_dblocks);
+	if (xfs_has_rtreflink(mp)) {
+		mp->m_rtrmap_maxlevels = d_maxlevels + 1;
+		return;
+	}
+
 	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
 				mp->m_sb.sb_rgblocks);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/42] xfs: allow inodes to have the realtime and reflink flags
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-27 13:29   ` [PATCH 14/42] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
@ 2023-12-27 13:30   ` Darrick J. Wong
  2023-12-27 13:30   ` [PATCH 16/42] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
                     ` (26 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we can share blocks between realtime files, allow this
combination.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_buf.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index 6f4dbe8367b..f4eabd99a60 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -688,7 +688,8 @@ xfs_dinode_verify(
 		return __this_address;
 
 	/* don't let reflink and realtime mix */
-	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) &&
+	    !xfs_has_rtreflink(mp))
 		return __this_address;
 
 	/* COW extent size hint validation */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/42] xfs: refcover CoW leftovers in the realtime volume
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-27 13:30   ` [PATCH 15/42] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
@ 2023-12-27 13:30   ` Darrick J. Wong
  2023-12-27 13:30   ` [PATCH 17/42] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
                     ` (25 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Scan the realtime refcount tree at mount time to get rid of leftover
CoW staging extents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_refcount.c |   64 ++++++++++++++++++++++++++++++++++++++-----------
 libxfs/xfs_refcount.h |    4 ++-
 2 files changed, 53 insertions(+), 15 deletions(-)


diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index 3f0f0f41a69..e094acbe663 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -2075,14 +2075,15 @@ xfs_refcount_recover_extent(
 }
 
 /* Find and remove leftover CoW reservations. */
-int
-xfs_refcount_recover_cow_leftovers(
+static int
+xfs_refcount_recover_group_cow_leftovers(
 	struct xfs_mount		*mp,
-	struct xfs_perag		*pag)
+	struct xfs_perag		*pag,
+	struct xfs_rtgroup		*rtg)
 {
 	struct xfs_trans		*tp;
 	struct xfs_btree_cur		*cur;
-	struct xfs_buf			*agbp;
+	struct xfs_buf			*agbp = NULL;
 	struct xfs_refcount_recovery	*rr, *n;
 	struct list_head		debris;
 	union xfs_btree_irec		low = {
@@ -2097,7 +2098,12 @@ xfs_refcount_recover_cow_leftovers(
 
 	/* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
 	BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
-	if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+	if (pag && mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+		return -EOPNOTSUPP;
+
+	/* rtreflink filesystems can't have rtgroups larger than 2^31-1 blocks */
+	BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG);
+	if (rtg && mp->m_sb.sb_rgblocks >= XFS_MAX_RGBLOCKS)
 		return -EOPNOTSUPP;
 
 	INIT_LIST_HEAD(&debris);
@@ -2116,16 +2122,25 @@ xfs_refcount_recover_cow_leftovers(
 	if (error)
 		return error;
 
-	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
-	if (error)
-		goto out_trans;
-	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+	if (rtg) {
+		xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
+		cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg,
+				rtg->rtg_refcountip);
+	} else {
+		error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+		if (error)
+			goto out_trans;
+		cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+	}
 
 	/* Find all the leftover CoW staging extents. */
 	error = xfs_btree_query_range(cur, &low, &high,
 			xfs_refcount_recover_extent, &debris);
 	xfs_btree_del_cursor(cur, error);
-	xfs_trans_brelse(tp, agbp);
+	if (agbp)
+		xfs_trans_brelse(tp, agbp);
+	else
+		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
 	xfs_trans_cancel(tp);
 	if (error)
 		goto out_free;
@@ -2138,15 +2153,20 @@ xfs_refcount_recover_cow_leftovers(
 			goto out_free;
 
 		/* Free the orphan record */
-		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
-				rr->rr_rrec.rc_startblock);
-		xfs_refcount_free_cow_extent(tp, false, fsb,
+		if (rtg)
+			fsb = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno,
+					rr->rr_rrec.rc_startblock);
+		else
+			fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+					rr->rr_rrec.rc_startblock);
+		xfs_refcount_free_cow_extent(tp, rtg != NULL, fsb,
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
 		error = xfs_free_extent_later(tp, fsb,
 				rr->rr_rrec.rc_blockcount, NULL,
-				XFS_AG_RESV_NONE, 0);
+				XFS_AG_RESV_NONE,
+				rtg != NULL ? XFS_FREE_EXTENT_REALTIME : 0);
 		if (error)
 			goto out_trans;
 
@@ -2170,6 +2190,22 @@ xfs_refcount_recover_cow_leftovers(
 	return error;
 }
 
+int
+xfs_refcount_recover_cow_leftovers(
+	struct xfs_mount		*mp,
+	struct xfs_perag		*pag)
+{
+	return xfs_refcount_recover_group_cow_leftovers(mp, pag, NULL);
+}
+
+int
+xfs_refcount_recover_rtcow_leftovers(
+	struct xfs_mount		*mp,
+	struct xfs_rtgroup		*rtg)
+{
+	return xfs_refcount_recover_group_cow_leftovers(mp, NULL, rtg);
+}
+
 /*
  * Scan part of the keyspace of the refcount records and tell us if the area
  * has no records, is fully mapped by records, or is partially filled.
diff --git a/libxfs/xfs_refcount.h b/libxfs/xfs_refcount.h
index 56e5834feb6..18e0479d3d1 100644
--- a/libxfs/xfs_refcount.h
+++ b/libxfs/xfs_refcount.h
@@ -97,8 +97,10 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
 		xfs_fsblock_t fsb, xfs_extlen_t len);
 void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
 		xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		struct xfs_perag *pag);
+int xfs_refcount_recover_rtcow_leftovers(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg);
 
 /*
  * While we're adjusting the refcounts records of an extent, we have


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/42] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-27 13:30   ` [PATCH 16/42] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
@ 2023-12-27 13:30   ` Darrick J. Wong
  2023-12-27 13:30   ` [PATCH 18/42] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
                     ` (24 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, we (ab)use xfs_get_extsz_hint so that it always returns a
nonzero value for realtime files.  This apparently was done to disable
delayed allocation for realtime files.

However, once we enable realtime reflink, we can also turn on the
alwayscow flag to force CoW writes to realtime files.  In this case, the
logic will incorrectly send the write through the delalloc write path.

Fix this by adjusting the logic slightly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 9e30b4441c1..c87ca1938ec 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -6378,9 +6378,8 @@ xfs_get_extsz_hint(
 	 * No point in aligning allocations if we need to COW to actually
 	 * write to them.
 	 */
-	if (xfs_is_always_cow_inode(ip))
-		return 0;
-	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+	if (!xfs_is_always_cow_inode(ip) &&
+	    (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
 		return ip->i_extsize;
 	if (XFS_IS_REALTIME_INODE(ip))
 		return ip->i_mount->m_sb.sb_rextsize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/42] xfs: apply rt extent alignment constraints to CoW extsize hint
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-27 13:30   ` [PATCH 17/42] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
@ 2023-12-27 13:30   ` Darrick J. Wong
  2023-12-27 13:31   ` [PATCH 19/42] xfs: enable extent size hints for CoW operations Darrick J. Wong
                     ` (23 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The copy-on-write extent size hint is subject to the same alignment
constraints as the regular extent size hint.  Since we're in the process
of adding reflink (and therefore CoW) to the realtime device, we must
apply the same scattered rextsize alignment validation strategies to
both hints to deal with the possibility of rextsize changing.

Therefore, fix the inode validator to perform rextsize alignment checks
on regular realtime files, and to remove misaligned directory hints.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/logitem.c       |   14 ++++++++++++++
 libxfs/xfs_inode_buf.c |   25 ++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)


diff --git a/libxfs/logitem.c b/libxfs/logitem.c
index 3ce2d7574a3..60f8fcb187d 100644
--- a/libxfs/logitem.c
+++ b/libxfs/logitem.c
@@ -233,6 +233,20 @@ xfs_inode_item_precommit(
 	if (flags & XFS_ILOG_IVERSION)
 		flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
 
+	/*
+	 * Inode verifiers do not check that the CoW extent size hint is an
+	 * integer multiple of the rt extent size on a directory with both
+	 * rtinherit and cowextsize flags set.  If we're logging a directory
+	 * that is misconfigured in this way, clear the hint.
+	 */
+	if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = 0;
+		flags |= XFS_ILOG_CORE;
+	}
+
 	if (!iip->ili_item.li_buf) {
 		struct xfs_buf	*bp;
 		int		error;
diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index f4eabd99a60..d9fc254b354 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -858,11 +858,29 @@ xfs_inode_validate_cowextsize(
 	bool				rt_flag;
 	bool				hint_flag;
 	uint32_t			cowextsize_bytes;
+	uint32_t			blocksize_bytes;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
 	cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
 
+	/*
+	 * Similar to extent size hints, a directory can be configured to
+	 * propagate realtime status and a CoW extent size hint to newly
+	 * created files even if there is no realtime device, and the hints on
+	 * disk can become misaligned if the sysadmin changes the rt extent
+	 * size while adding the realtime device.
+	 *
+	 * Therefore, we can only enforce the rextsize alignment check against
+	 * regular realtime files, and rely on callers to decide when alignment
+	 * checks are appropriate, and fix things up as needed.
+	 */
+
+	if (rt_flag)
+		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+	else
+		blocksize_bytes = mp->m_sb.sb_blocksize;
+
 	if (hint_flag && !xfs_has_reflink(mp))
 		return __this_address;
 
@@ -876,16 +894,13 @@ xfs_inode_validate_cowextsize(
 	if (mode && !hint_flag && cowextsize != 0)
 		return __this_address;
 
-	if (hint_flag && rt_flag)
-		return __this_address;
-
-	if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+	if (cowextsize_bytes % blocksize_bytes)
 		return __this_address;
 
 	if (cowextsize > XFS_MAX_BMBT_EXTLEN)
 		return __this_address;
 
-	if (cowextsize > mp->m_sb.sb_agblocks / 2)
+	if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2)
 		return __this_address;
 
 	return NULL;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/42] xfs: enable extent size hints for CoW operations
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-27 13:30   ` [PATCH 18/42] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
@ 2023-12-27 13:31   ` Darrick J. Wong
  2023-12-27 13:31   ` [PATCH 20/42] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
                     ` (22 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up the copy-on-write extent size hint for realtime files, and
connect it to the rt allocator so that we avoid fragmentation on rt
filesystems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index c87ca1938ec..bc703b13b7e 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -6401,7 +6401,13 @@ xfs_get_cowextsz_hint(
 	a = 0;
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
-	b = xfs_get_extsz_hint(ip);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		b = 0;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+			b = ip->i_extsize;
+	} else {
+		b = xfs_get_extsz_hint(ip);
+	}
 
 	a = max(a, b);
 	if (a == 0)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/42] xfs: report realtime refcount btree corruption errors to the health system
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-27 13:31   ` [PATCH 19/42] xfs: enable extent size hints for CoW operations Darrick J. Wong
@ 2023-12-27 13:31   ` Darrick J. Wong
  2023-12-27 13:31   ` [PATCH 21/42] xfs: scrub the realtime refcount btree Darrick J. Wong
                     ` (21 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we encounter corrupt realtime refcount btree blocks, we should
report that to the health monitoring system for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs_staging.h               |    1 +
 libxfs/xfs_health.h                   |    4 +++-
 libxfs/xfs_inode_fork.c               |    4 +++-
 libxfs/xfs_rtrefcount_btree.c         |    5 ++++-
 man/man2/ioctl_xfs_rtgroup_geometry.2 |    3 +++
 5 files changed, 14 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h
index 9d5d6af62b6..9f0c03103f0 100644
--- a/libxfs/xfs_fs_staging.h
+++ b/libxfs/xfs_fs_staging.h
@@ -217,6 +217,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
 #define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1 << 2)  /* reverse mappings */
+#define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1 << 3)  /* reference counts */
 
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
 
diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index aeeb6276977..4fe4daca4c4 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -69,6 +69,7 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
 #define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
 #define XFS_SICK_RT_RMAPBT	(1 << 3)  /* reverse mappings */
+#define XFS_SICK_RT_REFCNTBT	(1 << 4)  /* reference counts */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -115,7 +116,8 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY | \
 				 XFS_SICK_RT_SUPER | \
-				 XFS_SICK_RT_RMAPBT)
+				 XFS_SICK_RT_RMAPBT | \
+				 XFS_SICK_RT_REFCNTBT)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c
index bfc06af904e..a25f6bd1f20 100644
--- a/libxfs/xfs_inode_fork.c
+++ b/libxfs/xfs_inode_fork.c
@@ -271,8 +271,10 @@ xfs_iformat_data_fork(
 			}
 			return xfs_iformat_rtrmap(ip, dip);
 		case XFS_DINODE_FMT_REFCOUNT:
-			if (!xfs_has_rtreflink(ip->i_mount))
+			if (!xfs_has_rtreflink(ip->i_mount)) {
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 				return -EFSCORRUPTED;
+			}
 			return xfs_iformat_rtrefcount(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
diff --git a/libxfs/xfs_rtrefcount_btree.c b/libxfs/xfs_rtrefcount_btree.c
index fa04395eed0..035e41137a6 100644
--- a/libxfs/xfs_rtrefcount_btree.c
+++ b/libxfs/xfs_rtrefcount_btree.c
@@ -25,6 +25,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_imeta.h"
+#include "xfs_health.h"
 
 static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
 
@@ -692,8 +693,10 @@ xfs_iformat_rtrefcount(
 	level = be16_to_cpu(dfp->bb_level);
 
 	if (level > mp->m_rtrefc_maxlevels ||
-	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize)
+	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
+	}
 
 	xfs_iroot_alloc(ip, XFS_DATA_FORK,
 			xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
diff --git a/man/man2/ioctl_xfs_rtgroup_geometry.2 b/man/man2/ioctl_xfs_rtgroup_geometry.2
index 38753b93055..0e4e4592b22 100644
--- a/man/man2/ioctl_xfs_rtgroup_geometry.2
+++ b/man/man2/ioctl_xfs_rtgroup_geometry.2
@@ -76,6 +76,9 @@ Realtime bitmap for this group.
 .TP
 .B XFS_RTGROUP_GEOM_SICK_RTRMAPBT
 Reverse mapping btree for this group.
+.TP
+.B XFS_RTGROUP_GEOM_SICK_REFCNTBT
+Reference count btree for this group.
 .RE
 .SH RETURN VALUE
 On error, \-1 is returned, and


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/42] xfs: scrub the realtime refcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-27 13:31   ` [PATCH 20/42] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
@ 2023-12-27 13:31   ` Darrick J. Wong
  2023-12-27 13:31   ` [PATCH 22/42] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
                     ` (20 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add code to scrub realtime refcount btrees.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h                     |    3 ++-
 man/man2/ioctl_xfs_scrub_metadata.2 |    9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 0bbdbfb0a8a..7847da61db2 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -738,9 +738,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 #define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
 #define XFS_SCRUB_TYPE_RTRMAPBT	32	/* rtgroup reverse mapping btree */
+#define XFS_SCRUB_TYPE_RTREFCBT	33	/* realtime reference count btree */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	33
+#define XFS_SCRUB_TYPE_NR	34
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/man/man2/ioctl_xfs_scrub_metadata.2 b/man/man2/ioctl_xfs_scrub_metadata.2
index 79875968d1c..e8717b41ccd 100644
--- a/man/man2/ioctl_xfs_scrub_metadata.2
+++ b/man/man2/ioctl_xfs_scrub_metadata.2
@@ -100,11 +100,12 @@ must be zero.
 .PP
 .nf
 .B XFS_SCRUB_TYPE_RGBITMAP
-.fi
-.TP
 .B XFS_SCRUB_TYPE_RTRMAPBT
-Examine a given realtime allocation group's free space bitmap or reverse
-mapping btree, respectively.
+.fi
+.TP
+.B XFS_SCRUB_TYPE_RTREFCBT
+Examine a given realtime allocation group's free space bitmap, reverse
+mapping btree, or reference count btree, respectively.
 Records are checked for obviously incorrect values and cross-referenced
 with other allocation group metadata records to ensure that there are no
 conflicts.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/42] xfs: scrub the metadir path of rt refcount btree files
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-27 13:31   ` [PATCH 21/42] xfs: scrub the realtime refcount btree Darrick J. Wong
@ 2023-12-27 13:31   ` Darrick J. Wong
  2023-12-27 13:32   ` [PATCH 23/42] libfrog: enable scrubbing of the realtime refcount data Darrick J. Wong
                     ` (19 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata
directory tree path to the refcount btree file for each rt group.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c |    5 +++++
 libxfs/xfs_fs.h |    3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index 290ba0fb8bf..97b3d533910 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -207,6 +207,11 @@ const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
 		.descr	= "rmap btree file metadir path",
 		.group	= XFROG_SCRUB_GROUP_RTGROUP,
 	},
+	[XFS_SCRUB_METAPATH_RTREFCBT]	= {
+		.name	= "rtrefcbt",
+		.descr	= "refcount btree file metadir path",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 /* Invoke the scrub ioctl.  Returns zero or negative error code. */
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 7847da61db2..4159e96d01a 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -806,9 +806,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_METAPATH_GRPQUOTA	3
 #define XFS_SCRUB_METAPATH_PRJQUOTA	4
 #define XFS_SCRUB_METAPATH_RTRMAPBT	5
+#define XFS_SCRUB_METAPATH_RTREFCBT	6
 
 /* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR		6
+#define XFS_SCRUB_METAPATH_NR		7
 
 /*
  * ioctl limits


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/42] libfrog: enable scrubbing of the realtime refcount data
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-27 13:31   ` [PATCH 22/42] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
@ 2023-12-27 13:32   ` Darrick J. Wong
  2023-12-27 13:32   ` [PATCH 24/42] xfs_db: display the realtime refcount btree contents Darrick J. Wong
                     ` (18 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new entry so that we can scrub the rtrefcountbt.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c |    5 +++++
 scrub/repair.c  |    1 +
 2 files changed, 6 insertions(+)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index 97b3d533910..62220f1eb0b 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -174,6 +174,11 @@ const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR] = {
 		.descr	= "realtime reverse mapping btree",
 		.group	= XFROG_SCRUB_GROUP_RTGROUP,
 	},
+	[XFS_SCRUB_TYPE_RTREFCBT] = {
+		.name	= "rtrefcountbt",
+		.descr	= "realtime reference count btree",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
diff --git a/scrub/repair.c b/scrub/repair.c
index 72533ab5b02..08d11079547 100644
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -556,6 +556,7 @@ repair_item_difficulty(
 		case XFS_SCRUB_TYPE_RTBITMAP:
 		case XFS_SCRUB_TYPE_RTSUM:
 		case XFS_SCRUB_TYPE_RGSUPER:
+		case XFS_SCRUB_TYPE_RTREFCBT:
 			ret |= REPAIR_DIFFICULTY_PRIMARY;
 			break;
 		}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/42] xfs_db: display the realtime refcount btree contents
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-27 13:32   ` [PATCH 23/42] libfrog: enable scrubbing of the realtime refcount data Darrick J. Wong
@ 2023-12-27 13:32   ` Darrick J. Wong
  2023-12-27 13:32   ` [PATCH 25/42] xfs_db: support the realtime refcountbt Darrick J. Wong
                     ` (17 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement all the code we need to dump rtrefcountbt contents, starting
from the root inode.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/bmroot.c              |  148 ++++++++++++++++++++++++++++++++++++++++++++++
 db/bmroot.h              |    2 +
 db/btblock.c             |   50 ++++++++++++++++
 db/btblock.h             |    5 ++
 db/field.c               |   15 +++++
 db/field.h               |    6 ++
 db/inode.c               |   73 ++++++++++++++++++++++-
 db/inode.h               |    1 
 db/type.c                |    5 ++
 db/type.h                |    1 
 libxfs/libxfs_api_defs.h |    5 ++
 man/man8/xfs_db.8        |   48 +++++++++++++++
 12 files changed, 357 insertions(+), 2 deletions(-)


diff --git a/db/bmroot.c b/db/bmroot.c
index 19490bd2499..cb334aa4583 100644
--- a/db/bmroot.c
+++ b/db/bmroot.c
@@ -31,6 +31,13 @@ static int	rtrmaproot_key_offset(void *obj, int startoff, int idx);
 static int	rtrmaproot_ptr_count(void *obj, int startoff);
 static int	rtrmaproot_ptr_offset(void *obj, int startoff, int idx);
 
+static int	rtrefcroot_rec_count(void *obj, int startoff);
+static int	rtrefcroot_rec_offset(void *obj, int startoff, int idx);
+static int	rtrefcroot_key_count(void *obj, int startoff);
+static int	rtrefcroot_key_offset(void *obj, int startoff, int idx);
+static int	rtrefcroot_ptr_count(void *obj, int startoff);
+static int	rtrefcroot_ptr_offset(void *obj, int startoff, int idx);
+
 #define	OFF(f)	bitize(offsetof(xfs_bmdr_block_t, bb_ ## f))
 const field_t	bmroota_flds[] = {
 	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
@@ -73,6 +80,19 @@ const field_t	rtrmaproot_flds[] = {
 	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_RTRMAPBT },
 	{ NULL }
 };
+
+/* realtime refcount btree root */
+const field_t	rtrefcroot_flds[] = {
+	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
+	{ "numrecs", FLDT_UINT16D, OI(OFF(numrecs)), C1, 0, TYP_NONE },
+	{ "recs", FLDT_RTREFCBTREC, rtrefcroot_rec_offset, rtrefcroot_rec_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "keys", FLDT_RTREFCBTKEY, rtrefcroot_key_offset, rtrefcroot_key_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_NONE },
+	{ "ptrs", FLDT_RTREFCBTPTR, rtrefcroot_ptr_offset, rtrefcroot_ptr_count,
+	  FLD_ARRAY|FLD_ABASE1|FLD_COUNT|FLD_OFFSET, TYP_RTREFCBT },
+	{ NULL }
+};
 #undef OFF
 
 static int
@@ -390,3 +410,131 @@ rtrmaproot_size(
 	dip = obj;
 	return bitize((int)XFS_DFORK_DSIZE(dip, mp));
 }
+
+/* realtime refcount root */
+static int
+rtrefcroot_rec_count(
+	void				*obj,
+	int				startoff)
+{
+	struct xfs_rtrefcount_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode		*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) > 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrefcroot_rec_offset(
+	void				*obj,
+	int				startoff,
+	int				idx)
+{
+	struct xfs_rtrefcount_root	*block;
+	struct xfs_refcount_rec		*kp;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) == 0);
+	kp = xfs_rtrefcount_droot_rec_addr(block, idx);
+	return bitize((int)((char *)kp - (char *)block));
+}
+
+static int
+rtrefcroot_key_count(
+	void				*obj,
+	int				startoff)
+{
+	struct xfs_rtrefcount_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode		*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) == 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrefcroot_key_offset(
+	void				*obj,
+	int				startoff,
+	int				idx)
+{
+	struct xfs_rtrefcount_root	*block;
+	struct xfs_refcount_key		*kp;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+	kp = xfs_rtrefcount_droot_key_addr(block, idx);
+	return bitize((int)((char *)kp - (char *)block));
+}
+
+static int
+rtrefcroot_ptr_count(
+	void				*obj,
+	int				startoff)
+{
+	struct xfs_rtrefcount_root	*block;
+#ifdef DEBUG
+	struct xfs_dinode		*dip = obj;
+#endif
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT((char *)block == XFS_DFORK_DPTR(dip));
+	if (be16_to_cpu(block->bb_level) == 0)
+		return 0;
+	return be16_to_cpu(block->bb_numrecs);
+}
+
+static int
+rtrefcroot_ptr_offset(
+	void				*obj,
+	int				startoff,
+	int				idx)
+{
+	struct xfs_rtrefcount_root	*block;
+	xfs_rtrefcount_ptr_t		*pp;
+	struct xfs_dinode		*dip;
+	int				dmxr;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	dip = obj;
+	block = (struct xfs_rtrefcount_root *)((char *)obj + byteize(startoff));
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+	dmxr = libxfs_rtrefcountbt_droot_maxrecs(XFS_DFORK_DSIZE(dip, mp), false);
+	pp = xfs_rtrefcount_droot_ptr_addr(block, idx, dmxr);
+	return bitize((int)((char *)pp - (char *)block));
+}
+
+int
+rtrefcroot_size(
+	void			*obj,
+	int			startoff,
+	int			idx)
+{
+	struct xfs_dinode	*dip;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	ASSERT(idx == 0);
+	dip = obj;
+	return bitize((int)XFS_DFORK_DSIZE(dip, mp));
+}
diff --git a/db/bmroot.h b/db/bmroot.h
index a2c5cfb18f0..70bc8483cd8 100644
--- a/db/bmroot.h
+++ b/db/bmroot.h
@@ -9,7 +9,9 @@ extern const struct field	bmroota_key_flds[];
 extern const struct field	bmrootd_flds[];
 extern const struct field	bmrootd_key_flds[];
 extern const struct field	rtrmaproot_flds[];
+extern const struct field	rtrefcroot_flds[];
 
 extern int	bmroota_size(void *obj, int startoff, int idx);
 extern int	bmrootd_size(void *obj, int startoff, int idx);
 extern int	rtrmaproot_size(void *obj, int startoff, int idx);
+extern int	rtrefcroot_size(void *obj, int startoff, int idx);
diff --git a/db/btblock.c b/db/btblock.c
index 70f6c3f6aed..40913a09437 100644
--- a/db/btblock.c
+++ b/db/btblock.c
@@ -104,6 +104,12 @@ static struct xfs_db_btree {
 		sizeof(struct xfs_refcount_rec),
 		sizeof(__be32),
 	},
+	{	XFS_RTREFC_CRC_MAGIC,
+		XFS_BTREE_LBLOCK_CRC_LEN,
+		sizeof(struct xfs_refcount_key),
+		sizeof(struct xfs_refcount_rec),
+		sizeof(__be64),
+	},
 	{	0,
 	},
 };
@@ -962,3 +968,47 @@ const field_t	refcbt_rec_flds[] = {
 	{ NULL }
 };
 #undef ROFF
+
+/* realtime refcount btree blocks */
+const field_t	rtrefcbt_crc_hfld[] = {
+	{ "", FLDT_RTREFCBT_CRC, OI(0), C1, 0, TYP_NONE },
+	{ NULL }
+};
+
+#define	OFF(f)	bitize(offsetof(struct xfs_btree_block, bb_ ## f))
+const field_t	rtrefcbt_crc_flds[] = {
+	{ "magic", FLDT_UINT32X, OI(OFF(magic)), C1, 0, TYP_NONE },
+	{ "level", FLDT_UINT16D, OI(OFF(level)), C1, 0, TYP_NONE },
+	{ "numrecs", FLDT_UINT16D, OI(OFF(numrecs)), C1, 0, TYP_NONE },
+	{ "leftsib", FLDT_DFSBNO, OI(OFF(u.l.bb_leftsib)), C1, 0, TYP_RTREFCBT },
+	{ "rightsib", FLDT_DFSBNO, OI(OFF(u.l.bb_rightsib)), C1, 0, TYP_RTREFCBT },
+	{ "bno", FLDT_DFSBNO, OI(OFF(u.l.bb_blkno)), C1, 0, TYP_REFCBT },
+	{ "lsn", FLDT_UINT64X, OI(OFF(u.l.bb_lsn)), C1, 0, TYP_NONE },
+	{ "uuid", FLDT_UUID, OI(OFF(u.l.bb_uuid)), C1, 0, TYP_NONE },
+	{ "owner", FLDT_INO, OI(OFF(u.l.bb_owner)), C1, 0, TYP_NONE },
+	{ "crc", FLDT_CRC, OI(OFF(u.l.bb_crc)), C1, 0, TYP_NONE },
+	{ "recs", FLDT_RTREFCBTREC, btblock_rec_offset, btblock_rec_count,
+	  FLD_ARRAY | FLD_ABASE1 | FLD_COUNT | FLD_OFFSET, TYP_NONE },
+	{ "keys", FLDT_RTREFCBTKEY, btblock_key_offset, btblock_key_count,
+	  FLD_ARRAY | FLD_ABASE1 | FLD_COUNT | FLD_OFFSET, TYP_NONE },
+	{ "ptrs", FLDT_RTREFCBTPTR, btblock_ptr_offset, btblock_key_count,
+	  FLD_ARRAY | FLD_ABASE1 | FLD_COUNT | FLD_OFFSET, TYP_RTREFCBT },
+	{ NULL }
+};
+#undef OFF
+
+const field_t	rtrefcbt_key_flds[] = {
+	{ "startblock", FLDT_CRGBLOCK, OI(REFCNTBT_STARTBLOCK_BITOFF), C1, 0, TYP_DATA },
+	{ "cowflag", FLDT_CCOWFLG, OI(REFCNTBT_COWFLAG_BITOFF), C1, 0, TYP_DATA },
+	{ NULL }
+};
+
+#define	ROFF(f)	bitize(offsetof(struct xfs_refcount_rec, rc_ ## f))
+const field_t	rtrefcbt_rec_flds[] = {
+	{ "startblock", FLDT_CRGBLOCK, OI(REFCNTBT_STARTBLOCK_BITOFF), C1, 0, TYP_DATA },
+	{ "blockcount", FLDT_EXTLEN, OI(ROFF(blockcount)), C1, 0, TYP_NONE },
+	{ "refcount", FLDT_UINT32D, OI(ROFF(refcount)), C1, 0, TYP_DATA },
+	{ "cowflag", FLDT_CCOWFLG, OI(REFCNTBT_COWFLAG_BITOFF), C1, 0, TYP_DATA },
+	{ NULL }
+};
+#undef ROFF
diff --git a/db/btblock.h b/db/btblock.h
index b4013ea8073..5bbe857a7ef 100644
--- a/db/btblock.h
+++ b/db/btblock.h
@@ -63,4 +63,9 @@ extern const struct field	refcbt_crc_hfld[];
 extern const struct field	refcbt_key_flds[];
 extern const struct field	refcbt_rec_flds[];
 
+extern const struct field	rtrefcbt_crc_flds[];
+extern const struct field	rtrefcbt_crc_hfld[];
+extern const struct field	rtrefcbt_key_flds[];
+extern const struct field	rtrefcbt_rec_flds[];
+
 extern int	btblock_size(void *obj, int startoff, int idx);
diff --git a/db/field.c b/db/field.c
index b3efbb5698d..66f3dd28ada 100644
--- a/db/field.c
+++ b/db/field.c
@@ -204,6 +204,21 @@ const ftattr_t	ftattrtab[] = {
 	{ FLDT_REFCBTREC, "refcntbtrec", fp_sarray, (char *)refcbt_rec_flds,
 	  SI(bitsz(struct xfs_refcount_rec)), 0, NULL, refcbt_rec_flds },
 
+	{ FLDT_CRGBLOCK, "crgblock", fp_num, "%u", SI(REFCNTBT_AGBLOCK_BITLEN),
+	  FTARG_DONULL, NULL, NULL },
+	{ FLDT_RTREFCBT_CRC, "rtrefcntbt", NULL, (char *)rtrefcbt_crc_flds,
+	  btblock_size, FTARG_SIZE, NULL, rtrefcbt_crc_flds },
+	{ FLDT_RTREFCBTKEY, "rtrefcntbtkey", fp_sarray,
+	  (char *)rtrefcbt_key_flds, SI(bitsz(struct xfs_refcount_key)), 0,
+	  NULL, rtrefcbt_key_flds },
+	{ FLDT_RTREFCBTPTR, "rtrefcntbtptr", fp_num, "%u",
+	  SI(bitsz(xfs_rtrefcount_ptr_t)), 0, fa_dfsbno, NULL },
+	{ FLDT_RTREFCBTREC, "rtrefcntbtrec", fp_sarray,
+	  (char *)rtrefcbt_rec_flds, SI(bitsz(struct xfs_refcount_rec)), 0,
+	  NULL, rtrefcbt_rec_flds },
+	{ FLDT_RTREFCROOT, "rtrefcroot", NULL, (char *)rtrefcroot_flds,
+	  rtrefcroot_size, FTARG_SIZE, NULL, rtrefcroot_flds },
+
 /* CRC field */
 	{ FLDT_CRC, "crc", fp_crc, "%#x (%s)", SI(bitsz(uint32_t)),
 	  0, NULL, NULL },
diff --git a/db/field.h b/db/field.h
index db3e13d3927..4a5b35f4a56 100644
--- a/db/field.h
+++ b/db/field.h
@@ -92,6 +92,12 @@ typedef enum fldt	{
 	FLDT_REFCBTKEY,
 	FLDT_REFCBTPTR,
 	FLDT_REFCBTREC,
+	FLDT_CRGBLOCK,
+	FLDT_RTREFCBT_CRC,
+	FLDT_RTREFCBTKEY,
+	FLDT_RTREFCBTPTR,
+	FLDT_RTREFCBTREC,
+	FLDT_RTREFCROOT,
 
 	/* CRC field type */
 	FLDT_CRC,
diff --git a/db/inode.c b/db/inode.c
index 492a8f53ed0..760d47c0f04 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -49,6 +49,7 @@ static int	inode_u_sfdir2_count(void *obj, int startoff);
 static int	inode_u_sfdir3_count(void *obj, int startoff);
 static int	inode_u_symlink_count(void *obj, int startoff);
 static int	inode_u_rtrmapbt_count(void *obj, int startoff);
+static int	inode_u_rtrefcbt_count(void *obj, int startoff);
 
 static const cmdinfo_t	inode_cmd =
 	{ "inode", NULL, inode_f, 0, 1, 1, "[inode#]",
@@ -234,6 +235,8 @@ const field_t	inode_u_flds[] = {
 	  TYP_NONE },
 	{ "rtrmapbt", FLDT_RTRMAPROOT, NULL, inode_u_rtrmapbt_count, FLD_COUNT,
 	  TYP_NONE },
+	{ "rtrefcbt", FLDT_RTREFCROOT, NULL, inode_u_rtrefcbt_count, FLD_COUNT,
+	  TYP_NONE },
 	{ NULL }
 };
 
@@ -247,7 +250,7 @@ const field_t	inode_a_flds[] = {
 };
 
 static const char	*dinode_fmt_name[] =
-	{ "dev", "local", "extents", "btree", "uuid", "rmap" };
+	{ "dev", "local", "extents", "btree", "uuid", "rmap", "refcount" };
 static const int	dinode_fmt_name_size =
 	sizeof(dinode_fmt_name) / sizeof(dinode_fmt_name[0]);
 
@@ -643,6 +646,7 @@ struct rtgroup_inodes {
 
 static struct rtgroup_inodes	*rtgroup_inodes;
 static struct bitmap		*rmap_inodes;
+static struct bitmap		*refcount_inodes;
 
 static inline int
 set_rtgroup_rmap_inode(
@@ -687,6 +691,44 @@ set_rtgroup_rmap_inode(
 	return error;
 }
 
+static inline int
+set_rtgroup_refcount_inode(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_imeta_path	*path;
+	struct xfs_trans	*tp;
+	xfs_ino_t		rtino;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	error = -libxfs_rtrefcountbt_create_path(mp, rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	error = -libxfs_imeta_lookup(tp, path, &rtino);
+	if (error)
+		goto out_trans;
+
+	if (rtino == NULLFSINO) {
+		error = EFSCORRUPTED;
+		goto out_trans;
+	}
+
+	error = bitmap_set(refcount_inodes, rtino, 1);
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
 int
 init_rtmeta_inode_bitmaps(
 	struct xfs_mount	*mp)
@@ -706,10 +748,17 @@ init_rtmeta_inode_bitmaps(
 	if (error)
 		return error;
 
+	error = bitmap_alloc(&refcount_inodes);
+	if (error)
+		return error;
+
 	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
 		int err2 = set_rtgroup_rmap_inode(mp, rgno);
 		if (err2 && !error)
 			error = err2;
+		err2 = set_rtgroup_refcount_inode(mp, rgno);
+		if (err2 && !error)
+			error = err2;
 	}
 
 	return error;
@@ -732,6 +781,11 @@ xfs_rgnumber_t rtgroup_for_rtrmap_ino(struct xfs_mount *mp, xfs_ino_t ino)
 	return NULLRGNUMBER;
 }
 
+bool is_rtrefcount_inode(xfs_ino_t ino)
+{
+	return bitmap_test(refcount_inodes, ino, 1);
+}
+
 typnm_t
 inode_next_type(void)
 {
@@ -764,6 +818,9 @@ inode_next_type(void)
 			return TYP_DQBLK;
 		else if (is_rtrmap_inode(iocur_top->ino))
 			return TYP_RTRMAPBT;
+		else if (is_rtrefcount_inode(iocur_top->ino))
+			return TYP_RTREFCBT;
+
 		return TYP_DATA;
 	default:
 		return TYP_NONE;
@@ -912,6 +969,20 @@ inode_u_rtrmapbt_count(
 	return dip->di_format == XFS_DINODE_FMT_RMAP;
 }
 
+static int
+inode_u_rtrefcbt_count(
+	void			*obj,
+	int			startoff)
+{
+	struct xfs_dinode	*dip;
+
+	ASSERT(bitoffs(startoff) == 0);
+	ASSERT(obj == iocur_top->data);
+	dip = obj;
+	ASSERT((char *)XFS_DFORK_DPTR(dip) - (char *)dip == byteize(startoff));
+	return dip->di_format == XFS_DINODE_FMT_REFCOUNT;
+}
+
 int
 inode_u_size(
 	void			*obj,
diff --git a/db/inode.h b/db/inode.h
index 04e606abed3..666bb5201ea 100644
--- a/db/inode.h
+++ b/db/inode.h
@@ -27,3 +27,4 @@ extern void	set_cur_inode(xfs_ino_t ino);
 int init_rtmeta_inode_bitmaps(struct xfs_mount *mp);
 bool is_rtrmap_inode(xfs_ino_t ino);
 xfs_rgnumber_t rtgroup_for_rtrmap_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool is_rtrefcount_inode(xfs_ino_t ino);
diff --git a/db/type.c b/db/type.c
index 1dfc33ffb44..324f416a49c 100644
--- a/db/type.c
+++ b/db/type.c
@@ -53,6 +53,7 @@ static const typ_t	__typtab[] = {
 	{ TYP_RMAPBT, NULL },
 	{ TYP_RTRMAPBT, NULL },
 	{ TYP_REFCBT, NULL },
+	{ TYP_RTREFCBT, NULL },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_DIR2, "dir2", handle_struct, dir2_hfld, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_DQBLK, "dqblk", handle_struct, dqblk_hfld, NULL, TYP_F_NO_CRC_OFF },
@@ -96,6 +97,8 @@ static const typ_t	__typtab_crc[] = {
 		&xfs_rtrmapbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_REFCBT, "refcntbt", handle_struct, refcbt_crc_hfld,
 		&xfs_refcountbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RTREFCBT, "rtrefcntbt", handle_struct, rtrefcbt_crc_hfld,
+		&xfs_rtrefcountbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_DIR2, "dir3", handle_struct, dir3_hfld,
 		&xfs_dir3_db_buf_ops, TYP_F_CRC_FUNC, xfs_dir3_set_crc },
@@ -148,6 +151,8 @@ static const typ_t	__typtab_spcrc[] = {
 		&xfs_rtrmapbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_REFCBT, "refcntbt", handle_struct, refcbt_crc_hfld,
 		&xfs_refcountbt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RTREFCBT, "rtrefcntbt", handle_struct, rtrefcbt_crc_hfld,
+		&xfs_rtrefcountbt_buf_ops, XFS_BTREE_LBLOCK_CRC_OFF },
 	{ TYP_DATA, "data", handle_block, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_DIR2, "dir3", handle_struct, dir3_hfld,
 		&xfs_dir3_db_buf_ops, TYP_F_CRC_FUNC, xfs_dir3_set_crc },
diff --git a/db/type.h b/db/type.h
index c98f3640202..a2488a663db 100644
--- a/db/type.h
+++ b/db/type.h
@@ -22,6 +22,7 @@ typedef enum typnm
 	TYP_RMAPBT,
 	TYP_RTRMAPBT,
 	TYP_REFCBT,
+	TYP_RTREFCBT,
 	TYP_DATA,
 	TYP_DIR2,
 	TYP_DQBLK,
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index cfa8c474160..1a21eb30f39 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -292,6 +292,11 @@
 #define xfs_rtgroup_put			libxfs_rtgroup_put
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
+
+#define xfs_rtrefcountbt_create_path	libxfs_rtrefcountbt_create_path
+#define xfs_rtrefcountbt_droot_maxrecs	libxfs_rtrefcountbt_droot_maxrecs
+#define xfs_rtrefcountbt_maxrecs	libxfs_rtrefcountbt_maxrecs
+
 #define xfs_rtrmapbt_calc_reserves	libxfs_rtrmapbt_calc_reserves
 #define xfs_rtrmapbt_calc_size		libxfs_rtrmapbt_calc_size
 #define xfs_rtrmapbt_commit_staged_btree	libxfs_rtrmapbt_commit_staged_btree
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 778e3f9dd70..71859b3af44 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1201,7 +1201,7 @@ The possible data types are:
 .BR agf ", " agfl ", " agi ", " attr ", " bmapbta ", " bmapbtd ,
 .BR bnobt ", " cntbt ", " data ", " dir ", " dir2 ", " dqblk ,
 .BR inobt ", " inode ", " log ", " refcntbt ", " rmapbt ", " rtbitmap ,
-.BR rtsummary ", " sb ", " symlink ", " rtrmapbt ", and " text .
+.BR rtsummary ", " sb ", " symlink ", " rtrmapbt ", " rtrefcbt ", and " text .
 See the TYPES section below for more information on these data types.
 .TP
 .BI "timelimit [" OPTIONS ]
@@ -2295,6 +2295,52 @@ block number within the allocation group to the next level in the Btree.
 .PD
 .RE
 .TP
+.B rtrefcbt
+There is one reference count Btree for the entire realtime device.  The
+.BR startblock " and "
+.B blockcount
+fields are 32 bits wide and record block counts within a realtime group.
+The root of this Btree is the realtime refcount inode, which is recorded in the
+metadata directory.
+Blocks are linked to sibling left and right blocks at each level, as well as by
+pointers from parent to child blocks.
+Each block has the following fields:
+.RS 1.4i
+.PD 0
+.TP 1.2i
+.B magic
+RTREFC block magic number, 0x52434e54 ('RCNT').
+.TP
+.B level
+level number of this block, 0 is a leaf.
+.TP
+.B numrecs
+number of data entries in the block.
+.TP
+.B leftsib
+left (logically lower) sibling block, 0 if none.
+.TP
+.B rightsib
+right (logically higher) sibling block, 0 if none.
+.TP
+.B recs
+[leaf blocks only] array of reference count records. Each record contains
+.BR startblock ,
+.BR blockcount ,
+and
+.BR refcount .
+.TP
+.B keys
+[non-leaf blocks only] array of key records. These are the first value
+of each block in the level below this one. Each record contains
+.BR startblock .
+.TP
+.B ptrs
+[non-leaf blocks only] array of child block pointers. Each pointer is a
+block number within the allocation group to the next level in the Btree.
+.PD
+.RE
+.TP
 .B rmapbt
 There is one set of filesystem blocks forming the reverse mapping Btree for
 each allocation group. The root block of this Btree is designated by the


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/42] xfs_db: support the realtime refcountbt
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-27 13:32   ` [PATCH 24/42] xfs_db: display the realtime refcount btree contents Darrick J. Wong
@ 2023-12-27 13:32   ` Darrick J. Wong
  2023-12-27 13:32   ` [PATCH 26/42] xfs_db: widen block type mask to 64 bits Darrick J. Wong
                     ` (16 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up various parts of xfs_db for realtime refcount support.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/btblock.c             |    3 +++
 db/btdump.c              |   11 ++++++++++-
 db/btheight.c            |    5 +++++
 libxfs/libxfs_api_defs.h |    1 +
 man/man8/xfs_db.8        |    1 +
 5 files changed, 20 insertions(+), 1 deletion(-)


diff --git a/db/btblock.c b/db/btblock.c
index 40913a09437..e9e5d2f86b9 100644
--- a/db/btblock.c
+++ b/db/btblock.c
@@ -159,6 +159,9 @@ block_to_bt(
 	case TYP_REFCBT:
 		magic = crc ? XFS_REFC_CRC_MAGIC : 0;
 		break;
+	case TYP_RTREFCBT:
+		magic = crc ? XFS_RTREFC_CRC_MAGIC : 0;
+		break;
 	default:
 		ASSERT(0);
 	}
diff --git a/db/btdump.c b/db/btdump.c
index 9c528e5a11a..31f32a8f7a5 100644
--- a/db/btdump.c
+++ b/db/btdump.c
@@ -447,7 +447,8 @@ is_btree_inode(void)
 	struct xfs_dinode	*dip;
 
 	dip = iocur_top->data;
-	return dip->di_format == XFS_DINODE_FMT_RMAP;
+	return dip->di_format == XFS_DINODE_FMT_RMAP ||
+	       dip->di_format == XFS_DINODE_FMT_REFCOUNT;
 }
 
 static int
@@ -457,6 +458,7 @@ dump_btree_inode(
 	char			*prefix;
 	struct xfs_dinode	*dip;
 	struct xfs_rtrmap_root	*rtrmap;
+	struct xfs_rtrefcount_root *rtrefc;
 	int			level;
 	int			numrecs;
 	int			ret;
@@ -469,6 +471,12 @@ dump_btree_inode(
 		level = be16_to_cpu(rtrmap->bb_level);
 		numrecs = be16_to_cpu(rtrmap->bb_numrecs);
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		prefix = "u3.rtrefcbt";
+		rtrefc = (struct xfs_rtrefcount_root *)XFS_DFORK_DPTR(dip);
+		level = be16_to_cpu(rtrefc->bb_level);
+		numrecs = be16_to_cpu(rtrefc->bb_numrecs);
+		break;
 	default:
 		dbprintf("Unknown metadata inode type %u\n", dip->di_format);
 		return 0;
@@ -550,6 +558,7 @@ btdump_f(
 	case TYP_BMAPBTA:
 	case TYP_BMAPBTD:
 	case TYP_RTRMAPBT:
+	case TYP_RTREFCBT:
 		return dump_btree_long(iflag);
 	case TYP_INODE:
 		if (is_btree_inode())
diff --git a/db/btheight.c b/db/btheight.c
index 25ce3400334..9dd21ddae9a 100644
--- a/db/btheight.c
+++ b/db/btheight.c
@@ -58,6 +58,11 @@ struct btmap {
 		.maxlevels	= libxfs_rtrmapbt_maxlevels_ondisk,
 		.maxrecs	= libxfs_rtrmapbt_maxrecs,
 	},
+	{
+		.tag		= "rtrefcountbt",
+		.maxlevels	= libxfs_rtrefcountbt_maxlevels_ondisk,
+		.maxrecs	= libxfs_rtrefcountbt_maxrecs,
+	},
 };
 
 static void
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 1a21eb30f39..36e09b3a237 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -295,6 +295,7 @@
 
 #define xfs_rtrefcountbt_create_path	libxfs_rtrefcountbt_create_path
 #define xfs_rtrefcountbt_droot_maxrecs	libxfs_rtrefcountbt_droot_maxrecs
+#define xfs_rtrefcountbt_maxlevels_ondisk	libxfs_rtrefcountbt_maxlevels_ondisk
 #define xfs_rtrefcountbt_maxrecs	libxfs_rtrefcountbt_maxrecs
 
 #define xfs_rtrmapbt_calc_reserves	libxfs_rtrmapbt_calc_reserves
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 71859b3af44..3e80bcc57de 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -455,6 +455,7 @@ The supported btree types are:
 .IR bmapbt ,
 .IR refcountbt ,
 .IR rmapbt ,
+.IR rtrefcountbt ,
 and
 .IR rtrmapbt .
 The magic value


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/42] xfs_db: widen block type mask to 64 bits
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-27 13:32   ` [PATCH 25/42] xfs_db: support the realtime refcountbt Darrick J. Wong
@ 2023-12-27 13:32   ` Darrick J. Wong
  2023-12-27 13:33   ` [PATCH 27/42] xfs_db: support rudimentary checks of the rtrefcount btree Darrick J. Wong
                     ` (15 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

We're about to enlarge enum dbm beyond 32 items, so we need to widen the
block type mask to 64 bits to avoid problems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |   72 ++++++++++++++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)


diff --git a/db/check.c b/db/check.c
index 351bb94a48e..4ef24c20d79 100644
--- a/db/check.c
+++ b/db/check.c
@@ -282,9 +282,9 @@ static void		check_set_rdbmap(xfs_rfsblock_t bno, xfs_extlen_t len,
 					 dbm_t type1, dbm_t type2);
 static void		check_summary(void);
 static void		checknot_dbmap(xfs_agnumber_t agno, xfs_agblock_t agbno,
-				       xfs_extlen_t len, int typemask);
+				       xfs_extlen_t len, uint64_t typemask);
 static void		checknot_rdbmap(xfs_rfsblock_t bno, xfs_extlen_t len,
-					int typemask);
+					uint64_t typemask);
 static void		dir_hash_add(xfs_dahash_t hash,
 				     xfs_dir2_dataptr_t addr);
 static void		dir_hash_check(inodata_t *id, int v);
@@ -904,7 +904,7 @@ blockget_f(
 	if (!tflag) {	/* are we in test mode, faking out freespace? */
 		for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
 			checknot_dbmap(agno, 0, mp->m_sb.sb_agblocks,
-				(1 << DBM_UNKNOWN) | (1 << DBM_FREE1));
+				(1ULL << DBM_UNKNOWN) | (1ULL << DBM_FREE1));
 	}
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
 		check_linkcounts(agno);
@@ -912,7 +912,7 @@ blockget_f(
 		checknot_rdbmap(0,
 			(xfs_extlen_t)(mp->m_sb.sb_rextents *
 				       mp->m_sb.sb_rextsize),
-			1 << DBM_UNKNOWN);
+			1ULL << DBM_UNKNOWN);
 		check_summary();
 	}
 	if (mp->m_sb.sb_icount != icount) {
@@ -1083,7 +1083,7 @@ blocktrash_f(
 	int		c;
 	int		count;
 	int		done;
-	int		goodmask;
+	uint64_t	goodmask;
 	int		i;
 	ltab_t		*lentab;
 	int		lentablen;
@@ -1095,7 +1095,7 @@ blocktrash_f(
 	xfs_rfsblock_t	randb;
 	uint		seed;
 	int		sopt;
-	int		tmask;
+	uint64_t	tmask;
 	bool		this_block = false;
 	int		bit_offset = -1;
 
@@ -1108,27 +1108,27 @@ blocktrash_f(
 	seed = (unsigned int)(now.tv_sec ^ now.tv_usec);
 	sopt = 0;
 	tmask = 0;
-	goodmask = (1 << DBM_AGF) |
-		   (1 << DBM_AGFL) |
-		   (1 << DBM_AGI) |
-		   (1 << DBM_ATTR) |
-		   (1 << DBM_BTBMAPA) |
-		   (1 << DBM_BTBMAPD) |
-		   (1 << DBM_BTBNO) |
-		   (1 << DBM_BTCNT) |
-		   (1 << DBM_BTINO) |
-		   (1 << DBM_DIR) |
-		   (1 << DBM_INODE) |
-		   (1 << DBM_LOG) |
-		   (1 << DBM_QUOTA) |
-		   (1 << DBM_RTBITMAP) |
-		   (1 << DBM_RTSUM) |
-		   (1 << DBM_BTRTRMAP) |
-		   (1 << DBM_SYMLINK) |
-		   (1 << DBM_BTFINO) |
-		   (1 << DBM_BTRMAP) |
-		   (1 << DBM_BTREFC) |
-		   (1 << DBM_SB);
+	goodmask = (1ULL << DBM_AGF) |
+		   (1ULL << DBM_AGFL) |
+		   (1ULL << DBM_AGI) |
+		   (1ULL << DBM_ATTR) |
+		   (1ULL << DBM_BTBMAPA) |
+		   (1ULL << DBM_BTBMAPD) |
+		   (1ULL << DBM_BTBNO) |
+		   (1ULL << DBM_BTCNT) |
+		   (1ULL << DBM_BTINO) |
+		   (1ULL << DBM_DIR) |
+		   (1ULL << DBM_INODE) |
+		   (1ULL << DBM_LOG) |
+		   (1ULL << DBM_QUOTA) |
+		   (1ULL << DBM_RTBITMAP) |
+		   (1ULL << DBM_RTSUM) |
+		   (1ULL << DBM_BTRTRMAP) |
+		   (1ULL << DBM_SYMLINK) |
+		   (1ULL << DBM_BTFINO) |
+		   (1ULL << DBM_BTRMAP) |
+		   (1ULL << DBM_BTREFC) |
+		   (1ULL << DBM_SB);
 	while ((c = getopt(argc, argv, "0123n:o:s:t:x:y:z")) != EOF) {
 		switch (c) {
 		case '0':
@@ -1174,11 +1174,11 @@ blocktrash_f(
 				if (strcmp(typename[i], optarg) == 0)
 					break;
 			}
-			if (!typename[i] || (((1 << i) & goodmask) == 0)) {
+			if (!typename[i] || (((1ULL << i) & goodmask) == 0)) {
 				dbprintf(_("bad blocktrash type %s\n"), optarg);
 				return 0;
 			}
-			tmask |= 1 << i;
+			tmask |= 1ULL << i;
 			break;
 		case 'x':
 			min = (int)strtol(optarg, &p, 0);
@@ -1217,7 +1217,7 @@ blocktrash_f(
 		return 0;
 	}
 	if (tmask == 0)
-		tmask = goodmask & ~((1 << DBM_LOG) | (1 << DBM_SB));
+		tmask = goodmask & ~((1ULL << DBM_LOG) | (1ULL << DBM_SB));
 	lentab = xmalloc(sizeof(ltab_t));
 	lentab->min = lentab->max = min;
 	lentablen = 1;
@@ -1242,7 +1242,7 @@ blocktrash_f(
 		for (agbno = 0, p = dbmap[agno];
 		     agbno < mp->m_sb.sb_agblocks;
 		     agbno++, p++) {
-			if ((1 << *p) & tmask)
+			if ((1ULL << *p) & tmask)
 				blocks++;
 		}
 	}
@@ -1259,7 +1259,7 @@ blocktrash_f(
 			for (agbno = 0, p = dbmap[agno];
 			     agbno < mp->m_sb.sb_agblocks;
 			     agbno++, p++) {
-				if (!((1 << *p) & tmask))
+				if (!((1ULL << *p) & tmask))
 					continue;
 				if (bi++ < randb)
 					continue;
@@ -1812,7 +1812,7 @@ checknot_dbmap(
 	xfs_agnumber_t	agno,
 	xfs_agblock_t	agbno,
 	xfs_extlen_t	len,
-	int		typemask)
+	uint64_t	typemask)
 {
 	xfs_extlen_t	i;
 	char		*p;
@@ -1820,7 +1820,7 @@ checknot_dbmap(
 	if (!check_range(agno, agbno, len))
 		return;
 	for (i = 0, p = &dbmap[agno][agbno]; i < len; i++, p++) {
-		if ((1 << *p) & typemask) {
+		if ((1ULL << *p) & typemask) {
 			if (!sflag || CHECK_BLISTA(agno, agbno + i))
 				dbprintf(_("block %u/%u type %s not expected\n"),
 					agno, agbno + i, typename[(dbm_t)*p]);
@@ -1833,7 +1833,7 @@ static void
 checknot_rdbmap(
 	xfs_rfsblock_t	bno,
 	xfs_extlen_t	len,
-	int		typemask)
+	uint64_t	typemask)
 {
 	xfs_extlen_t	i;
 	char		*p;
@@ -1841,7 +1841,7 @@ checknot_rdbmap(
 	if (!check_rrange(bno, len))
 		return;
 	for (i = 0, p = &dbmap[mp->m_sb.sb_agcount][bno]; i < len; i++, p++) {
-		if ((1 << *p) & typemask) {
+		if ((1ULL << *p) & typemask) {
 			if (!sflag || CHECK_BLIST(bno + i))
 				dbprintf(_("rtblock %llu type %s not expected\n"),
 					bno + i, typename[(dbm_t)*p]);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/42] xfs_db: support rudimentary checks of the rtrefcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-27 13:32   ` [PATCH 26/42] xfs_db: widen block type mask to 64 bits Darrick J. Wong
@ 2023-12-27 13:33   ` Darrick J. Wong
  2023-12-27 13:33   ` [PATCH 28/42] xfs_db: copy the realtime refcount btree Darrick J. Wong
                     ` (14 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Perform some fairly superficial checks of the rtrefcount btree.  We'll
do more sophisticated checks in xfs_repair, but provide enough of
a spot-check here that we can do simple things.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |  254 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 db/inode.c |   18 ++++
 db/inode.h |    1 
 3 files changed, 265 insertions(+), 8 deletions(-)


diff --git a/db/check.c b/db/check.c
index 4ef24c20d79..b6ad28799af 100644
--- a/db/check.c
+++ b/db/check.c
@@ -59,6 +59,8 @@ typedef enum {
 	DBM_COWDATA,
 	DBM_RTSB,
 	DBM_BTRTRMAP,
+	DBM_BTRTREFC,
+	DBM_RLRTDATA,
 	DBM_NDBM
 } dbm_t;
 
@@ -193,6 +195,8 @@ static const char	*typename[] = {
 	"cowdata",
 	"rtsb",
 	"btrtrmap",
+	"btrtrefc",
+	"rlrtdata",
 	NULL
 };
 
@@ -268,7 +272,7 @@ static void		check_linkcounts(xfs_agnumber_t agno);
 static int		check_range(xfs_agnumber_t agno, xfs_agblock_t agbno,
 				    xfs_extlen_t len);
 static void		check_rdbmap(xfs_rfsblock_t bno, xfs_extlen_t len,
-				     dbm_t type);
+				     dbm_t type, bool is_reflink);
 static int		check_rinomap(xfs_rfsblock_t bno, xfs_extlen_t len,
 				      xfs_ino_t c_ino);
 static void		check_rootdir(void);
@@ -348,6 +352,9 @@ static xfs_ino_t	process_sf_dir_v2(struct xfs_dinode *dip, int *dot,
 static void		process_rtrmap(struct inodata *id,
 				       struct xfs_dinode *dip,
 				       xfs_rfsblock_t *toti);
+static void		process_rtrefc(struct inodata *id,
+				       struct xfs_dinode *dip,
+				       xfs_rfsblock_t *toti);
 static void		quota_add(xfs_dqid_t *p, xfs_dqid_t *g, xfs_dqid_t *u,
 				  int dq, xfs_qcnt_t bc, xfs_qcnt_t ic,
 				  xfs_qcnt_t rc);
@@ -379,6 +386,12 @@ static void		scanfunc_rtrmap(struct xfs_btree_block *block,
 				      xfs_rfsblock_t *toti, xfs_extnum_t *nex,
 				      blkmap_t **blkmapp, int isroot,
 				      typnm_t btype);
+static void		scanfunc_rtrefc(struct xfs_btree_block *block,
+				      int level, dbm_t type, xfs_fsblock_t bno,
+				      inodata_t *id, xfs_rfsblock_t *totd,
+				      xfs_rfsblock_t *toti, xfs_extnum_t *nex,
+				      blkmap_t **blkmapp, int isroot,
+				      typnm_t btype);
 static void		scanfunc_bno(struct xfs_btree_block *block, int level,
 				     xfs_agf_t *agf, xfs_agblock_t bno,
 				     int isroot);
@@ -1128,6 +1141,7 @@ blocktrash_f(
 		   (1ULL << DBM_BTFINO) |
 		   (1ULL << DBM_BTRMAP) |
 		   (1ULL << DBM_BTREFC) |
+		   (1ULL << DBM_BTRTREFC) |
 		   (1ULL << DBM_SB);
 	while ((c = getopt(argc, argv, "0123n:o:s:t:x:y:z")) != EOF) {
 		switch (c) {
@@ -1562,7 +1576,8 @@ static void
 check_rdbmap(
 	xfs_rfsblock_t	bno,
 	xfs_extlen_t	len,
-	dbm_t		type)
+	dbm_t		type,
+	bool		ignore_reflink)
 {
 	xfs_extlen_t	i;
 	char		*p;
@@ -1574,6 +1589,9 @@ check_rdbmap(
 			error++;
 			break;
 		}
+		if (ignore_reflink && (*p == DBM_UNKNOWN || *p == DBM_RTDATA ||
+				       *p == DBM_RLRTDATA))
+			continue;
 		if ((dbm_t)*p != type) {
 			if (!sflag || CHECK_BLIST(bno + i))
 				dbprintf(_("rtblock %llu expected type %s got "
@@ -1600,6 +1618,8 @@ check_rinomap(
 			bno, bno + len - 1, c_ino);
 		return 0;
 	}
+	if (xfs_has_rtreflink(mp))
+		return 0;
 	for (i = 0, rval = 1, idp = &inomap[mp->m_sb.sb_agcount][bno];
 	     i < len;
 	     i++, idp++) {
@@ -1740,6 +1760,26 @@ check_set_dbmap(
 	}
 }
 
+/*
+ * We don't check the accuracy of reference counts -- all we do is ensure
+ * that a data block never crosses with non-data blocks.  repair can check
+ * those kinds of things.
+ *
+ * So with that in mind, if we're setting a block to be data or rldata,
+ * don't complain so long as the block is currently unknown, data, or rldata.
+ * Don't let blocks downgrade from rldata -> data.
+ */
+static bool
+is_rtreflink(
+	dbm_t		type2)
+{
+	if (!xfs_has_rtreflink(mp))
+		return false;
+	if (type2 == DBM_RTDATA || type2 == DBM_RLRTDATA)
+		return true;
+	return false;
+}
+
 static void
 check_set_rdbmap(
 	xfs_rfsblock_t	bno,
@@ -1753,7 +1793,7 @@ check_set_rdbmap(
 
 	if (!check_rrange(bno, len))
 		return;
-	check_rdbmap(bno, len, type1);
+	check_rdbmap(bno, len, type1, is_rtreflink(type2));
 	mayprint = verbose | blist_size;
 	for (i = 0, p = &dbmap[mp->m_sb.sb_agcount][bno]; i < len; i++, p++) {
 		if (!rdbmap_boundscheck(bno + i)) {
@@ -2873,7 +2913,7 @@ process_inode(
 		0				/* type 15 unused */
 	};
 	static char		*fmtnames[] = {
-		"dev", "local", "extents", "btree", "uuid", "rmap"
+		"dev", "local", "extents", "btree", "uuid", "rmap", "refcount"
 	};
 
 	ino = XFS_AGINO_TO_INO(mp, be32_to_cpu(agf->agf_seqno), agino);
@@ -2952,6 +2992,16 @@ process_inode(
 			error++;
 			return;
 		}
+	} else if (is_rtrefcount_inode(ino)) {
+		if (!S_ISREG(mode) || dip->di_format != XFS_DINODE_FMT_REFCOUNT) {
+			if (v)
+				dbprintf(
+			_("bad format %d for rtrefc inode %lld type %#o\n"),
+					dip->di_format, (long long)ino,
+					mode & S_IFMT);
+			error++;
+			return;
+		}
 	} else if ((((mode & S_IFMT) >> 12) > 15) ||
 	    (!(okfmts[(mode & S_IFMT) >> 12] & (1 << dip->di_format)))) {
 		if (v)
@@ -3028,6 +3078,9 @@ process_inode(
 		} else if (is_rtrmap_inode(id->ino)) {
 			type = DBM_BTRTRMAP;
 			blkmap = blkmap_alloc(be32_to_cpu(dip->di_nextents));
+		} else if (is_rtrefcount_inode(id->ino)) {
+			type = DBM_BTRTREFC;
+			blkmap = blkmap_alloc(be32_to_cpu(dip->di_nextents));
 		}
 		else
 			type = DBM_DATA;
@@ -3063,6 +3116,10 @@ process_inode(
 		id->rgno = rtgroup_for_rtrmap_ino(mp, id->ino);
 		process_rtrmap(id, dip, &totiblocks);
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		id->rgno = rtgroup_for_rtrefcount_ino(mp, id->ino);
+		process_rtrefc(id, dip, &totiblocks);
+		break;
 	}
 	if (dip->di_forkoff) {
 		sbversion |= XFS_SB_VERSION_ATTRBIT;
@@ -3089,6 +3146,7 @@ process_inode(
 		case DBM_RTSUM:
 		case DBM_SYMLINK:
 		case DBM_BTRTRMAP:
+		case DBM_BTRTREFC:
 		case DBM_UNKNOWN:
 			bc = totdblocks + totiblocks +
 			     atotdblocks + atotiblocks;
@@ -3916,8 +3974,7 @@ process_rtrmap(
 					 i, be32_to_cpu(rp[i].rm_startblock),
 					 be32_to_cpu(rp[i].rm_startblock));
 			} else {
-				lastblock = be32_to_cpu(rp[i].rm_startblock) +
-					    be32_to_cpu(rp[i].rm_blockcount);
+				lastblock = be32_to_cpu(rp[i].rm_startblock);
 			}
 		}
 		return;
@@ -3932,6 +3989,79 @@ process_rtrmap(
 	}
 }
 
+static void
+process_rtrefc(
+	struct inodata		*id,
+	struct xfs_dinode	*dip,
+	xfs_rfsblock_t		*toti)
+{
+	xfs_extnum_t		nex = 0;
+	xfs_rfsblock_t		totd = 0;
+	struct xfs_rtrefcount_root *dib;
+	int			whichfork = XFS_DATA_FORK;
+	int			i;
+	int			maxrecs;
+	xfs_rtrefcount_ptr_t	*pp;
+
+	if (id->rgno == NULLRGNUMBER) {
+		dbprintf(
+	_("rt group for refcount ino %lld not found\n"),
+				id->ino);
+		error++;
+		return;
+	}
+
+	dib = (struct xfs_rtrefcount_root *)XFS_DFORK_PTR(dip, whichfork);
+	if (be16_to_cpu(dib->bb_level) >= mp->m_rtrefc_maxlevels) {
+		if (!sflag || id->ilist)
+			dbprintf(_("level for ino %lld rtrefc root too "
+				 "large (%u)\n"),
+				id->ino,
+				be16_to_cpu(dib->bb_level));
+		error++;
+		return;
+	}
+	maxrecs = libxfs_rtrefcountbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, whichfork),
+			dib->bb_level == 0);
+	if (be16_to_cpu(dib->bb_numrecs) > maxrecs) {
+		if (!sflag || id->ilist)
+			dbprintf(_("numrecs for ino %lld rtrefc root too "
+				 "large (%u)\n"),
+				id->ino,
+				be16_to_cpu(dib->bb_numrecs));
+		error++;
+		return;
+	}
+	if (be16_to_cpu(dib->bb_level) == 0) {
+		struct xfs_refcount_rec	*rp;
+		xfs_fsblock_t		lastblock;
+
+		rp = xfs_rtrefcount_droot_rec_addr(dib, 1);
+		lastblock = 0;
+		for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++) {
+			if (be32_to_cpu(rp[i].rc_startblock) < lastblock) {
+				dbprintf(_(
+		"out-of-order rtrefc btree record %d (%u %u) root\n"),
+					 i, be32_to_cpu(rp[i].rc_startblock),
+					 be32_to_cpu(rp[i].rc_startblock));
+			} else {
+				lastblock = be32_to_cpu(rp[i].rc_startblock) +
+					    be32_to_cpu(rp[i].rc_blockcount);
+			}
+		}
+		return;
+	} else {
+		pp = xfs_rtrefcount_droot_ptr_addr(dib, 1, maxrecs);
+		for (i = 0; i < be16_to_cpu(dib->bb_numrecs); i++)
+			scan_lbtree(get_unaligned_be64(&pp[i]),
+					be16_to_cpu(dib->bb_level),
+					scanfunc_rtrefc, DBM_BTRTREFC,
+					id, &totd, toti,
+					&nex, NULL, 1, TYP_RTREFCBT);
+	}
+}
+
 static xfs_ino_t
 process_sf_dir_v2(
 	struct xfs_dinode	*dip,
@@ -5089,8 +5219,7 @@ scanfunc_rtrmap(
 					 be32_to_cpu(rp[i].rm_blockcount),
 					 agno, bno, lastblock);
 			} else {
-				lastblock = be32_to_cpu(rp[i].rm_startblock) +
-					    be32_to_cpu(rp[i].rm_blockcount);
+				lastblock = be32_to_cpu(rp[i].rm_startblock);
 			}
 		}
 		return;
@@ -5206,6 +5335,115 @@ scanfunc_refcnt(
 				TYP_REFCBT);
 }
 
+static void
+scanfunc_rtrefc(
+	struct xfs_btree_block	*block,
+	int			level,
+	dbm_t			type,
+	xfs_fsblock_t		bno,
+	inodata_t		*id,
+	xfs_rfsblock_t		*totd,
+	xfs_rfsblock_t		*toti,
+	xfs_extnum_t		*nex,
+	blkmap_t		**blkmapp,
+	int			isroot,
+	typnm_t			btype)
+{
+	xfs_agblock_t		agbno;
+	xfs_agnumber_t		agno;
+	int			i;
+	xfs_rtrefcount_ptr_t	*pp;
+	struct xfs_refcount_rec *rp;
+	xfs_rtblock_t		lastblock;
+
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	if (be32_to_cpu(block->bb_magic) != XFS_RTREFC_CRC_MAGIC) {
+		dbprintf(_("bad magic # %#x in rtrefcbt block %u/%u\n"),
+			be32_to_cpu(block->bb_magic), agno, agbno);
+		serious_error++;
+		return;
+	}
+	if (be16_to_cpu(block->bb_level) != level) {
+		if (!sflag)
+			dbprintf(_("expected level %d got %d in rtrefcntbt block "
+				 "%u/%u\n"),
+				level, be16_to_cpu(block->bb_level), agno, agbno);
+		error++;
+	}
+	set_dbmap(agno, agbno, 1, type, agno, agbno);
+	set_inomap(agno, agbno, 1, id);
+	(*toti)++;
+	if (level == 0) {
+		if (be16_to_cpu(block->bb_numrecs) > mp->m_rtrefc_mxr[0] ||
+		    (isroot == 0 && be16_to_cpu(block->bb_numrecs) < mp->m_rtrefc_mnr[0])) {
+			dbprintf(_("bad btree nrecs (%u, min=%u, max=%u) in "
+				 "rtrefcntbt block %u/%u\n"),
+				be16_to_cpu(block->bb_numrecs), mp->m_rtrefc_mnr[0],
+				mp->m_rtrefc_mxr[0], agno, agbno);
+			serious_error++;
+			return;
+		}
+		rp = xfs_rtrefcount_rec_addr(block, 1);
+		lastblock = 0;
+		for (i = 0; i < be16_to_cpu(block->bb_numrecs); i++) {
+			xfs_rtblock_t	rtbno;
+
+			if (be32_to_cpu(rp[i].rc_refcount) == 1) {
+				xfs_fsblock_t	bno;
+				char		*msg;
+
+				bno = be32_to_cpu(rp[i].rc_startblock);
+				if (bno & XFS_REFC_COWFLAG) {
+					bno &= ~XFS_REFC_COWFLAG;
+					msg = _(
+		"leftover rt CoW extent (%lu) len %u\n");
+				} else {
+					msg = _(
+		"leftover rt CoW extent at unexpected address (%lu) len %lu\n");
+				}
+				dbprintf(msg,
+					agbno,
+					be32_to_cpu(rp[i].rc_blockcount));
+				rtbno = xfs_rgbno_to_rtb(mp, id->rgno, bno);
+				set_rdbmap(rtbno,
+					be32_to_cpu(rp[i].rc_blockcount),
+					DBM_COWDATA);
+			} else {
+				rtbno = xfs_rgbno_to_rtb(mp, id->rgno,
+						be32_to_cpu(rp[i].rc_startblock));
+				set_rdbmap(rtbno,
+					   be32_to_cpu(rp[i].rc_blockcount),
+					   DBM_RLRTDATA);
+			}
+			if (be32_to_cpu(rp[i].rc_startblock) < lastblock) {
+				dbprintf(_(
+		"out-of-order rt refcnt btree record %d (%llu %llu) block %llu\n"),
+					 i, be32_to_cpu(rp[i].rc_startblock),
+					 be32_to_cpu(rp[i].rc_startblock),
+					 bno);
+			} else {
+				lastblock = be32_to_cpu(rp[i].rc_startblock) +
+					    be32_to_cpu(rp[i].rc_blockcount);
+			}
+		}
+		return;
+	}
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_rtrefc_mxr[1] ||
+	    (isroot == 0 && be16_to_cpu(block->bb_numrecs) < mp->m_rtrefc_mnr[1])) {
+		dbprintf(_("bad btree nrecs (%u, min=%u, max=%u) in rtrefcntbt "
+			 "block %u/%u\n"),
+			be16_to_cpu(block->bb_numrecs), mp->m_rtrefc_mnr[1],
+			mp->m_rtrefc_mxr[1], agno, agbno);
+		serious_error++;
+		return;
+	}
+	pp = xfs_rtrefcount_ptr_addr(block, 1, mp->m_rtrefc_mxr[1]);
+	for (i = 0; i < be16_to_cpu(block->bb_numrecs); i++)
+		scan_lbtree(be64_to_cpu(pp[i]), level, scanfunc_rtrefc,
+				type, id, totd, toti, nex, blkmapp, 0, btype);
+}
+
 static void
 set_dbmap(
 	xfs_agnumber_t	agno,
diff --git a/db/inode.c b/db/inode.c
index 760d47c0f04..4feff172cbb 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -642,6 +642,7 @@ inode_init(void)
 
 struct rtgroup_inodes {
 	xfs_ino_t		rmap_ino;
+	xfs_ino_t		refcount_ino;
 };
 
 static struct rtgroup_inodes	*rtgroup_inodes;
@@ -722,6 +723,11 @@ set_rtgroup_refcount_inode(
 	}
 
 	error = bitmap_set(refcount_inodes, rtino, 1);
+	if (error)
+		goto out_trans;
+
+	rtgroup_inodes[rgno].refcount_ino = rtino;
+
 out_trans:
 	libxfs_trans_cancel(tp);
 out_path:
@@ -786,6 +792,18 @@ bool is_rtrefcount_inode(xfs_ino_t ino)
 	return bitmap_test(refcount_inodes, ino, 1);
 }
 
+xfs_rgnumber_t rtgroup_for_rtrefcount_ino(struct xfs_mount *mp, xfs_ino_t ino)
+{
+	unsigned int i;
+
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++) {
+		if (rtgroup_inodes[i].refcount_ino == ino)
+			return i;
+	}
+
+	return NULLRGNUMBER;
+}
+
 typnm_t
 inode_next_type(void)
 {
diff --git a/db/inode.h b/db/inode.h
index 666bb5201ea..c789017e0c8 100644
--- a/db/inode.h
+++ b/db/inode.h
@@ -28,3 +28,4 @@ int init_rtmeta_inode_bitmaps(struct xfs_mount *mp);
 bool is_rtrmap_inode(xfs_ino_t ino);
 xfs_rgnumber_t rtgroup_for_rtrmap_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool is_rtrefcount_inode(xfs_ino_t ino);
+xfs_rgnumber_t rtgroup_for_rtrefcount_ino(struct xfs_mount *mp, xfs_ino_t ino);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/42] xfs_db: copy the realtime refcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-27 13:33   ` [PATCH 27/42] xfs_db: support rudimentary checks of the rtrefcount btree Darrick J. Wong
@ 2023-12-27 13:33   ` Darrick J. Wong
  2023-12-27 13:33   ` [PATCH 29/42] xfs_spaceman: report health of " Darrick J. Wong
                     ` (13 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Copy the realtime refcountbt when we're metadumping the filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/metadump.c |  129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)


diff --git a/db/metadump.c b/db/metadump.c
index c6f6bf1ea17..6a33335a15b 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -710,6 +710,55 @@ copy_refcount_btree(
 	return scan_btree(agno, root, levels, TYP_REFCBT, agf, scanfunc_refcntbt);
 }
 
+static int
+scanfunc_rtrefcbt(
+	struct xfs_btree_block	*block,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	int			level,
+	typnm_t			btype,
+	void			*arg)
+{
+	xfs_rtrefcount_ptr_t	*pp;
+	int			i;
+	int			numrecs;
+
+	if (level == 0)
+		return 1;
+
+	numrecs = be16_to_cpu(block->bb_numrecs);
+	if (numrecs > mp->m_rtrefc_mxr[1]) {
+		if (metadump.show_warnings)
+			print_warning("invalid numrecs (%u) in %s block %u/%u",
+				numrecs, typtab[btype].name, agno, agbno);
+		return 1;
+	}
+
+	pp = xfs_rtrefcount_ptr_addr(block, 1, mp->m_rtrefc_mxr[1]);
+	for (i = 0; i < numrecs; i++) {
+		xfs_agnumber_t	pagno;
+		xfs_agblock_t	pbno;
+
+		pagno = XFS_FSB_TO_AGNO(mp, get_unaligned_be64(&pp[i]));
+		pbno = XFS_FSB_TO_AGBNO(mp, get_unaligned_be64(&pp[i]));
+
+		if (pbno == 0 || pbno > mp->m_sb.sb_agblocks ||
+		    pagno > mp->m_sb.sb_agcount) {
+			if (metadump.show_warnings)
+				print_warning("invalid block number (%u/%u) "
+						"in inode %llu %s block %u/%u",
+						pagno, pbno,
+						(unsigned long long)metadump.cur_ino,
+						typtab[btype].name, agno, agbno);
+			continue;
+		}
+		if (!scan_btree(pagno, pbno, level, btype, arg,
+				scanfunc_rtrefcbt))
+			return 0;
+	}
+	return 1;
+}
+
 /* filename and extended attribute obfuscation routines */
 
 struct name_ent {
@@ -2445,6 +2494,83 @@ process_rtrmap(
 	return 1;
 }
 
+static int
+process_rtrefc(
+	struct xfs_dinode	*dip,
+	typnm_t			itype)
+{
+	struct xfs_rtrefcount_root	*dib;
+	int			i;
+	xfs_rtrefcount_ptr_t	*pp;
+	int			level;
+	int			nrecs;
+	int			maxrecs;
+	int			whichfork;
+	typnm_t			btype;
+
+	if (itype == TYP_ATTR && metadump.show_warnings) {
+		print_warning("ignoring rtrefcbt root in inode %llu attr fork",
+				(unsigned long long)metadump.cur_ino);
+		return 1;
+	}
+
+	whichfork = XFS_DATA_FORK;
+	btype = TYP_RTREFCBT;
+
+	dib = (struct xfs_rtrefcount_root *)XFS_DFORK_PTR(dip, whichfork);
+	level = be16_to_cpu(dib->bb_level);
+	nrecs = be16_to_cpu(dib->bb_numrecs);
+
+	if (level > mp->m_rtrefc_maxlevels) {
+		if (metadump.show_warnings)
+			print_warning("invalid level (%u) in inode %lld %s "
+					"root", level,
+					(unsigned long long)metadump.cur_ino,
+					typtab[btype].name);
+		return 1;
+	}
+
+	if (level == 0)
+		return 1;
+
+	maxrecs = libxfs_rtrefcountbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, whichfork),
+			false);
+	if (nrecs > maxrecs) {
+		if (metadump.show_warnings)
+			print_warning("invalid numrecs (%u) in inode %lld %s "
+					"root", nrecs,
+					(unsigned long long)metadump.cur_ino,
+					typtab[btype].name);
+		return 1;
+	}
+
+	pp = xfs_rtrefcount_droot_ptr_addr(dib, 1, maxrecs);
+	for (i = 0; i < nrecs; i++) {
+		xfs_agnumber_t	ag;
+		xfs_agblock_t	bno;
+
+		ag = XFS_FSB_TO_AGNO(mp, get_unaligned_be64(&pp[i]));
+		bno = XFS_FSB_TO_AGBNO(mp, get_unaligned_be64(&pp[i]));
+
+		if (bno == 0 || bno > mp->m_sb.sb_agblocks ||
+				ag > mp->m_sb.sb_agcount) {
+			if (metadump.show_warnings)
+				print_warning("invalid block number (%u/%u) "
+						"in inode %llu %s root", ag,
+						bno,
+						(unsigned long long)metadump.cur_ino,
+						typtab[btype].name);
+			continue;
+		}
+
+		if (!scan_btree(ag, bno, level, btype, &itype,
+				scanfunc_rtrefcbt))
+			return 0;
+	}
+	return 1;
+}
+
 static int
 process_inode_data(
 	struct xfs_dinode	*dip,
@@ -2492,6 +2618,9 @@ process_inode_data(
 
 		case XFS_DINODE_FMT_RMAP:
 			return process_rtrmap(dip, itype);
+
+		case XFS_DINODE_FMT_REFCOUNT:
+			return process_rtrefc(dip, itype);
 	}
 	return 1;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/42] xfs_spaceman: report health of the realtime refcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-27 13:33   ` [PATCH 28/42] xfs_db: copy the realtime refcount btree Darrick J. Wong
@ 2023-12-27 13:33   ` Darrick J. Wong
  2023-12-27 13:33   ` [PATCH 30/42] xfs_repair: allow CoW staging extents in the realtime rmap records Darrick J. Wong
                     ` (12 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report the health of the realtime reference count btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 spaceman/health.c |   10 ++++++++++
 1 file changed, 10 insertions(+)


diff --git a/spaceman/health.c b/spaceman/health.c
index fac81f4eda5..ce906967d47 100644
--- a/spaceman/health.c
+++ b/spaceman/health.c
@@ -46,6 +46,11 @@ static bool has_rtrmapbt(const struct xfs_fsop_geom *g)
 	return g->rtblocks > 0 && (g->flags & XFS_FSOP_GEOM_FLAGS_RMAPBT);
 }
 
+static bool has_rtreflink(const struct xfs_fsop_geom *g)
+{
+	return g->rtblocks > 0 && (g->flags & XFS_FSOP_GEOM_FLAGS_REFLINK);
+}
+
 struct flag_map {
 	unsigned int		mask;
 	bool			(*has_fn)(const struct xfs_fsop_geom *g);
@@ -155,6 +160,11 @@ static const struct flag_map rtgroup_flags[] = {
 		.descr = "realtime reverse mappings btree",
 		.has_fn = has_rtrmapbt,
 	},
+	{
+		.mask = XFS_RTGROUP_GEOM_SICK_REFCNTBT,
+		.descr = "realtime reference count btree",
+		.has_fn = has_rtreflink,
+	},
 	{0},
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/42] xfs_repair: allow CoW staging extents in the realtime rmap records
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-27 13:33   ` [PATCH 29/42] xfs_spaceman: report health of " Darrick J. Wong
@ 2023-12-27 13:33   ` Darrick J. Wong
  2023-12-27 13:34   ` [PATCH 31/42] xfs_repair: use realtime refcount btree data to check block types Darrick J. Wong
                     ` (11 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Don't flag the rt rmap btree as having errors if there are CoW staging
extent records in it and the filesystem supports.  As far as reporting
leftover staging extents, we'll report them when we scan the rt refcount
btree, in a future patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/scan.c |   17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)


diff --git a/repair/scan.c b/repair/scan.c
index ba634af8bb1..f04afff60ef 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1416,9 +1416,20 @@ _("invalid length %llu in record %u of %s\n"),
 			continue;
 		}
 
-		/* We only store file data and superblocks in the rtrmap. */
-		if (XFS_RMAP_NON_INODE_OWNER(owner) &&
-		    owner != XFS_RMAP_OWN_FS) {
+		/*
+		 * We only store file data, COW data, and superblocks in the
+		 * rtrmap.
+		 */
+		if (owner == XFS_RMAP_OWN_COW) {
+			if (!xfs_has_reflink(mp)) {
+				do_warn(
+_("invalid CoW staging extent in record %u of %s\n"),
+						i, name);
+				suspect++;
+				continue;
+			}
+		} else if (XFS_RMAP_NON_INODE_OWNER(owner) &&
+			   owner != XFS_RMAP_OWN_FS) {
 			do_warn(
 _("invalid owner %lld in record %u of %s\n"),
 				(long long int)owner, i, name);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/42] xfs_repair: use realtime refcount btree data to check block types
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-27 13:33   ` [PATCH 30/42] xfs_repair: allow CoW staging extents in the realtime rmap records Darrick J. Wong
@ 2023-12-27 13:34   ` Darrick J. Wong
  2023-12-27 13:34   ` [PATCH 32/42] xfs_repair: find and mark the rtrefcountbt inode Darrick J. Wong
                     ` (10 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the realtime refcount btree to pre-populate the block type information
so that when repair iterates the primary metadata, we can confirm the
block type.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |  152 ++++++++++++++++++++++++++++
 repair/rmap.c   |    9 ++
 repair/rmap.h   |    3 +
 repair/scan.c   |  299 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 repair/scan.h   |   33 ++++++
 5 files changed, 490 insertions(+), 6 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index a6713a7bc6b..f1cb119df8b 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -274,6 +274,8 @@ _("bad state in rt extent map %" PRIu64 "\n"),
 			break;
 		case XR_E_INUSE:
 		case XR_E_MULT:
+			if (xfs_has_rtreflink(mp))
+				break;
 			set_rtbmap(ext, XR_E_MULT);
 			break;
 		case XR_E_FREE1:
@@ -348,6 +350,8 @@ _("data fork in rt inode %" PRIu64 " found rt metadata extent %" PRIu64 " in rt
 			return 1;
 		case XR_E_INUSE:
 		case XR_E_MULT:
+			if (xfs_has_rtreflink(mp))
+				break;
 			do_warn(
 _("data fork in rt inode %" PRIu64 " claims used rt extent %" PRIu64 "\n"),
 				ino, b);
@@ -1012,6 +1016,148 @@ _("bad rtrmap btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
 	return suspect ? 1 : 0;
 }
 
+/*
+ * return 1 if inode should be cleared, 0 otherwise
+ */
+static int
+process_rtrefc(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	xfs_agino_t			ino,
+	struct xfs_dinode		*dip,
+	int				type,
+	int				*dirty,
+	xfs_rfsblock_t			*tot,
+	uint64_t			*nex,
+	blkmap_t			**blkmapp,
+	int				check_dups)
+{
+	struct refc_priv		priv = { .nr_blocks = 0 };
+	struct xfs_rtrefcount_root	*dib;
+	xfs_rtrefcount_ptr_t		*pp;
+	struct xfs_refcount_key		*kp;
+	struct xfs_refcount_rec		*rp;
+	char				*forkname = get_forkname(XFS_DATA_FORK);
+	xfs_rgblock_t			oldkey, key;
+	xfs_ino_t			lino;
+	xfs_fsblock_t			bno;
+	size_t				droot_sz;
+	int				i;
+	int				level;
+	int				numrecs;
+	int				dmxr;
+	int				suspect = 0;
+	int				error;
+
+	/* We rebuild the rtrefcountbt, so no need to process blocks again. */
+	if (check_dups) {
+		*tot = be64_to_cpu(dip->di_nblocks);
+		return 0;
+	}
+
+	lino = XFS_AGINO_TO_INO(mp, agno, ino);
+
+	/*
+	 * This refcount btree inode must be a metadata inode reachable via
+	 * /realtime/$rgno.refcount in the metadata directory tree.
+	 */
+	if (!(dip->di_flags2 & be64_to_cpu(XFS_DIFLAG2_METADIR))) {
+		do_warn(
+_("rtrefcount inode %" PRIu64 " not flagged as metadata\n"),
+			lino);
+		return 1;
+	}
+
+	priv.rgno = rtgroup_for_rtrefcount_inode(mp, ino);
+	if (priv.rgno == NULLRGNUMBER) {
+		do_warn(
+_("could not associate refcount inode %" PRIu64 " with any rtgroup\n"),
+			lino);
+		return 1;
+	}
+
+	dib = (struct xfs_rtrefcount_root *)XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	*tot = 0;
+	*nex = 0;
+
+	level = be16_to_cpu(dib->bb_level);
+	numrecs = be16_to_cpu(dib->bb_numrecs);
+
+	if (level > mp->m_rtrefc_maxlevels) {
+		do_warn(
+_("bad level %d in inode %" PRIu64 " rtrefcount btree root block\n"),
+			level, lino);
+		return 1;
+	}
+
+	/*
+	 * use rtroot/dfork_dsize since the root block is in the data fork
+	 */
+	droot_sz = xfs_rtrefcount_droot_space_calc(level, numrecs);
+	if (droot_sz > XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK)) {
+		do_warn(
+_("computed size of rtrefcountbt root (%zu bytes) is greater than space in "
+	  "inode %" PRIu64 " %s fork\n"),
+				droot_sz, lino, forkname);
+		return 1;
+	}
+
+	if (level == 0) {
+		rp = xfs_rtrefcount_droot_rec_addr(dib, 1);
+		error = process_rtrefc_reclist(mp, rp, numrecs,
+				&priv, "rtrefcountbt root");
+		if (error) {
+			refcount_avoid_check();
+			return 1;
+		}
+		return 0;
+	}
+
+	dmxr = libxfs_rtrefcountbt_droot_maxrecs(
+			XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK), false);
+	pp = xfs_rtrefcount_droot_ptr_addr(dib, 1, dmxr);
+
+	/* check for in-order keys */
+	for (i = 0; i < numrecs; i++)  {
+		kp = xfs_rtrefcount_droot_key_addr(dib, i + 1);
+
+		key = be32_to_cpu(kp->rc_startblock);
+		if (i == 0) {
+			oldkey = key;
+			continue;
+		}
+		if (key < oldkey) {
+			do_warn(
+_("out of order key %u in rtrefcount root ino %" PRIu64 "\n"),
+				i, lino);
+			suspect++;
+			continue;
+		}
+		oldkey = key;
+	}
+
+	/* probe keys */
+	for (i = 0; i < numrecs; i++)  {
+		bno = get_unaligned_be64(&pp[i]);
+
+		if (!libxfs_verify_fsbno(mp, bno))  {
+			do_warn(
+_("bad rtrefcount btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
+				bno, lino);
+			return 1;
+		}
+
+		if (scan_lbtree(bno, level, scan_rtrefcbt,
+				type, XFS_DATA_FORK, lino, tot, nex, blkmapp,
+				NULL, 0, 1, check_dups, XFS_RTREFC_CRC_MAGIC,
+				&priv, &xfs_rtrefcountbt_buf_ops))
+			return 1;
+	}
+
+	*tot = priv.nr_blocks;
+	return suspect ? 1 : 0;
+}
+
 /*
  * return 1 if inode should be cleared, 0 otherwise
  */
@@ -1807,6 +1953,7 @@ check_dinode_mode_format(
 		case XFS_DINODE_FMT_RMAP:
 		case XFS_DINODE_FMT_EXTENTS:
 		case XFS_DINODE_FMT_BTREE:
+		case XFS_DINODE_FMT_REFCOUNT:
 			return 0;
 		}
 		return -1;
@@ -2244,6 +2391,10 @@ process_inode_data_fork(
 		err = process_rtrmap(mp, agno, ino, dino, type, dirty,
 				totblocks, nextents, dblkmap, check_dups);
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		err = process_rtrefc(mp, agno, ino, dino, type, dirty,
+			totblocks, nextents, dblkmap, check_dups);
+		break;
 	case XFS_DINODE_FMT_DEV:
 		err = 0;
 		break;
@@ -2304,6 +2455,7 @@ _("would have tried to rebuild inode %"PRIu64" data fork\n"),
 			break;
 		case XFS_DINODE_FMT_DEV:
 		case XFS_DINODE_FMT_RMAP:
+		case XFS_DINODE_FMT_REFCOUNT:
 			err = 0;
 			break;
 		default:
diff --git a/repair/rmap.c b/repair/rmap.c
index 6fd537f533f..b27e3079155 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -258,6 +258,15 @@ bool is_rtrmap_inode(xfs_ino_t ino)
 	return bitmap_test(rmap_inodes, ino, 1);
 }
 
+xfs_rgnumber_t
+rtgroup_for_rtrefcount_inode(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	/* This will be implemented later. */
+	return NULLRGNUMBER;
+}
+
 /*
  * Initialize per-AG reverse map data.
  */
diff --git a/repair/rmap.h b/repair/rmap.h
index 1f99606a455..051481d2e2d 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -71,4 +71,7 @@ int populate_rtgroup_rmapbt(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
 		xfs_filblks_t fdblocks);
 xfs_filblks_t estimate_rtrmapbt_blocks(struct xfs_rtgroup *rtg);
 
+xfs_rgnumber_t rtgroup_for_rtrefcount_inode(struct xfs_mount *mp,
+		xfs_ino_t ino);
+
 #endif /* RMAP_H_ */
diff --git a/repair/scan.c b/repair/scan.c
index f04afff60ef..abf605e4978 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1754,12 +1754,6 @@ _("bad %s btree ptr 0x%llx in ino %" PRIu64 "\n"),
 	return 0;
 }
 
-struct refc_priv {
-	struct xfs_refcount_irec	last_rec;
-	xfs_agblock_t			nr_blocks;
-};
-
-
 static void
 scan_refcbt(
 	struct xfs_btree_block	*block,
@@ -1997,6 +1991,299 @@ _("extent (%u/%u) len %u claimed, state is %d\n"),
 	return;
 }
 
+
+int
+process_rtrefc_reclist(
+	struct xfs_mount	*mp,
+	struct xfs_refcount_rec	*rp,
+	int			numrecs,
+	struct refc_priv	*refc_priv,
+	const char		*name)
+{
+	xfs_rtblock_t		lastblock = 0;
+	xfs_rtblock_t		rtbno, next_rtbno;
+	int			state;
+	int			suspect = 0;
+	int			i;
+
+	for (i = 0; i < numrecs; i++) {
+		enum xfs_refc_domain	domain;
+		xfs_rgblock_t		b, rgbno, end;
+		xfs_extlen_t		len;
+		xfs_nlink_t		nr;
+
+		b = rgbno = be32_to_cpu(rp[i].rc_startblock);
+		len = be32_to_cpu(rp[i].rc_blockcount);
+		nr = be32_to_cpu(rp[i].rc_refcount);
+
+		if (b & XFS_REFC_COWFLAG) {
+			domain = XFS_REFC_DOMAIN_COW;
+			rgbno &= ~XFS_REFC_COWFLAG;
+		} else {
+			domain = XFS_REFC_DOMAIN_SHARED;
+		}
+
+		if (domain == XFS_REFC_DOMAIN_COW && nr != 1) {
+			do_warn(
+_("leftover rt CoW extent has incorrect refcount in record %u of %s\n"),
+					i, name);
+			suspect++;
+		}
+		if (nr == 1) {
+			if (domain != XFS_REFC_DOMAIN_COW) {
+				do_warn(
+_("leftover rt CoW extent has invalid startblock in record %u of %s\n"),
+					i, name);
+				suspect++;
+			}
+		}
+		end = rgbno + len;
+
+		rtbno = xfs_rgbno_to_rtb(mp, refc_priv->rgno, rgbno);
+		if (!libxfs_verify_rtbno(mp, rtbno)) {
+			do_warn(
+_("invalid start block %llu in record %u of %s\n"),
+					(unsigned long long)b, i, name);
+			suspect++;
+			continue;
+		}
+
+		next_rtbno = xfs_rgbno_to_rtb(mp, refc_priv->rgno, end);
+		if (len == 0 || end <= rgbno ||
+		    !libxfs_verify_rtbno(mp, next_rtbno - 1)) {
+			do_warn(
+_("invalid length %llu in record %u of %s\n"),
+					(unsigned long long)len, i, name);
+			suspect++;
+			continue;
+		}
+
+		if (nr == 1) {
+			xfs_rtxnum_t	rtx, next_rtx;
+
+			rtx = xfs_rtb_to_rtx(mp, rtbno);
+			next_rtx = xfs_rtb_to_rtx(mp, next_rtbno);
+			for (; rtx < next_rtx; rtx++) {
+				state = get_rtbmap(rtx);
+				switch (state) {
+				case XR_E_UNKNOWN:
+				case XR_E_COW:
+					do_warn(
+_("leftover CoW rtextent (%llu)\n"),
+						(unsigned long long)rtx);
+					suspect++;
+					set_rtbmap(rtx, XR_E_FREE);
+					break;
+				default:
+					do_warn(
+_("rtextent (%llu) claimed, state is %d\n"),
+						(unsigned long long)rtx, state);
+					suspect++;
+					break;
+				}
+			}
+		} else if (nr < 2 || nr > XFS_REFC_REFCOUNT_MAX) {
+			do_warn(
+_("invalid rt reference count %u in record %u of %s\n"),
+					nr, i, name);
+			suspect++;
+			continue;
+		}
+
+		if (b && b <= lastblock) {
+			do_warn(_(
+"out-of-order %s btree record %d (%llu %llu) in %s\n"),
+					name, i, (unsigned long long)b,
+					(unsigned long long)len, name);
+			suspect++;
+		} else {
+			lastblock = end - 1;
+		}
+
+		/* Is this record mergeable with the last one? */
+		if (refc_priv->last_rec.rc_domain == domain &&
+		    refc_priv->last_rec.rc_startblock +
+		    refc_priv->last_rec.rc_blockcount == rgbno &&
+		    refc_priv->last_rec.rc_refcount == nr) {
+			do_warn(
+_("record %d of %s tree should be merged with previous record\n"),
+					i, name);
+			suspect++;
+			refc_priv->last_rec.rc_blockcount += len;
+		} else {
+			refc_priv->last_rec.rc_domain = domain;
+			refc_priv->last_rec.rc_startblock = rgbno;
+			refc_priv->last_rec.rc_blockcount = len;
+			refc_priv->last_rec.rc_refcount = nr;
+		}
+
+		/* XXX: probably want to mark the reflinked areas? */
+	}
+
+	return suspect;
+}
+
+int
+scan_rtrefcbt(
+	struct xfs_btree_block		*block,
+	int				level,
+	int				type,
+	int				whichfork,
+	xfs_fsblock_t			fsbno,
+	xfs_ino_t			ino,
+	xfs_rfsblock_t			*tot,
+	uint64_t			*nex,
+	struct blkmap			**blkmapp,
+	bmap_cursor_t			*bm_cursor,
+	int				suspect,
+	int				isroot,
+	int				check_dups,
+	int				*dirty,
+	uint64_t			magic,
+	void				*priv)
+{
+	const char			*name = "rtrefcount";
+	char				rootname[256];
+	int				i;
+	xfs_rtrefcount_ptr_t		*pp;
+	struct xfs_refcount_rec	*rp;
+	struct refc_priv		*refc_priv = priv;
+	int				hdr_errors = 0;
+	int				numrecs;
+	int				state;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	int				error;
+
+	agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+
+	if (magic != XFS_RTREFC_CRC_MAGIC) {
+		name = "(unknown)";
+		hdr_errors++;
+		suspect++;
+		goto out;
+	}
+
+	if (be32_to_cpu(block->bb_magic) != magic) {
+		do_warn(_("bad magic # %#x in %s btree block %d/%d\n"),
+				be32_to_cpu(block->bb_magic), name, agno,
+				agbno);
+		hdr_errors++;
+		if (suspect)
+			goto out;
+	}
+
+	if (be16_to_cpu(block->bb_level) != level) {
+		do_warn(_("expected level %d got %d in %s btree block %d/%d\n"),
+				level, be16_to_cpu(block->bb_level), name,
+				agno, agbno);
+		hdr_errors++;
+		if (suspect)
+			goto out;
+	}
+
+	refc_priv->nr_blocks++;
+
+	/*
+	 * Check for btree blocks multiply claimed.  We're going to regenerate
+	 * the btree anyway, so mark the blocks as metadata so they get freed.
+	 */
+	state = get_bmap(agno, agbno);
+	if (!(state == XR_E_UNKNOWN || state == XR_E_INUSE1))  {
+		do_warn(
+_("%s btree block claimed (state %d), agno %d, agbno %d, suspect %d\n"),
+				name, state, agno, agbno, suspect);
+		goto out;
+	}
+	set_bmap(agno, agbno, XR_E_METADATA);
+
+	numrecs = be16_to_cpu(block->bb_numrecs);
+	if (level == 0) {
+		if (numrecs > mp->m_rtrefc_mxr[0])  {
+			numrecs = mp->m_rtrefc_mxr[0];
+			hdr_errors++;
+		}
+		if (isroot == 0 && numrecs < mp->m_rtrefc_mnr[0])  {
+			numrecs = mp->m_rtrefc_mnr[0];
+			hdr_errors++;
+		}
+
+		if (hdr_errors) {
+			do_warn(
+	_("bad btree nrecs (%u, min=%u, max=%u) in %s btree block %u/%u\n"),
+					be16_to_cpu(block->bb_numrecs),
+					mp->m_rtrefc_mnr[0],
+					mp->m_rtrefc_mxr[0], name, agno, agbno);
+			suspect++;
+		}
+
+		rp = xfs_rtrefcount_rec_addr(block, 1);
+		snprintf(rootname, 256, "%s btree block %u/%u", name, agno,
+				agbno);
+		error = process_rtrefc_reclist(mp, rp, numrecs, refc_priv,
+				rootname);
+		if (error)
+			suspect++;
+		goto out;
+	}
+
+	/*
+	 * interior record
+	 */
+	pp = xfs_rtrefcount_ptr_addr(block, 1, mp->m_rtrefc_mxr[1]);
+
+	if (numrecs > mp->m_rtrefc_mxr[1])  {
+		numrecs = mp->m_rtrefc_mxr[1];
+		hdr_errors++;
+	}
+	if (isroot == 0 && numrecs < mp->m_rtrefc_mnr[1])  {
+		numrecs = mp->m_rtrefc_mnr[1];
+		hdr_errors++;
+	}
+
+	/*
+	 * don't pass bogus tree flag down further if this block
+	 * looked ok.  bail out if two levels in a row look bad.
+	 */
+	if (hdr_errors)  {
+		do_warn(
+	_("bad btree nrecs (%u, min=%u, max=%u) in %s btree block %u/%u\n"),
+				be16_to_cpu(block->bb_numrecs),
+				mp->m_rtrefc_mnr[1], mp->m_rtrefc_mxr[1], name,
+				agno, agbno);
+		if (suspect)
+			goto out;
+		suspect++;
+	} else if (suspect) {
+		suspect = 0;
+	}
+
+	for (i = 0; i < numrecs; i++)  {
+		xfs_fsblock_t		pbno = be64_to_cpu(pp[i]);
+
+		if (!libxfs_verify_fsbno(mp, pbno)) {
+			do_warn(
+	_("bad btree pointer (%u) in %sbt block %u/%u\n"),
+					agbno, name, agno, agbno);
+			suspect++;
+			return 0;
+		}
+
+		scan_lbtree(pbno, level, scan_rtrefcbt, type, whichfork, ino,
+				tot, nex, blkmapp, bm_cursor, suspect, 0,
+				check_dups, magic, refc_priv,
+				&xfs_rtrefcountbt_buf_ops);
+	}
+out:
+	if (suspect) {
+		refcount_avoid_check();
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * The following helpers are to help process and validate individual on-disk
  * inode btree records. We have two possible inode btrees with slightly
diff --git a/repair/scan.h b/repair/scan.h
index a624c882734..1643a2397ae 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -100,4 +100,37 @@ int scan_rtrmapbt(
 	uint64_t		magic,
 	void			*priv);
 
+struct refc_priv {
+	struct xfs_refcount_irec	last_rec;
+	xfs_agblock_t			nr_blocks;
+	xfs_rgnumber_t			rgno;
+};
+
+int
+process_rtrefc_reclist(
+	struct xfs_mount	*mp,
+	struct xfs_refcount_rec	*rp,
+	int			numrecs,
+	struct refc_priv	*refc_priv,
+	const char		*name);
+
+int
+scan_rtrefcbt(
+	struct xfs_btree_block	*block,
+	int			level,
+	int			type,
+	int			whichfork,
+	xfs_fsblock_t		bno,
+	xfs_ino_t		ino,
+	xfs_rfsblock_t		*tot,
+	uint64_t		*nex,
+	struct blkmap		**blkmapp,
+	bmap_cursor_t		*bm_cursor,
+	int			suspect,
+	int			isroot,
+	int			check_dups,
+	int			*dirty,
+	uint64_t		magic,
+	void			*priv);
+
 #endif /* _XR_SCAN_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/42] xfs_repair: find and mark the rtrefcountbt inode
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-27 13:34   ` [PATCH 31/42] xfs_repair: use realtime refcount btree data to check block types Darrick J. Wong
@ 2023-12-27 13:34   ` Darrick J. Wong
  2023-12-27 13:34   ` [PATCH 33/42] xfs_repair: compute refcount data for the realtime groups Darrick J. Wong
                     ` (9 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that we find the realtime refcountbt inode and mark it
appropriately, just in case we find a rogue inode claiming to
be an rtrefcount, or just plain garbage in the superblock field.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dino_chunks.c |   11 ++++++
 repair/dinode.c      |   30 +++++++++++++++
 repair/dir2.c        |    2 +
 repair/incore.h      |    1 +
 repair/rmap.c        |   98 +++++++++++++++++++++++++++++++++++++++++++++++++-
 repair/rmap.h        |    3 +-
 repair/scan.c        |    8 ++--
 7 files changed, 145 insertions(+), 8 deletions(-)


diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index b7a5879bf4b..04898643883 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -1025,6 +1025,17 @@ process_inode_chunk(
 	_("would clear realtime rmap inode %" PRIu64 "\n"),
 						ino);
 				}
+			} else if (is_rtrefcount_ino(ino)) {
+				refcount_avoid_check(mp);
+				if (!no_modify)  {
+					do_warn(
+	_("cleared realtime refcount inode %" PRIu64 "\n"),
+						ino);
+				} else  {
+					do_warn(
+	_("would clear realtime refcount inode %" PRIu64 "\n"),
+						ino);
+				}
 			} else if (!no_modify)  {
 				do_warn(_("cleared inode %" PRIu64 "\n"),
 					ino);
diff --git a/repair/dinode.c b/repair/dinode.c
index f1cb119df8b..ad4263f9aa8 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -156,6 +156,9 @@ clear_dinode(xfs_mount_t *mp, struct xfs_dinode *dino, xfs_ino_t ino_num)
 	if (is_rtrmap_inode(ino_num))
 		rmap_avoid_check(mp);
 
+	if (is_rtrefcount_ino(ino_num))
+		refcount_avoid_check(mp);
+
 	/* and clear the forks */
 	memset(XFS_DFORK_DPTR(dino), 0, XFS_LITINO(mp));
 	return;
@@ -1067,6 +1070,12 @@ _("rtrefcount inode %" PRIu64 " not flagged as metadata\n"),
 			lino);
 		return 1;
 	}
+	if (type != XR_INO_RTREFC) {
+		do_warn(
+_("rtrefcount inode %" PRIu64 " was not found in the metadata directory tree\n"),
+			lino);
+		return 1;
+	}
 
 	priv.rgno = rtgroup_for_rtrefcount_inode(mp, ino);
 	if (priv.rgno == NULLRGNUMBER) {
@@ -1107,7 +1116,7 @@ _("computed size of rtrefcountbt root (%zu bytes) is greater than space in "
 		error = process_rtrefc_reclist(mp, rp, numrecs,
 				&priv, "rtrefcountbt root");
 		if (error) {
-			refcount_avoid_check();
+			refcount_avoid_check(mp);
 			return 1;
 		}
 		return 0;
@@ -2063,6 +2072,9 @@ process_check_sb_inodes(
 	if (is_rtrmap_inode(lino))
 		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
 				XR_INO_RTRMAP, _("realtime rmap btree"));
+	if (is_rtrefcount_ino(lino))
+		return process_check_rt_inode(mp, dinoc, lino, type, dirty,
+				XR_INO_RTREFC, _("realtime refcount btree"));
 	return 0;
 }
 
@@ -2172,6 +2184,18 @@ _("found inode %" PRIu64 " claiming to be a rtrmapbt file, but rmapbt is disable
 		}
 		break;
 
+	case XR_INO_RTREFC:
+		/*
+		 * if we have no refcountbt, any inode claiming
+		 * to be a real-time file is bogus
+		 */
+		if (!xfs_has_reflink(mp)) {
+			do_warn(
+_("found inode %" PRIu64 " claiming to be a rtrefcountbt file, but reflink is disabled\n"), lino);
+			return 1;
+		}
+		break;
+
 	default:
 		break;
 	}
@@ -2201,6 +2225,7 @@ _("bad attr fork offset %d in dev inode %" PRIu64 ", should be %d\n"),
 		}
 		break;
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp))) {
 			do_warn(
 _("metadata inode %" PRIu64 " type %d cannot have attr fork\n"),
@@ -3311,6 +3336,8 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 			type = XR_INO_PQUOTA;
 		else if (is_rtrmap_inode(lino))
 			type = XR_INO_RTRMAP;
+		else if (is_rtrefcount_ino(lino))
+			type = XR_INO_RTREFC;
 		else
 			type = XR_INO_DATA;
 		break;
@@ -3417,6 +3444,7 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 		case XR_INO_GQUOTA:
 		case XR_INO_PQUOTA:
 		case XR_INO_RTRMAP:
+		case XR_INO_RTREFC:
 			/*
 			 * This inode was recognized as being filesystem
 			 * metadata, so preserve the inode and its contents for
diff --git a/repair/dir2.c b/repair/dir2.c
index 43229b3cd9b..9fd9569ec9f 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -158,6 +158,8 @@ is_meta_ino(
 		reason = _("realtime summary");
 	else if (is_rtrmap_inode(lino))
 		reason = _("realtime rmap");
+	else if (is_rtrefcount_ino(lino))
+		reason = _("realtime refcount");
 	else if (lino == mp->m_sb.sb_uquotino)
 		reason = _("user quota");
 	else if (lino == mp->m_sb.sb_gquotino)
diff --git a/repair/incore.h b/repair/incore.h
index 6ee7a662930..5014a2d303c 100644
--- a/repair/incore.h
+++ b/repair/incore.h
@@ -222,6 +222,7 @@ int		count_bcnt_extents(xfs_agnumber_t);
 #define XR_INO_GQUOTA	13		/* group quota inode */
 #define XR_INO_PQUOTA	14		/* project quota inode */
 #define XR_INO_RTRMAP	15		/* realtime rmap */
+#define XR_INO_RTREFC	16		/* realtime refcount */
 
 /* inode allocation tree */
 
diff --git a/repair/rmap.c b/repair/rmap.c
index b27e3079155..55b41cfa540 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -40,6 +40,12 @@ struct xfs_ag_rmap {
 	 * NULLFSINO to signal to phase 6 to link a new inode into the metadir.
 	 */
 	xfs_ino_t	rg_rmap_ino;
+
+	/*
+	 * inumber of the refcount btree for this rtgroup.  This can be set to
+	 * NULLFSINO to signal to phase 6 to link a new inode into the metadir.
+	 */
+	xfs_ino_t	rg_refcount_ino;
 };
 
 static struct xfs_ag_rmap *ag_rmaps;
@@ -50,6 +56,9 @@ static bool refcbt_suspect;
 /* Bitmap of rt group rmap inodes reachable via /realtime/$rgno.rmap. */
 static struct bitmap	*rmap_inodes;
 
+/* Bitmap of rt group refcount inodes reachable via /realtime/$rgno.refcount. */
+static struct bitmap	*refcount_inodes;
+
 static struct xfs_ag_rmap *rmaps_for_group(bool isrt, unsigned int group)
 {
 	if (isrt)
@@ -127,6 +136,7 @@ rmaps_init_rt(
 		goto nomem;
 
 	ag_rmap->rg_rmap_ino = NULLFSINO;
+	ag_rmap->rg_refcount_ino = NULLFSINO;
 	return;
 nomem:
 	do_error(
@@ -218,6 +228,50 @@ set_rtgroup_rmap_inode(
 	return error;
 }
 
+static inline int
+set_rtgroup_refcount_inode(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_imeta_path	*path;
+	struct xfs_ag_rmap	*ar = rmaps_for_group(true, rgno);
+	struct xfs_trans	*tp;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	error = -libxfs_rtrefcountbt_create_path(mp, rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	error = -libxfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_trans;
+
+	if (ino == NULLFSINO || bitmap_test(refcount_inodes, ino, 1)) {
+		error = EFSCORRUPTED;
+		goto out_trans;
+	}
+
+	error = bitmap_set(refcount_inodes, ino, 1);
+	if (error)
+		goto out_trans;
+
+	ar->rg_refcount_ino = ino;
+
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
 static void
 discover_rtgroup_inodes(
 	struct xfs_mount	*mp)
@@ -229,10 +283,20 @@ discover_rtgroup_inodes(
 	if (error)
 		goto out;
 
+	error = bitmap_alloc(&refcount_inodes);
+	if (error) {
+		bitmap_free(&rmap_inodes);
+		goto out;
+	}
+
 	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
 		int err2 = set_rtgroup_rmap_inode(mp, rgno);
 		if (err2 && !error)
 			error = err2;
+
+		err2 = set_rtgroup_refcount_inode(mp, rgno);
+		if (err2 && !error)
+			error = err2;
 	}
 
 out:
@@ -248,6 +312,7 @@ discover_rtgroup_inodes(
 static inline void
 free_rtmeta_inode_bitmaps(void)
 {
+	bitmap_free(&refcount_inodes);
 	bitmap_free(&rmap_inodes);
 }
 
@@ -263,10 +328,28 @@ rtgroup_for_rtrefcount_inode(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
 {
-	/* This will be implemented later. */
+	xfs_rgnumber_t		rgno;
+
+	if (!refcount_inodes)
+		return NULLRGNUMBER;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		if (rg_rmaps[rgno].rg_refcount_ino == ino)
+			return rgno;
+	}
+
 	return NULLRGNUMBER;
 }
 
+bool
+is_rtrefcount_ino(
+	xfs_ino_t		ino)
+{
+	if (!refcount_inodes)
+		return false;
+	return bitmap_test(refcount_inodes, ino, 1);
+}
+
 /*
  * Initialize per-AG reverse map data.
  */
@@ -1864,8 +1947,19 @@ init_refcount_cursor(
  * Disable the refcount btree check.
  */
 void
-refcount_avoid_check(void)
+refcount_avoid_check(
+	struct xfs_mount	*mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		struct xfs_ag_rmap *ar = rmaps_for_group(true, rtg->rtg_rgno);
+
+		ar->rg_refcount_ino = NULLFSINO;
+	}
+
+	bitmap_clear(refcount_inodes, 0, XFS_MAXINUMBER);
 	refcbt_suspect = true;
 }
 
diff --git a/repair/rmap.h b/repair/rmap.h
index 051481d2e2d..3e210fac87b 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -41,7 +41,7 @@ extern void rmap_high_key_from_rec(struct xfs_rmap_irec *rec,
 extern int compute_refcounts(struct xfs_mount *, xfs_agnumber_t);
 uint64_t refcount_record_count(struct xfs_mount *mp, xfs_agnumber_t agno);
 extern int init_refcount_cursor(xfs_agnumber_t, struct xfs_slab_cursor **);
-extern void refcount_avoid_check(void);
+extern void refcount_avoid_check(struct xfs_mount *mp);
 void check_refcounts(struct xfs_mount *mp, xfs_agnumber_t agno);
 
 extern void record_inode_reflink_flag(struct xfs_mount *, struct xfs_dinode *,
@@ -73,5 +73,6 @@ xfs_filblks_t estimate_rtrmapbt_blocks(struct xfs_rtgroup *rtg);
 
 xfs_rgnumber_t rtgroup_for_rtrefcount_inode(struct xfs_mount *mp,
 		xfs_ino_t ino);
+bool is_rtrefcount_ino(xfs_ino_t ino);
 
 #endif /* RMAP_H_ */
diff --git a/repair/scan.c b/repair/scan.c
index abf605e4978..88ca5acc040 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -1987,7 +1987,7 @@ _("extent (%u/%u) len %u claimed, state is %d\n"),
 	libxfs_perag_put(pag);
 out:
 	if (suspect)
-		refcount_avoid_check();
+		refcount_avoid_check(mp);
 	return;
 }
 
@@ -2277,7 +2277,7 @@ _("%s btree block claimed (state %d), agno %d, agbno %d, suspect %d\n"),
 	}
 out:
 	if (suspect) {
-		refcount_avoid_check();
+		refcount_avoid_check(mp);
 		return 1;
 	}
 
@@ -3140,7 +3140,7 @@ validate_agf(
 		if (levels == 0 || levels > mp->m_refc_maxlevels) {
 			do_warn(_("bad levels %u for refcountbt root, agno %d\n"),
 				levels, agno);
-			refcount_avoid_check();
+			refcount_avoid_check(mp);
 		}
 
 		bno = be32_to_cpu(agf->agf_refcount_root);
@@ -3158,7 +3158,7 @@ validate_agf(
 		} else {
 			do_warn(_("bad agbno %u for refcntbt root, agno %d\n"),
 				bno, agno);
-			refcount_avoid_check();
+			refcount_avoid_check(mp);
 		}
 	}
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/42] xfs_repair: compute refcount data for the realtime groups
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-27 13:34   ` [PATCH 32/42] xfs_repair: find and mark the rtrefcountbt inode Darrick J. Wong
@ 2023-12-27 13:34   ` Darrick J. Wong
  2023-12-27 13:35   ` [PATCH 34/42] xfs_repair: check existing realtime refcountbt entries against observed refcounts Darrick J. Wong
                     ` (8 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

At the end of phase 4, compute reference count information for realtime
groups from the realtime rmap information collected, just like we do for
AGs in the data section.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase4.c |   19 ++++++++++++++++++-
 repair/rmap.c   |   17 ++++++++++++-----
 repair/rmap.h   |    2 +-
 3 files changed, 31 insertions(+), 7 deletions(-)


diff --git a/repair/phase4.c b/repair/phase4.c
index b0cb805f30c..e90533689e0 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -173,13 +173,28 @@ compute_ag_refcounts(
 {
 	int		error;
 
-	error = compute_refcounts(wq->wq_ctx, agno);
+	error = compute_refcounts(wq->wq_ctx, false, agno);
 	if (error)
 		do_error(
 _("%s while computing reference count records.\n"),
 			 strerror(error));
 }
 
+static void
+compute_rt_refcounts(
+	struct workqueue*wq,
+	xfs_agnumber_t	rgno,
+	void		*arg)
+{
+	int		error;
+
+	error = compute_refcounts(wq->wq_ctx, true, rgno);
+	if (error)
+		do_error(
+_("%s while computing realtime reference count records.\n"),
+			 strerror(error));
+}
+
 static void
 process_inode_reflink_flags(
 	struct workqueue	*wq,
@@ -227,6 +242,8 @@ process_rmap_data(
 	create_work_queue(&wq, mp, platform_nproc());
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
 		queue_work(&wq, compute_ag_refcounts, i, NULL);
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
+		queue_work(&wq, compute_rt_refcounts, i, NULL);
 	destroy_work_queue(&wq);
 
 	create_work_queue(&wq, mp, platform_nproc());
diff --git a/repair/rmap.c b/repair/rmap.c
index 55b41cfa540..38a508ced16 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -135,6 +135,11 @@ rmaps_init_rt(
 	if (error)
 		goto nomem;
 
+	error = init_slab(&ag_rmap->ar_refcount_items,
+			  sizeof(struct xfs_refcount_irec));
+	if (error)
+		goto nomem;
+
 	ag_rmap->rg_rmap_ino = NULLFSINO;
 	ag_rmap->rg_refcount_ino = NULLFSINO;
 	return;
@@ -1080,6 +1085,7 @@ mark_reflink_inodes(
 static void
 refcount_emit(
 	struct xfs_mount	*mp,
+	bool			isrt,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len,
@@ -1089,7 +1095,7 @@ refcount_emit(
 	int			error;
 	struct xfs_slab		*rlslab;
 
-	rlslab = rmaps_for_group(false, agno)->ar_refcount_items;
+	rlslab = rmaps_for_group(isrt, agno)->ar_refcount_items;
 	ASSERT(nr_rmaps > 0);
 
 	dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
@@ -1208,6 +1214,7 @@ refcount_push_rmaps_at(
 int
 compute_refcounts(
 	struct xfs_mount	*mp,
+	bool			isrt,
 	xfs_agnumber_t		agno)
 {
 	struct rcbag		*rcstack;
@@ -1223,12 +1230,12 @@ compute_refcounts(
 
 	if (!xfs_has_reflink(mp))
 		return 0;
-	if (rmaps_for_group(false, agno)->ar_xfbtree == NULL)
+	if (rmaps_for_group(isrt, agno)->ar_xfbtree == NULL)
 		return 0;
 
-	nr_rmaps = rmap_record_count(mp, false, agno);
+	nr_rmaps = rmap_record_count(mp, isrt, agno);
 
-	error = rmap_init_mem_cursor(mp, NULL, false, agno, &rmcur);
+	error = rmap_init_mem_cursor(mp, NULL, isrt, agno, &rmcur);
 	if (error)
 		return error;
 
@@ -1285,7 +1292,7 @@ compute_refcounts(
 			ASSERT(nbno > cbno);
 			if (rcbag_count(rcstack) != old_stack_height) {
 				if (old_stack_height > 1) {
-					refcount_emit(mp, agno, cbno,
+					refcount_emit(mp, isrt, agno, cbno,
 							nbno - cbno,
 							old_stack_height);
 				}
diff --git a/repair/rmap.h b/repair/rmap.h
index 3e210fac87b..9663b3e3a20 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -38,7 +38,7 @@ extern int64_t rmap_diffkeys(struct xfs_rmap_irec *kp1,
 extern void rmap_high_key_from_rec(struct xfs_rmap_irec *rec,
 		struct xfs_rmap_irec *key);
 
-extern int compute_refcounts(struct xfs_mount *, xfs_agnumber_t);
+int compute_refcounts(struct xfs_mount *mp, bool isrt, xfs_agnumber_t agno);
 uint64_t refcount_record_count(struct xfs_mount *mp, xfs_agnumber_t agno);
 extern int init_refcount_cursor(xfs_agnumber_t, struct xfs_slab_cursor **);
 extern void refcount_avoid_check(struct xfs_mount *mp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/42] xfs_repair: check existing realtime refcountbt entries against observed refcounts
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-27 13:34   ` [PATCH 33/42] xfs_repair: compute refcount data for the realtime groups Darrick J. Wong
@ 2023-12-27 13:35   ` Darrick J. Wong
  2023-12-27 13:35   ` [PATCH 35/42] xfs_repair: reject unwritten shared extents Darrick J. Wong
                     ` (7 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Once we've finished collecting reverse mapping observations from the
metadata scan, check those observations against the realtime refcount
btree (particularly if we're in -n mode) to detect rtrefcountbt
problems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    1 
 repair/agbtree.c         |    2 
 repair/phase4.c          |   11 +++
 repair/rmap.c            |  196 +++++++++++++++++++++++++++++++++++-----------
 repair/rmap.h            |    4 +
 5 files changed, 166 insertions(+), 48 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 36e09b3a237..cfa70778262 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -295,6 +295,7 @@
 
 #define xfs_rtrefcountbt_create_path	libxfs_rtrefcountbt_create_path
 #define xfs_rtrefcountbt_droot_maxrecs	libxfs_rtrefcountbt_droot_maxrecs
+#define xfs_rtrefcountbt_init_cursor	libxfs_rtrefcountbt_init_cursor
 #define xfs_rtrefcountbt_maxlevels_ondisk	libxfs_rtrefcountbt_maxlevels_ondisk
 #define xfs_rtrefcountbt_maxrecs	libxfs_rtrefcountbt_maxrecs
 
diff --git a/repair/agbtree.c b/repair/agbtree.c
index a401c80da38..90863b0dd7d 100644
--- a/repair/agbtree.c
+++ b/repair/agbtree.c
@@ -746,7 +746,7 @@ build_refcount_tree(
 {
 	int			error;
 
-	error = init_refcount_cursor(agno, &btr->slab_cursor);
+	error = init_refcount_cursor(false, agno, &btr->slab_cursor);
 	if (error)
 		do_error(
 _("Insufficient memory to construct refcount cursor.\n"));
diff --git a/repair/phase4.c b/repair/phase4.c
index e90533689e0..8d97b63b2ce 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -219,6 +219,15 @@ check_refcount_btrees(
 	check_refcounts(wq->wq_ctx, agno);
 }
 
+static void
+check_rt_refcount_btrees(
+	struct workqueue	*wq,
+	xfs_agnumber_t		agno,
+	void			*arg)
+{
+	check_rtrefcounts(wq->wq_ctx, agno);
+}
+
 static void
 process_rmap_data(
 	struct xfs_mount	*mp)
@@ -251,6 +260,8 @@ process_rmap_data(
 		queue_work(&wq, process_inode_reflink_flags, i, NULL);
 		queue_work(&wq, check_refcount_btrees, i, NULL);
 	}
+	for (i = 0; i < mp->m_sb.sb_rgcount; i++)
+		queue_work(&wq, check_rt_refcount_btrees, i, NULL);
 	destroy_work_queue(&wq);
 }
 
diff --git a/repair/rmap.c b/repair/rmap.c
index 38a508ced16..51431808db2 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -1942,10 +1942,11 @@ refcount_record_count(
  */
 int
 init_refcount_cursor(
+	bool			isrt,
 	xfs_agnumber_t		agno,
 	struct xfs_slab_cursor	**cur)
 {
-	struct xfs_ag_rmap	*x = rmaps_for_group(false, agno);
+	struct xfs_ag_rmap	*x = rmaps_for_group(isrt, agno);
 
 	return init_slab_cursor(x->ar_refcount_items, NULL, cur);
 }
@@ -1970,56 +1971,18 @@ refcount_avoid_check(
 	refcbt_suspect = true;
 }
 
-/*
- * Compare the observed reference counts against what's in the ag btree.
- */
-void
-check_refcounts(
-	struct xfs_mount		*mp,
+static int
+check_refcount_records(
+	struct xfs_slab_cursor		*rl_cur,
+	struct xfs_btree_cur		*bt_cur,
 	xfs_agnumber_t			agno)
 {
 	struct xfs_refcount_irec	tmp;
-	struct xfs_slab_cursor		*rl_cur;
-	struct xfs_btree_cur		*bt_cur = NULL;
-	struct xfs_buf			*agbp = NULL;
-	struct xfs_perag		*pag = NULL;
 	struct xfs_refcount_irec	*rl_rec;
-	int				have;
 	int				i;
+	int				have;
 	int				error;
 
-	if (!xfs_has_reflink(mp) || add_reflink)
-		return;
-	if (refcbt_suspect) {
-		if (no_modify && agno == 0)
-			do_warn(_("would rebuild corrupt refcount btrees.\n"));
-		return;
-	}
-
-	/* Create cursors to refcount structures */
-	error = init_refcount_cursor(agno, &rl_cur);
-	if (error) {
-		do_warn(_("Not enough memory to check refcount data.\n"));
-		return;
-	}
-
-	pag = libxfs_perag_get(mp, agno);
-	error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
-	if (error) {
-		do_warn(_("Could not read AGF %u to check refcount btree.\n"),
-				agno);
-		goto err_pag;
-	}
-
-	/* Leave the per-ag data "uninitialized" since we rewrite it later */
-	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
-
-	bt_cur = libxfs_refcountbt_init_cursor(mp, NULL, agbp, pag);
-	if (!bt_cur) {
-		do_warn(_("Not enough memory to check refcount data.\n"));
-		goto err_agf;
-	}
-
 	rl_rec = pop_slab_cursor(rl_cur);
 	while (rl_rec) {
 		/* Look for a refcount record in the btree */
@@ -2030,7 +1993,7 @@ check_refcounts(
 			do_warn(
 _("Could not read reference count record for (%u/%u).\n"),
 					agno, rl_rec->rc_startblock);
-			goto err_cur;
+			return error;
 		}
 		if (!have) {
 			do_warn(
@@ -2045,7 +2008,7 @@ _("Missing reference count record for (%u/%u) len %u count %u\n"),
 			do_warn(
 _("Could not read reference count record for (%u/%u).\n"),
 					agno, rl_rec->rc_startblock);
-			goto err_cur;
+			return error;
 		}
 		if (!i) {
 			do_warn(
@@ -2075,6 +2038,59 @@ _("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) le
 		rl_rec = pop_slab_cursor(rl_cur);
 	}
 
+	return 0;
+}
+
+/*
+ * Compare the observed reference counts against what's in the ag btree.
+ */
+void
+check_refcounts(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno)
+{
+	struct xfs_slab_cursor		*rl_cur;
+	struct xfs_btree_cur		*bt_cur = NULL;
+	struct xfs_buf			*agbp = NULL;
+	struct xfs_perag		*pag = NULL;
+	int				error;
+
+	if (!xfs_has_reflink(mp) || add_reflink)
+		return;
+	if (refcbt_suspect) {
+		if (no_modify && agno == 0)
+			do_warn(_("would rebuild corrupt refcount btrees.\n"));
+		return;
+	}
+
+	/* Create cursors to refcount structures */
+	error = init_refcount_cursor(false, agno, &rl_cur);
+	if (error) {
+		do_warn(_("Not enough memory to check refcount data.\n"));
+		return;
+	}
+
+	pag = libxfs_perag_get(mp, agno);
+	error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
+	if (error) {
+		do_warn(_("Could not read AGF %u to check refcount btree.\n"),
+				agno);
+		goto err_pag;
+	}
+
+	/* Leave the per-ag data "uninitialized" since we rewrite it later */
+	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+
+	bt_cur = libxfs_refcountbt_init_cursor(mp, NULL, agbp, pag);
+	if (!bt_cur) {
+		do_warn(_("Not enough memory to check refcount data.\n"));
+		goto err_agf;
+	}
+
+	error = check_refcount_records(rl_cur, bt_cur, agno);
+	if (error)
+		goto err_cur;
+
 err_cur:
 	libxfs_btree_del_cursor(bt_cur, error);
 err_agf:
@@ -2084,6 +2100,94 @@ _("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) le
 	free_slab_cursor(&rl_cur);
 }
 
+/*
+ * Compare the observed reference counts against what's in the ondisk btree.
+ */
+void
+check_rtrefcounts(
+	struct xfs_mount		*mp,
+	xfs_rgnumber_t			rgno)
+{
+	struct xfs_slab_cursor		*rl_cur;
+	struct xfs_btree_cur		*bt_cur = NULL;
+	struct xfs_rtgroup		*rtg = NULL;
+	struct xfs_inode		*ip = NULL;
+	struct xfs_ag_rmap		*ar = rmaps_for_group(true, rgno);
+	int				error;
+
+	if (!xfs_has_reflink(mp) || add_reflink)
+		return;
+	if (refcbt_suspect) {
+		if (no_modify && rgno == 0)
+			do_warn(_("would rebuild corrupt refcount btrees.\n"));
+		return;
+	}
+	if (mp->m_sb.sb_rblocks == 0) {
+		if (rmap_record_count(mp, true, rgno) != 0)
+			do_error(_("realtime refcounts but no rtdev?\n"));
+		return;
+	}
+
+	/* Create cursors to refcount structures */
+	error = init_refcount_cursor(true, rgno, &rl_cur);
+	if (error) {
+		do_warn(_("Not enough memory to check refcount data.\n"));
+		return;
+	}
+
+	rtg = libxfs_rtgroup_get(mp, rgno);
+	if (!rtg) {
+		do_warn(_("Could not load rtgroup %u.\n"), rgno);
+		goto err_rcur;
+	}
+
+	error = threadsafe_imeta_iget(mp, ar->rg_refcount_ino, &ip);
+	if (error) {
+		do_warn(
+_("Cannot load rtgroup %u refcount inode 0x%llx, error %d.\n"),
+				rgno,
+				(unsigned long long)ar->rg_refcount_ino,
+				error);
+		goto err_rtg;
+	}
+
+	if (ip->i_df.if_format != XFS_DINODE_FMT_REFCOUNT) {
+		do_warn(
+_("rtgroup %u refcount inode has wrong format 0x%x, expected 0x%x\n"),
+				rgno,
+				ip->i_df.if_format,
+				XFS_DINODE_FMT_REFCOUNT);
+		goto err_ino;
+	}
+
+	if (xfs_inode_has_attr_fork(ip) &&
+	    !(xfs_has_metadir(mp) && xfs_has_parent(mp))) {
+		do_warn(
+_("rtgroup %u refcount inode should not have extended attributes\n"),
+				rgno);
+		goto err_ino;
+	}
+
+	bt_cur = libxfs_rtrefcountbt_init_cursor(mp, NULL, rtg, ip);
+	if (!bt_cur) {
+		do_warn(_("Not enough memory to check refcount data.\n"));
+		goto err_ino;
+	}
+
+	error = check_refcount_records(rl_cur, bt_cur, rgno);
+	if (error)
+		goto err_cur;
+
+err_cur:
+	libxfs_btree_del_cursor(bt_cur, error);
+err_ino:
+	libxfs_imeta_irele(ip);
+err_rtg:
+	libxfs_rtgroup_put(rtg);
+err_rcur:
+	free_slab_cursor(&rl_cur);
+}
+
 /*
  * Regenerate the AGFL so that we don't run out of it while rebuilding the
  * rmap btree.  If skip_rmapbt is true, don't update the rmapbt (most probably
diff --git a/repair/rmap.h b/repair/rmap.h
index 9663b3e3a20..45560805a4e 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -40,9 +40,11 @@ extern void rmap_high_key_from_rec(struct xfs_rmap_irec *rec,
 
 int compute_refcounts(struct xfs_mount *mp, bool isrt, xfs_agnumber_t agno);
 uint64_t refcount_record_count(struct xfs_mount *mp, xfs_agnumber_t agno);
-extern int init_refcount_cursor(xfs_agnumber_t, struct xfs_slab_cursor **);
+int init_refcount_cursor(bool isrt, xfs_agnumber_t agno,
+		struct xfs_slab_cursor **pcur);
 extern void refcount_avoid_check(struct xfs_mount *mp);
 void check_refcounts(struct xfs_mount *mp, xfs_agnumber_t agno);
+void check_rtrefcounts(struct xfs_mount *mp, xfs_rgnumber_t rgno);
 
 extern void record_inode_reflink_flag(struct xfs_mount *, struct xfs_dinode *,
 	xfs_agnumber_t, xfs_agino_t, xfs_ino_t);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/42] xfs_repair: reject unwritten shared extents
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-27 13:35   ` [PATCH 34/42] xfs_repair: check existing realtime refcountbt entries against observed refcounts Darrick J. Wong
@ 2023-12-27 13:35   ` Darrick J. Wong
  2023-12-27 13:35   ` [PATCH 36/42] xfs_repair: rebuild the realtime refcount btree Darrick J. Wong
                     ` (6 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

We don't allow sharing of unwritten extents, which means that repair
should reject an unwritten extent if someone else has already claimed
the space.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |   23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index ad4263f9aa8..f0c0ba4da4e 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -277,7 +277,8 @@ _("bad state in rt extent map %" PRIu64 "\n"),
 			break;
 		case XR_E_INUSE:
 		case XR_E_MULT:
-			if (xfs_has_rtreflink(mp))
+			if (xfs_has_rtreflink(mp) &&
+			    irec->br_state == XFS_EXT_NORM)
 				break;
 			set_rtbmap(ext, XR_E_MULT);
 			break;
@@ -353,8 +354,14 @@ _("data fork in rt inode %" PRIu64 " found rt metadata extent %" PRIu64 " in rt
 			return 1;
 		case XR_E_INUSE:
 		case XR_E_MULT:
-			if (xfs_has_rtreflink(mp))
-				break;
+			if (xfs_has_rtreflink(mp)) {
+				if (irec->br_state == XFS_EXT_NORM)
+					break;
+				do_warn(
+_("data fork in rt inode %" PRIu64 " claims shared unwritten rt extent %" PRIu64 "\n"),
+					ino, b);
+				return 1;
+			}
 			do_warn(
 _("data fork in rt inode %" PRIu64 " claims used rt extent %" PRIu64 "\n"),
 				ino, b);
@@ -671,8 +678,14 @@ _("%s fork in inode %" PRIu64 " claims metadata block %" PRIu64 "\n"),
 			case XR_E_INUSE:
 			case XR_E_MULT:
 				if (type == XR_INO_DATA &&
-				    xfs_has_reflink(mp))
-					break;
+				    xfs_has_reflink(mp)) {
+					if (irec.br_state == XFS_EXT_NORM)
+						break;
+					do_warn(
+_("%s fork in %s inode %" PRIu64 " claims shared unwritten block %" PRIu64 "\n"),
+						forkname, ftype, ino, b);
+					goto done;
+				}
 				do_warn(
 _("%s fork in %s inode %" PRIu64 " claims used block %" PRIu64 "\n"),
 					forkname, ftype, ino, b);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/42] xfs_repair: rebuild the realtime refcount btree
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-27 13:35   ` [PATCH 35/42] xfs_repair: reject unwritten shared extents Darrick J. Wong
@ 2023-12-27 13:35   ` Darrick J. Wong
  2023-12-27 13:35   ` [PATCH 37/42] xfs_repair: allow realtime files to have the reflink flag set Darrick J. Wong
                     ` (5 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the collected reference count information to rebuild the btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h   |    5 +
 repair/Makefile            |    1 
 repair/agbtree.c           |    2 
 repair/phase5.c            |    3 -
 repair/phase6.c            |  133 +++++++++++++++++++++++
 repair/rmap.c              |   31 +++++
 repair/rmap.h              |    7 +
 repair/rtrefcount_repair.c |  256 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 434 insertions(+), 4 deletions(-)
 create mode 100644 repair/rtrefcount_repair.c


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index cfa70778262..bc34ce9caad 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -293,11 +293,16 @@
 #define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
 
+#define xfs_rtrefcountbt_absolute_maxlevels	libxfs_rtrefcountbt_absolute_maxlevels
+#define xfs_rtrefcountbt_calc_size		libxfs_rtrefcountbt_calc_size
+#define xfs_rtrefcountbt_commit_staged_btree	libxfs_rtrefcountbt_commit_staged_btree
+#define xfs_rtrefcountbt_create		libxfs_rtrefcountbt_create
 #define xfs_rtrefcountbt_create_path	libxfs_rtrefcountbt_create_path
 #define xfs_rtrefcountbt_droot_maxrecs	libxfs_rtrefcountbt_droot_maxrecs
 #define xfs_rtrefcountbt_init_cursor	libxfs_rtrefcountbt_init_cursor
 #define xfs_rtrefcountbt_maxlevels_ondisk	libxfs_rtrefcountbt_maxlevels_ondisk
 #define xfs_rtrefcountbt_maxrecs	libxfs_rtrefcountbt_maxrecs
+#define xfs_rtrefcountbt_stage_cursor	libxfs_rtrefcountbt_stage_cursor
 
 #define xfs_rtrmapbt_calc_reserves	libxfs_rtrmapbt_calc_reserves
 #define xfs_rtrmapbt_calc_size		libxfs_rtrmapbt_calc_size
diff --git a/repair/Makefile b/repair/Makefile
index 5bec8154829..5eca6d64c51 100644
--- a/repair/Makefile
+++ b/repair/Makefile
@@ -75,6 +75,7 @@ CFILES = \
 	rcbag.c \
 	rmap.c \
 	rt.c \
+	rtrefcount_repair.c \
 	rtrmap_repair.c \
 	sb.c \
 	scan.c \
diff --git a/repair/agbtree.c b/repair/agbtree.c
index 90863b0dd7d..571eeef231a 100644
--- a/repair/agbtree.c
+++ b/repair/agbtree.c
@@ -729,7 +729,7 @@ init_refc_cursor(
 
 	/* Compute how many blocks we'll need. */
 	error = -libxfs_btree_bload_compute_geometry(btr->cur, &btr->bload,
-			refcount_record_count(sc->mp, agno));
+			refcount_record_count(sc->mp, false, agno));
 	if (error)
 		do_error(
 _("Unable to compute refcount btree geometry, error %d.\n"), error);
diff --git a/repair/phase5.c b/repair/phase5.c
index 5e1dff0aadd..8c0685f494f 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -705,7 +705,7 @@ are_packed_btrees_needed(
 	 * If we don't have inode-based metadata, we can let the AG btrees
 	 * pack as needed; there are no global space concerns here.
 	 */
-	if (!xfs_has_rtrmapbt(mp))
+	if (!xfs_has_rtrmapbt(mp) && !xfs_has_rtreflink(mp))
 		return false;
 
 	for_each_perag(mp, agno, pag) {
@@ -718,6 +718,7 @@ are_packed_btrees_needed(
 
 	for_each_rtgroup(mp, rgno, rtg) {
 		metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+		metadata_blocks += estimate_rtrefcountbt_blocks(rtg);
 	}
 
 	/*
diff --git a/repair/phase6.c b/repair/phase6.c
index c9974623d12..9c79b233e35 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -1148,6 +1148,137 @@ ensure_rtgroup_rmapbt(
 	libxfs_imeta_irele(ip);
 }
 
+static void
+ensure_rtgroup_refcountbt(
+	struct xfs_rtgroup	*rtg,
+	xfs_filblks_t		est_fdblocks)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip;
+	struct xfs_imeta_update	upd;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return;
+
+	ino = rtgroup_refcount_ino(rtg);
+	if (no_modify) {
+		if (ino == NULLFSINO)
+			do_warn(_("would reset rtgroup %u refcount btree\n"),
+					rtg->rtg_rgno);
+		return;
+	}
+
+	if (ino == NULLFSINO)
+		do_warn(_("resetting rtgroup %u refcount btree\n"),
+				rtg->rtg_rgno);
+
+	error = -libxfs_rtrefcountbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		do_error(
+ _("Couldn't create rtgroup %u refcount file path, err %d\n"),
+				rtg->rtg_rgno, error);
+
+	error = ensure_imeta_dirpath(mp, path);
+	if (error)
+		do_error(
+ _("Couldn't create rtgroup %u metadata directory, error %d\n"),
+				rtg->rtg_rgno, error);
+
+	if (ino != NULLFSINO) {
+		struct xfs_trans	*tp;
+
+		/*
+		 * We're still hanging on to our old inode pointer, so grab it
+		 * and reconnect it to the metadata directory tree.  If it
+		 * can't be grabbed, create a new rtrefcount file.
+		 */
+		error = -libxfs_trans_alloc_empty(mp, &tp);
+		if (error)
+			do_error(
+ _("Couldn't allocate transaction to iget rtgroup %u refcountbt inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno, (unsigned long long)ino,
+					error);
+		error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+		libxfs_trans_cancel(tp);
+		if (error) {
+			do_warn(
+ _("Couldn't iget rtgroup %u refcountbt inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno,
+					(unsigned long long)ino,
+					error);
+			goto zap;
+		}
+
+		/*
+		 * Since we're reattaching this file to the metadata directory
+		 * tree, try to remove all the parent pointers that might be
+		 * attached.
+		 */
+		try_erase_parent_ptrs(ip);
+
+		error = -libxfs_imeta_start_link(mp, path, ip, &upd);
+		if (error)
+			do_error(
+ _("Couldn't grab resources to reconnect rtgroup %u refcountbt, error %d\n"),
+					rtg->rtg_rgno, error);
+
+		error = -libxfs_imeta_link(&upd);
+		if (error)
+			do_error(
+ _("Failed to link rtgroup %u refcountbt inode 0x%llx, error %d\n"),
+					rtg->rtg_rgno,
+					(unsigned long long)ino,
+					error);
+
+		/* Reset the link count to something sane. */
+		set_nlink(VFS_I(ip), 1);
+		ip->i_df.if_format = XFS_DINODE_FMT_REFCOUNT;
+		libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE);
+	} else {
+zap:
+		/*
+		 * The rtrefcount inode was bad or gone, so just make a new one
+		 * and give our reference to the rtgroup structure.
+		 */
+		error = -libxfs_imeta_start_create(mp, path, &upd);
+		if (error)
+			do_error(
+ _("Couldn't grab resources to recreate rtgroup %u refcountbt, error %d\n"),
+					rtg->rtg_rgno, error);
+
+		error = -libxfs_rtrefcountbt_create(&upd, &ip);
+		if (error)
+			do_error(
+ _("Couldn't create rtgroup %u refcountbt inode, error %d\n"),
+					rtg->rtg_rgno, error);
+	}
+
+	/* Mark the inode in use. */
+	mark_ino_inuse(mp, ip->i_ino, S_IFREG, upd.dp->i_ino);
+	mark_ino_metadata(mp, ip->i_ino);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		do_error(
+ _("Couldn't commit new rtgroup %u refcountbt inode %llu, error %d\n"),
+				rtg->rtg_rgno,
+				(unsigned long long)ip->i_ino,
+				error);
+
+	/* Copy our incore refcount data to the ondisk refcount inode. */
+	error = populate_rtgroup_refcountbt(rtg, ip, est_fdblocks);
+	if (error)
+		do_error(
+ _("rtgroup %u refcount btree could not be rebuilt, error %d\n"),
+				rtg->rtg_rgno, error);
+
+	libxfs_imeta_free_path(path);
+	libxfs_imeta_irele(ip);
+}
+
 /* Initialize a root directory. */
 static int
 init_fs_root_dir(
@@ -3931,6 +4062,7 @@ reset_rt_metadata_inodes(
 	if (!need_packed_btrees) {
 		for_each_rtgroup(mp, rgno, rtg) {
 			metadata_blocks += estimate_rtrmapbt_blocks(rtg);
+			metadata_blocks += estimate_rtrefcountbt_blocks(rtg);
 		}
 		if (mp->m_sb.sb_fdblocks > metadata_blocks)
 			est_fdblocks = mp->m_sb.sb_fdblocks - metadata_blocks;
@@ -3938,6 +4070,7 @@ reset_rt_metadata_inodes(
 
 	for_each_rtgroup(mp, rgno, rtg) {
 		ensure_rtgroup_rmapbt(rtg, est_fdblocks);
+		ensure_rtgroup_refcountbt(rtg, est_fdblocks);
 	}
 }
 
diff --git a/repair/rmap.c b/repair/rmap.c
index 51431808db2..ab5664846c8 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -1930,9 +1930,10 @@ _("Unable to fix reflink flag on inode %"PRIu64".\n"),
 uint64_t
 refcount_record_count(
 	struct xfs_mount	*mp,
+	bool			isrt,
 	xfs_agnumber_t		agno)
 {
-	struct xfs_ag_rmap	*x = rmaps_for_group(false, agno);
+	struct xfs_ag_rmap	*x = rmaps_for_group(isrt, agno);
 
 	return slab_count(x->ar_refcount_items);
 }
@@ -2346,3 +2347,31 @@ estimate_rtrmapbt_blocks(
 	nr_recs = xfbtree_bytes(x->ar_xfbtree) / sizeof(struct xfs_rmap_rec);
 	return libxfs_rtrmapbt_calc_size(mp, nr_recs);
 }
+
+xfs_ino_t
+rtgroup_refcount_ino(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_ag_rmap	*ar = rmaps_for_group(true, rtg->rtg_rgno);
+
+	return ar->rg_refcount_ino;
+}
+
+/* Estimate the size of the ondisk rtrefcountbt from the incore data. */
+xfs_filblks_t
+estimate_rtrefcountbt_blocks(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_ag_rmap	*x;
+
+	if (!rmap_needs_work(mp) || !xfs_has_rtreflink(mp))
+		return 0;
+
+	x = &rg_rmaps[rtg->rtg_rgno];
+	if (!x->ar_refcount_items)
+		return 0;
+
+	return libxfs_rtrefcountbt_calc_size(mp,
+			slab_count(x->ar_refcount_items));
+}
diff --git a/repair/rmap.h b/repair/rmap.h
index 45560805a4e..74322044cb5 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -39,7 +39,8 @@ extern void rmap_high_key_from_rec(struct xfs_rmap_irec *rec,
 		struct xfs_rmap_irec *key);
 
 int compute_refcounts(struct xfs_mount *mp, bool isrt, xfs_agnumber_t agno);
-uint64_t refcount_record_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+uint64_t refcount_record_count(struct xfs_mount *mp, bool isrt,
+		xfs_agnumber_t agno);
 int init_refcount_cursor(bool isrt, xfs_agnumber_t agno,
 		struct xfs_slab_cursor **pcur);
 extern void refcount_avoid_check(struct xfs_mount *mp);
@@ -76,5 +77,9 @@ xfs_filblks_t estimate_rtrmapbt_blocks(struct xfs_rtgroup *rtg);
 xfs_rgnumber_t rtgroup_for_rtrefcount_inode(struct xfs_mount *mp,
 		xfs_ino_t ino);
 bool is_rtrefcount_ino(xfs_ino_t ino);
+xfs_ino_t rtgroup_refcount_ino(struct xfs_rtgroup *rtg);
+int populate_rtgroup_refcountbt(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		xfs_filblks_t fdblocks);
+xfs_filblks_t estimate_rtrefcountbt_blocks(struct xfs_rtgroup *rtg);
 
 #endif /* RMAP_H_ */
diff --git a/repair/rtrefcount_repair.c b/repair/rtrefcount_repair.c
new file mode 100644
index 00000000000..f11684b4775
--- /dev/null
+++ b/repair/rtrefcount_repair.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include <libxfs.h>
+#include "btree.h"
+#include "err_protos.h"
+#include "libxlog.h"
+#include "incore.h"
+#include "globals.h"
+#include "dinode.h"
+#include "slab.h"
+#include "rmap.h"
+#include "bulkload.h"
+
+/*
+ * Realtime Reference Count (RTREFCBT) Repair
+ * ==========================================
+ *
+ * Gather all the reference count records for the realtime device, reset the
+ * incore fork, then recreate the btree.
+ */
+struct xrep_rtrefc {
+	/* rtrefcbt slab cursor */
+	struct xfs_slab_cursor	*slab_cursor;
+
+	/* New fork. */
+	struct bulkload		new_fork_info;
+	struct xfs_btree_bload	rtrefc_bload;
+
+	struct repair_ctx	*sc;
+	struct xfs_rtgroup	*rtg;
+
+	/* Estimated free space after building all rt btrees */
+	xfs_filblks_t		est_fdblocks;
+};
+
+/* Retrieve rtrefc data for bulk load. */
+STATIC int
+xrep_rtrefc_get_records(
+	struct xfs_btree_cur		*cur,
+	unsigned int			idx,
+	struct xfs_btree_block		*block,
+	unsigned int			nr_wanted,
+	void				*priv)
+{
+	struct xfs_refcount_irec	*rec;
+	struct xrep_rtrefc		*rc = priv;
+	union xfs_btree_rec		*block_rec;
+	unsigned int			loaded;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		rec = pop_slab_cursor(rc->slab_cursor);
+		memcpy(&cur->bc_rec.rc, rec, sizeof(struct xfs_refcount_irec));
+
+		block_rec = libxfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrefc_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rtrefc	*rr = priv;
+
+	return bulkload_claim_block(cur, &rr->new_fork_info, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrefc_iroot_size(
+	struct xfs_btree_cur	*cur,
+	unsigned int		level,
+	unsigned int		nr_this_level,
+	void			*priv)
+{
+	return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level,
+			nr_this_level);
+}
+
+/* Reserve new btree blocks and bulk load all the rtrmap records. */
+STATIC int
+xrep_rtrefc_btree_load(
+	struct xrep_rtrefc	*rr,
+	struct xfs_btree_cur	*rtrmap_cur)
+{
+	struct repair_ctx	*sc = rr->sc;
+	int			error;
+
+	rr->rtrefc_bload.get_records = xrep_rtrefc_get_records;
+	rr->rtrefc_bload.claim_block = xrep_rtrefc_claim_block;
+	rr->rtrefc_bload.iroot_size = xrep_rtrefc_iroot_size;
+	bulkload_estimate_inode_slack(sc->mp, &rr->rtrefc_bload,
+			rr->est_fdblocks);
+
+	/* Compute how many blocks we'll need. */
+	error = -libxfs_btree_bload_compute_geometry(rtrmap_cur,
+			&rr->rtrefc_bload,
+			refcount_record_count(sc->mp, true, rr->rtg->rtg_rgno));
+	if (error)
+		return error;
+
+	/*
+	 * Guess how many blocks we're going to need to rebuild an entire
+	 * rtrefcountbt from the number of extents we found, and pump up our
+	 * transaction to have sufficient block reservation.
+	 */
+	error = -libxfs_trans_reserve_more(sc->tp, rr->rtrefc_bload.nr_blocks,
+			0);
+	if (error)
+		return error;
+
+	/*
+	 * Reserve the space we'll need for the new btree.  Drop the cursor
+	 * while we do this because that can roll the transaction and cursors
+	 * can't handle that.
+	 */
+	error = bulkload_alloc_file_blocks(&rr->new_fork_info,
+			rr->rtrefc_bload.nr_blocks);
+	if (error)
+		return error;
+
+	/* Add all observed rtrmap records. */
+	error = init_refcount_cursor(true, rr->rtg->rtg_rgno, &rr->slab_cursor);
+	if (error)
+		return error;
+	error = -libxfs_btree_bload(rtrmap_cur, &rr->rtrefc_bload, rr);
+	free_slab_cursor(&rr->slab_cursor);
+	return error;
+}
+
+/* Update the inode counters. */
+STATIC int
+xrep_rtrefc_reset_counters(
+	struct xrep_rtrefc	*rr)
+{
+	struct repair_ctx	*sc = rr->sc;
+
+	/*
+	 * Update the inode block counts to reflect the btree we just
+	 * generated.
+	 */
+	sc->ip->i_nblocks = rr->new_fork_info.ifake.if_blocks;
+	libxfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+	/* Quotas don't exist so we're done. */
+	return 0;
+}
+
+/*
+ * Use the collected rmap information to stage a new rt refcount btree.  If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+static int
+xrep_rtrefc_build_new_tree(
+	struct xrep_rtrefc	*rr)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_btree_cur	*cur;
+	struct repair_ctx	*sc = rr->sc;
+	struct xbtree_ifakeroot	*ifake = &rr->new_fork_info.ifake;
+	int			error;
+
+	/*
+	 * Prepare to construct the new fork by initializing the new btree
+	 * structure and creating a fake ifork in the ifakeroot structure.
+	 */
+	libxfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+	bulkload_init_inode(&rr->new_fork_info, sc, XFS_DATA_FORK, &oinfo);
+	cur = libxfs_rtrefcountbt_stage_cursor(sc->mp, rr->rtg, sc->ip, ifake);
+
+	/*
+	 * Figure out the size and format of the new fork, then fill it with
+	 * all the rtrmap records we've found.  Join the inode to the
+	 * transaction so that we can roll the transaction while holding the
+	 * inode locked.
+	 */
+	libxfs_trans_ijoin(sc->tp, sc->ip, 0);
+	ifake->if_fork->if_format = XFS_DINODE_FMT_REFCOUNT;
+	error = xrep_rtrefc_btree_load(rr, cur);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Install the new fork in the inode.  After this point the old mapping
+	 * data are no longer accessible and the new tree is live.  We delete
+	 * the cursor immediately after committing the staged root because the
+	 * staged fork might be in extents format.
+	 */
+	libxfs_rtrefcountbt_commit_staged_btree(cur, sc->tp);
+	libxfs_btree_del_cursor(cur, 0);
+
+	/* Reset the inode counters now that we've changed the fork. */
+	error = xrep_rtrefc_reset_counters(rr);
+	if (error)
+		goto err_newbt;
+
+	/* Dispose of any unused blocks and the accounting infomation. */
+	error = bulkload_commit(&rr->new_fork_info);
+	if (error)
+		return error;
+
+	return -libxfs_trans_roll_inode(&sc->tp, sc->ip);
+err_cur:
+	if (cur)
+		libxfs_btree_del_cursor(cur, error);
+err_newbt:
+	bulkload_cancel(&rr->new_fork_info);
+	return error;
+}
+
+/* Store the realtime reference counts in the rtrefcbt. */
+int
+populate_rtgroup_refcountbt(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	xfs_filblks_t		est_fdblocks)
+{
+	struct repair_ctx	sc = {
+		.mp		= rtg->rtg_mount,
+		.ip		= ip,
+	};
+	struct xrep_rtrefc	rr = {
+		.sc		= &sc,
+		.rtg		= rtg,
+		.est_fdblocks	= est_fdblocks,
+	};
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+			&sc.tp);
+	if (error)
+		return error;
+
+	error = xrep_rtrefc_build_new_tree(&rr);
+	if (error)
+		goto out_cancel;
+
+	return -libxfs_trans_commit(sc.tp);
+
+out_cancel:
+	libxfs_trans_cancel(sc.tp);
+	return error;
+}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/42] xfs_repair: allow realtime files to have the reflink flag set
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-27 13:35   ` [PATCH 36/42] xfs_repair: rebuild the realtime refcount btree Darrick J. Wong
@ 2023-12-27 13:35   ` Darrick J. Wong
  2023-12-27 13:36   ` [PATCH 38/42] xfs_repair: validate CoW extent size hint on rtinherit directories Darrick J. Wong
                     ` (4 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we allow reflink on the realtime volume, allow that combination
of inode flags if the feature's enabled.  Note that we now allow inodes
to have rtinherit even if there's no realtime volume, since the kernel
has never restricted that.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index f0c0ba4da4e..2b254b29c4a 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -3199,7 +3199,8 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 		}
 
 		if ((flags2 & XFS_DIFLAG2_REFLINK) &&
-		    (flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT))) {
+		    !xfs_has_rtreflink(mp) &&
+		    (flags & XFS_DIFLAG_REALTIME)) {
 			if (!uncertain) {
 				do_warn(
 	_("Cannot have a reflinked realtime inode %" PRIu64 "\n"),
@@ -3231,7 +3232,8 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 		}
 
 		if ((flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
-		    (flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT))) {
+		    !xfs_has_rtreflink(mp) &&
+		    (flags & XFS_DIFLAG_REALTIME)) {
 			if (!uncertain) {
 				do_warn(
 	_("Cannot have CoW extent size hint on a realtime inode %" PRIu64 "\n"),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/42] xfs_repair: validate CoW extent size hint on rtinherit directories
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-27 13:35   ` [PATCH 37/42] xfs_repair: allow realtime files to have the reflink flag set Darrick J. Wong
@ 2023-12-27 13:36   ` Darrick J. Wong
  2023-12-27 13:36   ` [PATCH 39/42] xfs_repair: allow sysadmins to add realtime reflink Darrick J. Wong
                     ` (3 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

XFS allows a sysadmin to change the rt extent size when adding a rt
section to a filesystem after formatting.  If there are any directories
with both a cowextsize hint and rtinherit set, the hint could become
misaligned with the new rextsize.  Offer to fix the problem if we're in
modify mode and the verifier didn't trip.  If we're in dry run mode,
we let the kernel fix it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |   64 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 21 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index 2b254b29c4a..df88a162829 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -2808,6 +2808,47 @@ _("Bad extent size hint %u on inode %" PRIu64 ", "),
 	}
 }
 
+static void
+validate_cowextsize(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dino,
+	xfs_ino_t		lino,
+	int			*dirty)
+{
+	uint16_t		flags = be16_to_cpu(dino->di_flags);
+	uint64_t		flags2 = be64_to_cpu(dino->di_flags2);
+	unsigned int		value = be32_to_cpu(dino->di_cowextsize);
+	bool			misaligned = false;
+	bool			bad;
+
+	/*
+	 * XFS allows a sysadmin to change the rt extent size when adding a
+	 * rt section to a filesystem after formatting.  If there are any
+	 * directories with both a cowextsize hint and rtinherit set, the
+	 * hint could become misaligned with the new rextsize.
+	 */
+	if ((flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    (flags & XFS_DIFLAG_RTINHERIT) &&
+	    value % mp->m_sb.sb_rextsize > 0)
+		misaligned = true;
+
+	/* Complain if the verifier fails. */
+	bad = libxfs_inode_validate_cowextsize(mp, value,
+			be16_to_cpu(dino->di_mode), flags, flags2) != NULL;
+	if (bad || misaligned) {
+		do_warn(
+_("Bad CoW extent size hint %u on inode %" PRIu64 ", "),
+				be32_to_cpu(dino->di_cowextsize), lino);
+		if (!no_modify) {
+			do_warn(_("resetting to zero\n"));
+			dino->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+			dino->di_cowextsize = 0;
+			*dirty = 1;
+		} else
+			do_warn(_("would reset to zero\n"));
+	}
+}
+
 /*
  * returns 0 if the inode is ok, 1 if the inode is corrupt
  * check_dups can be set to 1 *only* when called by the
@@ -3385,27 +3426,8 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 
 	validate_extsize(mp, dino, lino, dirty);
 
-	/*
-	 * Only (regular files and directories) with COWEXTSIZE flags
-	 * set can have extsize set.
-	 */
-	if (dino->di_version >= 3 &&
-	    libxfs_inode_validate_cowextsize(mp,
-			be32_to_cpu(dino->di_cowextsize),
-			be16_to_cpu(dino->di_mode),
-			be16_to_cpu(dino->di_flags),
-			be64_to_cpu(dino->di_flags2)) != NULL) {
-		do_warn(
-_("Bad CoW extent size %u on inode %" PRIu64 ", "),
-				be32_to_cpu(dino->di_cowextsize), lino);
-		if (!no_modify)  {
-			do_warn(_("resetting to zero\n"));
-			dino->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
-			dino->di_cowextsize = 0;
-			*dirty = 1;
-		} else
-			do_warn(_("would reset to zero\n"));
-	}
+	if (dino->di_version >= 3)
+		validate_cowextsize(mp, dino, lino, dirty);
 
 	/* nsec fields cannot be larger than 1 billion */
 	check_nsec("atime", lino, dino, &dino->di_atime, dirty);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/42] xfs_repair: allow sysadmins to add realtime reflink
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-27 13:36   ` [PATCH 38/42] xfs_repair: validate CoW extent size hint on rtinherit directories Darrick J. Wong
@ 2023-12-27 13:36   ` Darrick J. Wong
  2023-12-27 13:36   ` [PATCH 40/42] xfs_logprint: report realtime CUIs Darrick J. Wong
                     ` (2 subsequent siblings)
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the sysadmin to use xfs_repair to upgrade an existing filesystem
to support the realtime reference count btree, and therefore reflink on
realtime volumes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    1 +
 repair/phase2.c          |   83 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 2 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index bc34ce9caad..4e7b3caba4b 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -294,6 +294,7 @@
 #define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
 
 #define xfs_rtrefcountbt_absolute_maxlevels	libxfs_rtrefcountbt_absolute_maxlevels
+#define xfs_rtrefcountbt_calc_reserves	libxfs_rtrefcountbt_calc_reserves
 #define xfs_rtrefcountbt_calc_size		libxfs_rtrefcountbt_calc_size
 #define xfs_rtrefcountbt_commit_staged_btree	libxfs_rtrefcountbt_commit_staged_btree
 #define xfs_rtrefcountbt_create		libxfs_rtrefcountbt_create
diff --git a/repair/phase2.c b/repair/phase2.c
index 22458aee4cd..cd3aa30eecd 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -223,14 +223,19 @@ set_reflink(
 		exit(0);
 	}
 
-	if (xfs_has_realtime(mp)) {
-		printf(_("Reflink feature not supported with realtime.\n"));
+	if (xfs_has_realtime(mp) && !xfs_has_rtgroups(mp)) {
+		printf(_("Reference count btree requires realtime groups.\n"));
 		exit(0);
 	}
 
 	printf(_("Adding reflink support to filesystem.\n"));
 	new_sb->sb_features_ro_compat |= XFS_SB_FEAT_RO_COMPAT_REFLINK;
 	new_sb->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
+
+	/* Quota counts will be wrong once we add the refcount inodes. */
+	if (xfs_has_realtime(mp))
+		quotacheck_skip();
+
 	return true;
 }
 
@@ -511,6 +516,63 @@ reserve_rtrmap_inode(
 	return error;
 }
 
+/*
+ * Reserve space to handle rt refcount btree expansion.
+ *
+ * If the refcount inode for this group already exists, we assume that we're
+ * adding some other feature.  Note that we have not validated the metadata
+ * directory tree, so we must perform the lookup by hand and abort the upgrade
+ * if there are errors.  If the inode does not exist, the amount of space
+ * needed to handle a new maximally sized refcount btree is added to @new_resv.
+ */
+static int
+reserve_rtrefcount_inode(
+	struct xfs_rtgroup	*rtg,
+	xfs_rfsblock_t		*new_resv)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_trans	*tp;
+	xfs_ino_t		ino;
+	xfs_filblks_t		ask;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	error = -libxfs_rtrefcountbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		goto out_path;
+
+	ask = libxfs_rtrefcountbt_calc_reserves(mp);
+
+	error = -libxfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_trans;
+
+	if (ino == NULLFSINO) {
+		*new_resv += ask;
+		goto out_trans;
+	}
+
+	error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE,
+			&rtg->rtg_refcountip);
+	if (error)
+		goto out_trans;
+
+	error = -libxfs_imeta_resv_init_inode(rtg->rtg_refcountip, ask);
+
+out_trans:
+	libxfs_trans_cancel(tp);
+out_path:
+	libxfs_imeta_free_path(path);
+	return error;
+}
+
 static void
 check_fs_free_space(
 	struct xfs_mount		*mp,
@@ -610,6 +672,18 @@ _("Not enough free space would remain for rtgroup %u rmap inode.\n"),
 			do_error(
 _("Error %d while checking rtgroup %u rmap inode space reservation.\n"),
 					error, rtg->rtg_rgno);
+
+		error = reserve_rtrefcount_inode(rtg, &new_resv);
+		if (error == ENOSPC) {
+			printf(
+_("Not enough free space would remain for rtgroup %u refcount inode.\n"),
+					rtg->rtg_rgno);
+			exit(0);
+		}
+		if (error)
+			do_error(
+_("Error %d while checking rtgroup %u refcount inode space reservation.\n"),
+					error, rtg->rtg_rgno);
 	}
 
 	/*
@@ -643,6 +717,11 @@ _("Error %d while checking rtgroup %u rmap inode space reservation.\n"),
 			libxfs_imeta_irele(rtg->rtg_rmapip);
 			rtg->rtg_rmapip = NULL;
 		}
+		if (rtg->rtg_refcountip) {
+			libxfs_imeta_resv_free_inode(rtg->rtg_refcountip);
+			libxfs_imeta_irele(rtg->rtg_refcountip);
+			rtg->rtg_refcountip = NULL;
+		}
 	}
 
 	/*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 40/42] xfs_logprint: report realtime CUIs
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (38 preceding siblings ...)
  2023-12-27 13:36   ` [PATCH 39/42] xfs_repair: allow sysadmins to add realtime reflink Darrick J. Wong
@ 2023-12-27 13:36   ` Darrick J. Wong
  2023-12-27 13:36   ` [PATCH 41/42] mkfs: validate CoW extent size hint when rtinherit is set Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 42/42] mkfs: enable reflink on the realtime device Darrick J. Wong
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Decode the CUI format just enough to report if an CUI targets the
realtime device or not.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 logprint/log_misc.c      |    2 ++
 logprint/log_print_all.c |    8 ++++++++
 logprint/log_redo.c      |   24 +++++++++++++++++++-----
 3 files changed, 29 insertions(+), 5 deletions(-)


diff --git a/logprint/log_misc.c b/logprint/log_misc.c
index 3661b595c53..6dad686d3b2 100644
--- a/logprint/log_misc.c
+++ b/logprint/log_misc.c
@@ -1034,12 +1034,14 @@ xlog_print_record(
 					be32_to_cpu(op_head->oh_len));
 			break;
 		    }
+		    case XFS_LI_CUI_RT:
 		    case XFS_LI_CUI: {
 			skip = xlog_print_trans_cui(&ptr,
 					be32_to_cpu(op_head->oh_len),
 					continued);
 			break;
 		    }
+		    case XFS_LI_CUD_RT:
 		    case XFS_LI_CUD: {
 			skip = xlog_print_trans_cud(&ptr,
 					be32_to_cpu(op_head->oh_len));
diff --git a/logprint/log_print_all.c b/logprint/log_print_all.c
index e67e2c57f26..2ae642ac000 100644
--- a/logprint/log_print_all.c
+++ b/logprint/log_print_all.c
@@ -432,9 +432,11 @@ xlog_recover_print_logitem(
 	case XFS_LI_RUI:
 		xlog_recover_print_rui(item);
 		break;
+	case XFS_LI_CUD_RT:
 	case XFS_LI_CUD:
 		xlog_recover_print_cud(item);
 		break;
+	case XFS_LI_CUI_RT:
 	case XFS_LI_CUI:
 		xlog_recover_print_cui(item);
 		break;
@@ -514,6 +516,12 @@ xlog_recover_print_item(
 	case XFS_LI_CUI:
 		printf("CUI");
 		break;
+	case XFS_LI_CUD_RT:
+		printf("CUD_RT");
+		break;
+	case XFS_LI_CUI_RT:
+		printf("CUI_RT");
+		break;
 	case XFS_LI_BUD:
 		printf("BUD");
 		break;
diff --git a/logprint/log_redo.c b/logprint/log_redo.c
index ae6f311f19b..381e819ceb7 100644
--- a/logprint/log_redo.c
+++ b/logprint/log_redo.c
@@ -440,6 +440,7 @@ xlog_print_trans_cui(
 	uint			src_len,
 	int			continued)
 {
+	const char		*item_name = "CUI?";
 	struct xfs_cui_log_format	*src_f, *f = NULL;
 	uint			dst_len;
 	uint			nextents;
@@ -480,8 +481,14 @@ xlog_print_trans_cui(
 		goto error;
 	}
 
-	printf(_("CUI:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
-		f->cui_size, f->cui_nextents, (unsigned long long)f->cui_id);
+	switch (f->cui_type) {
+	case XFS_LI_CUI:	item_name = "CUI"; break;
+	case XFS_LI_CUI_RT:	item_name = "CUI_RT"; break;
+	}
+
+	printf(_("%s:  #regs: %d	num_extents: %d  id: 0x%llx\n"),
+			item_name, f->cui_size, f->cui_nextents,
+			(unsigned long long)f->cui_id);
 
 	if (continued) {
 		printf(_("CUI extent data skipped (CONTINUE set, no space)\n"));
@@ -520,6 +527,7 @@ xlog_print_trans_cud(
 	char				**ptr,
 	uint				len)
 {
+	const char			*item_name = "CUD?";
 	struct xfs_cud_log_format	*f;
 	struct xfs_cud_log_format	lbuf;
 
@@ -528,11 +536,17 @@ xlog_print_trans_cud(
 
 	memcpy(&lbuf, *ptr, min(core_size, len));
 	f = &lbuf;
+
+	switch (f->cud_type) {
+	case XFS_LI_CUD:	item_name = "CUD"; break;
+	case XFS_LI_CUD_RT:	item_name = "CUD_RT"; break;
+	}
+
 	*ptr += len;
 	if (len >= core_size) {
-		printf(_("CUD:  #regs: %d	                 id: 0x%llx\n"),
-			f->cud_size,
-			(unsigned long long)f->cud_cui_id);
+		printf(_("%s:  #regs: %d	                 id: 0x%llx\n"),
+				item_name, f->cud_size,
+				(unsigned long long)f->cud_cui_id);
 
 		/* don't print extents as they are not used */
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 41/42] mkfs: validate CoW extent size hint when rtinherit is set
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (39 preceding siblings ...)
  2023-12-27 13:36   ` [PATCH 40/42] xfs_logprint: report realtime CUIs Darrick J. Wong
@ 2023-12-27 13:36   ` Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 42/42] mkfs: enable reflink on the realtime device Darrick J. Wong
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extent size hints exist to nudge the behavior of the file data block
allocator towards trying to make aligned allocations.  Therefore, it
doesn't make sense to allow a hint that isn't a multiple of the
fundamental allocation unit for a given file.

This means that if the sysadmin is formatting with rtinherit set on the
root dir, validate_cowextsize_hint needs to check the hint value on a
simulated realtime file to make sure that it's correct.  This hasn't
been necessary in the past since one cannot have a CoW hint without a
reflink filesystem, and we previously didn't allow rt reflink
filesystems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/xfs_mkfs.c |   20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)


diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 162546cd1e8..bcdfbf6fbf6 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -2751,6 +2751,26 @@ _("illegal CoW extent size hint %lld, must be less than %u.\n"),
 				min(XFS_MAX_BMBT_EXTLEN, mp->m_sb.sb_agblocks / 2));
 		usage();
 	}
+
+	/*
+	 * If the value is to be passed on to realtime files, revalidate with
+	 * a realtime file so that we know the hint and flag that get passed on
+	 * to realtime files will be correct.
+	 */
+	if (!(cli->fsx.fsx_xflags & FS_XFLAG_RTINHERIT))
+		return;
+
+	fa = libxfs_inode_validate_cowextsize(mp, cli->fsx.fsx_cowextsize,
+			S_IFREG, XFS_DIFLAG_REALTIME, flags2);
+
+	if (fa) {
+		fprintf(stderr,
+_("illegal CoW extent size hint %lld, must be less than %u and a multiple of %u. %p\n"),
+				(long long)cli->fsx.fsx_cowextsize,
+				min(XFS_MAX_BMBT_EXTLEN, mp->m_sb.sb_agblocks / 2),
+				mp->m_sb.sb_rextsize, fa);
+		usage();
+	}
 }
 
 /* Complain if this filesystem is not a supported configuration. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 42/42] mkfs: enable reflink on the realtime device
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                     ` (40 preceding siblings ...)
  2023-12-27 13:36   ` [PATCH 41/42] mkfs: validate CoW extent size hint when rtinherit is set Darrick J. Wong
@ 2023-12-27 13:37   ` Darrick J. Wong
  41 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the creation of filesystems with both reflink and realtime volumes
enabled.  For now we don't support a realtime extent size > 1.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c   |    4 ++--
 mkfs/proto.c    |   36 ++++++++++++++++++++++++++++++++++
 mkfs/xfs_mkfs.c |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 92 insertions(+), 7 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index 6add79121b2..2e0d3b1ce94 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -300,9 +300,9 @@ rtmount_init(
 	if (mp->m_sb.sb_rblocks == 0)
 		return 0;
 
-	if (xfs_has_reflink(mp)) {
+	if (xfs_has_reflink(mp) && mp->m_sb.sb_rextsize > 1) {
 		fprintf(stderr,
-	_("%s: Reflink not compatible with realtime device. Please try a newer xfsprogs.\n"),
+	_("%s: Reflink not compatible with realtime extent size > 1. Please try a newer xfsprogs.\n"),
 				progname);
 		return -1;
 	}
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 436f9ac82b2..5b9f3642486 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -884,6 +884,40 @@ rtrmapbt_create(
 	libxfs_imeta_free_path(path);
 }
 
+/* Create the realtime refcount btree inode. */
+static void
+rtrefcountbt_create(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	int			error;
+
+	error = -libxfs_rtrefcountbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		fail( _("rtrefcount inode path creation failed"), error);
+
+	error = -libxfs_imeta_ensure_dirpath(mp, path);
+	if (error)
+		fail(_("rtgroup allocation failed"),
+				error);
+
+	error = -libxfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		res_failed(error);
+
+	error = -libxfs_rtrefcountbt_create(&upd, &rtg->rtg_refcountip);
+	if (error)
+		fail(_("rtrefcount inode creation failed"), error);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		fail(_("rtrefcountbt commit failed"), error);
+
+	libxfs_imeta_free_path(path);
+}
+
 /* Initialize block headers of rt free space files. */
 static int
 init_rtblock_headers(
@@ -1066,6 +1100,8 @@ rtinit(
 	for_each_rtgroup(mp, rgno, rtg) {
 		if (xfs_has_rtrmapbt(mp))
 			rtrmapbt_create(rtg);
+		if (xfs_has_rtreflink(mp))
+			rtrefcountbt_create(rtg);
 	}
 
 	if (mp->m_sb.sb_rbmblocks == 0)
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index bcdfbf6fbf6..83ab4a6c316 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -2467,12 +2467,36 @@ _("parent pointers not supported on v4 filesystems\n"));
 	}
 
 	if (cli->xi->rt.name) {
-		if (cli->sb_feat.reflink && cli_opt_set(&mopts, M_REFLINK)) {
-			fprintf(stderr,
-_("reflink not supported with realtime devices\n"));
-			usage();
+		if (cli->rtextsize && cli->sb_feat.reflink) {
+			if (cli_opt_set(&mopts, M_REFLINK)) {
+				fprintf(stderr,
+_("reflink not supported on realtime devices with rt extent size specified\n"));
+				usage();
+			}
+			cli->sb_feat.reflink = false;
+		}
+		if (cli->blocksize < XFS_MIN_RTEXTSIZE && cli->sb_feat.reflink) {
+			if (cli_opt_set(&mopts, M_REFLINK)) {
+				fprintf(stderr,
+_("reflink not supported on realtime devices with blocksize %d < %d\n"),
+						cli->blocksize,
+						XFS_MIN_RTEXTSIZE);
+				usage();
+			}
+			cli->sb_feat.reflink = false;
+		}
+		if (!cli->sb_feat.rtgroups && cli->sb_feat.reflink) {
+			if (cli_opt_set(&mopts, M_REFLINK) &&
+			    cli_opt_set(&ropts, R_RTGROUPS)) {
+				fprintf(stderr,
+_("reflink not supported on realtime devices without rtgroups feature\n"));
+				usage();
+			} else if (cli_opt_set(&mopts, M_REFLINK)) {
+				cli->sb_feat.rtgroups = true;
+			} else {
+				cli->sb_feat.reflink = false;
+			}
 		}
-		cli->sb_feat.reflink = false;
 
 		if (!cli->sb_feat.rtgroups && cli->sb_feat.rmapbt) {
 			if (cli_opt_set(&mopts, M_RMAPBT) &&
@@ -2640,6 +2664,19 @@ validate_rtextsize(
 			usage();
 		}
 		cfg->rtextblocks = (xfs_extlen_t)(rtextbytes >> cfg->blocklog);
+	} else if (cli->sb_feat.reflink && cli->xi->rt.name) {
+		/*
+		 * reflink doesn't support rt extent size > 1FSB yet, so set
+		 * an extent size of 1FSB.  Make sure we still satisfy the
+		 * minimum rt extent size.
+		 */
+		if (cfg->blocksize < XFS_MIN_RTEXTSIZE) {
+			fprintf(stderr,
+		_("reflink not supported on rt volume with blocksize %d\n"),
+				cfg->blocksize);
+			usage();
+		}
+		cfg->rtextblocks = 1;
 	} else {
 		/*
 		 * If realtime extsize has not been specified by the user,
@@ -2671,6 +2708,12 @@ validate_rtextsize(
 		}
 	}
 	ASSERT(cfg->rtextblocks);
+
+	if (cli->sb_feat.reflink && cfg->rtblocks > 0 && cfg->rtextblocks > 1) {
+		fprintf(stderr,
+_("reflink not supported on realtime with extent sizes > 1\n"));
+		usage();
+	}
 }
 
 /* Validate the incoming extsize hint. */
@@ -4636,10 +4679,16 @@ check_rt_meta_prealloc(
 		error = -libxfs_imeta_resv_init_inode(rtg->rtg_rmapip, ask);
 		if (error)
 			prealloc_fail(mp, error, ask, _("realtime rmap btree"));
+
+		ask = libxfs_rtrefcountbt_calc_reserves(mp);
+		error = -libxfs_imeta_resv_init_inode(rtg->rtg_refcountip, ask);
+		if (error)
+			prealloc_fail(mp, error, ask, _("realtime refcount btree"));
 	}
 
 	/* Unreserve the realtime metadata reservations. */
 	for_each_rtgroup(mp, rgno, rtg) {
+		libxfs_imeta_resv_free_inode(rtg->rtg_refcountip);
 		libxfs_imeta_resv_free_inode(rtg->rtg_rmapip);
 	}
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/3] xfs: enable extent size hints for CoW when rtextsize > 1
  2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
@ 2023-12-27 13:37   ` Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 2/3] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 3/3] mkfs: enable reflink with realtime extent sizes > 1 Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

CoW extent size hints are not allowed on filesystems that have large
realtime extents because we only want to perform the minimum required
amount of write-around (aka write amplification) for shared extents.

On filesystems where rtextsize > 1, allocations can only be done in
units of full rt extents, which means that we can only map an entire rt
extent's worth of blocks into the data fork.  Hole punch requests become
conversions to unwritten if the request isn't aligned properly.

Because a copy-write fundamentally requires remapping, this means that
we also can only do copy-writes of a full rt extent.  This is too
expensive for large hint sizes, since it's all or nothing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_bmap.c |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)


diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index bc703b13b7e..13bcf146d08 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -6402,6 +6402,28 @@ xfs_get_cowextsz_hint(
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
 	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * For realtime files, the realtime extent is the fundamental
+		 * unit of allocation.  This means that data sharing and CoW
+		 * remapping can only be done in those units.  For filesystems
+		 * where the extent size is larger than one block, write
+		 * requests that are not aligned to an extent boundary employ
+		 * an unshare-around strategy to ensure that all pages for a
+		 * shared extent are fully dirtied.
+		 *
+		 * Because the remapping alignment requirement applies equally
+		 * to all CoW writes, any regular overwrites that could be
+		 * turned (by a speculative CoW preallocation) into a CoW write
+		 * must either employ this dirty-around strategy, or be smart
+		 * enough to ignore the CoW fork mapping unless the entire
+		 * extent is dirty or becomes shared by writeback time.  Doing
+		 * the first would dramatically increase write amplification,
+		 * and the second would require deeper insight into the state
+		 * of the page cache during a writeback request.  For now, we
+		 * ignore the hint.
+		 */
+		if (ip->i_mount->m_sb.sb_rextsize > 1)
+			return ip->i_mount->m_sb.sb_rextsize;
 		b = 0;
 		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
 			b = ip->i_extsize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/3] xfs: fix integer overflow when validating extent size hints
  2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 1/3] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
@ 2023-12-27 13:37   ` Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 3/3] mkfs: enable reflink with realtime extent sizes > 1 Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Both file extent size hints are stored as 32-bit quantities, in units of
filesystem blocks.  As part of validating the hints, we convert these
quantities to bytes to ensure that the hint is congruent with the file's
allocation size.

The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN).
If the file allocation unit is larger than 2048, the unit conversion
will exceed 32 bits in size, which overflows the uint32_t used to store
the value used in the comparison.  This isn't a problem for files on the
data device since the hint will always be a multiple of the block size.
However, this is a problem for realtime files because the rtextent size
can be any integer number of fs blocks, and truncation of upper bits
changes the outcome of division.

Eliminate the overflow by performing the congruency check in units of
blocks, not bytes.  Otherwise, we get errors like this:

$ truncate -s 500T /tmp/a
$ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k
illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_buf.c |   20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)


diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index d9fc254b354..085c128c542 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -767,13 +767,11 @@ xfs_inode_validate_extsize(
 	bool				rt_flag;
 	bool				hint_flag;
 	bool				inherit_flag;
-	uint32_t			extsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
 	inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
-	extsize_bytes = XFS_FSB_TO_B(mp, extsize);
 
 	/*
 	 * This comment describes a historic gap in this verifier function.
@@ -802,9 +800,7 @@ xfs_inode_validate_extsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
 		return __this_address;
@@ -822,7 +818,7 @@ xfs_inode_validate_extsize(
 	if (mode && !(hint_flag || inherit_flag) && extsize != 0)
 		return __this_address;
 
-	if (extsize_bytes % blocksize_bytes)
+	if (extsize % alloc_unit)
 		return __this_address;
 
 	if (extsize > XFS_MAX_BMBT_EXTLEN)
@@ -857,12 +853,10 @@ xfs_inode_validate_cowextsize(
 {
 	bool				rt_flag;
 	bool				hint_flag;
-	uint32_t			cowextsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
-	cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
 
 	/*
 	 * Similar to extent size hints, a directory can be configured to
@@ -877,9 +871,7 @@ xfs_inode_validate_cowextsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if (hint_flag && !xfs_has_reflink(mp))
 		return __this_address;
@@ -894,7 +886,7 @@ xfs_inode_validate_cowextsize(
 	if (mode && !hint_flag && cowextsize != 0)
 		return __this_address;
 
-	if (cowextsize_bytes % blocksize_bytes)
+	if (cowextsize % alloc_unit)
 		return __this_address;
 
 	if (cowextsize > XFS_MAX_BMBT_EXTLEN)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/3] mkfs: enable reflink with realtime extent sizes > 1
  2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 1/3] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 2/3] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
@ 2023-12-27 13:37   ` Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow creation of filesystems with reflink enabled and realtime extent
size larger than 1 block.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c   |    7 -------
 mkfs/xfs_mkfs.c |   37 -------------------------------------
 2 files changed, 44 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index 2e0d3b1ce94..0f626784902 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -300,13 +300,6 @@ rtmount_init(
 	if (mp->m_sb.sb_rblocks == 0)
 		return 0;
 
-	if (xfs_has_reflink(mp) && mp->m_sb.sb_rextsize > 1) {
-		fprintf(stderr,
-	_("%s: Reflink not compatible with realtime extent size > 1. Please try a newer xfsprogs.\n"),
-				progname);
-		return -1;
-	}
-
 	if (mp->m_rtdev_targp->bt_bdev == 0 && !xfs_is_debugger(mp)) {
 		fprintf(stderr, _("%s: filesystem has a realtime subvolume\n"),
 			progname);
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 83ab4a6c316..f2961c4d134 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -2467,24 +2467,6 @@ _("parent pointers not supported on v4 filesystems\n"));
 	}
 
 	if (cli->xi->rt.name) {
-		if (cli->rtextsize && cli->sb_feat.reflink) {
-			if (cli_opt_set(&mopts, M_REFLINK)) {
-				fprintf(stderr,
-_("reflink not supported on realtime devices with rt extent size specified\n"));
-				usage();
-			}
-			cli->sb_feat.reflink = false;
-		}
-		if (cli->blocksize < XFS_MIN_RTEXTSIZE && cli->sb_feat.reflink) {
-			if (cli_opt_set(&mopts, M_REFLINK)) {
-				fprintf(stderr,
-_("reflink not supported on realtime devices with blocksize %d < %d\n"),
-						cli->blocksize,
-						XFS_MIN_RTEXTSIZE);
-				usage();
-			}
-			cli->sb_feat.reflink = false;
-		}
 		if (!cli->sb_feat.rtgroups && cli->sb_feat.reflink) {
 			if (cli_opt_set(&mopts, M_REFLINK) &&
 			    cli_opt_set(&ropts, R_RTGROUPS)) {
@@ -2664,19 +2646,6 @@ validate_rtextsize(
 			usage();
 		}
 		cfg->rtextblocks = (xfs_extlen_t)(rtextbytes >> cfg->blocklog);
-	} else if (cli->sb_feat.reflink && cli->xi->rt.name) {
-		/*
-		 * reflink doesn't support rt extent size > 1FSB yet, so set
-		 * an extent size of 1FSB.  Make sure we still satisfy the
-		 * minimum rt extent size.
-		 */
-		if (cfg->blocksize < XFS_MIN_RTEXTSIZE) {
-			fprintf(stderr,
-		_("reflink not supported on rt volume with blocksize %d\n"),
-				cfg->blocksize);
-			usage();
-		}
-		cfg->rtextblocks = 1;
 	} else {
 		/*
 		 * If realtime extsize has not been specified by the user,
@@ -2708,12 +2677,6 @@ validate_rtextsize(
 		}
 	}
 	ASSERT(cfg->rtextblocks);
-
-	if (cli->sb_feat.reflink && cfg->rtblocks > 0 && cfg->rtextblocks > 1) {
-		fprintf(stderr,
-_("reflink not supported on realtime with extent sizes > 1\n"));
-		usage();
-	}
 }
 
 /* Validate the incoming extsize hint. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] xfs_quota: report warning limits for realtime space quotas
  2023-12-31 19:56 ` [PATCHSET v2.0 17/17] xfsprogs: enable quota for realtime voluems Darrick J. Wong
@ 2023-12-27 13:38   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:38 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report the number of warnings that a user will get for exceeding the
soft limit of a realtime volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xqm.h |    5 ++++-
 quota/state.c |    1 +
 2 files changed, 5 insertions(+), 1 deletion(-)


diff --git a/include/xqm.h b/include/xqm.h
index 573441db986..045af9b67fd 100644
--- a/include/xqm.h
+++ b/include/xqm.h
@@ -184,7 +184,10 @@ struct fs_quota_statv {
 	__s32			qs_rtbtimelimit;/* limit for rt blks timer */
 	__u16			qs_bwarnlimit;	/* limit for num warnings */
 	__u16			qs_iwarnlimit;	/* limit for num warnings */
-	__u64			qs_pad2[8];	/* for future proofing */
+	__u16			qs_rtbwarnlimit;/* limit for rt blks warnings */
+	__u16			qs_pad3;
+	__u32			qs_pad4;
+	__u64			qs_pad2[7];	/* for future proofing */
 };
 
 #endif	/* __XQM_H__ */
diff --git a/quota/state.c b/quota/state.c
index 260ef51db18..43fb700f9a7 100644
--- a/quota/state.c
+++ b/quota/state.c
@@ -244,6 +244,7 @@ state_quotafile_stat(
 	state_warnlimit(fp, XFS_INODE_QUOTA, sv->qs_iwarnlimit);
 
 	state_timelimit(fp, XFS_RTBLOCK_QUOTA, sv->qs_rtbtimelimit);
+	state_warnlimit(fp, XFS_RTBLOCK_QUOTA, sv->qs_rtbwarnlimit);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/11] xfs/122: fix metadirino
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
@ 2023-12-27 13:50   ` Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 02/11] various: fix finding metadata inode numbers when metadir is enabled Darrick J. Wong
                     ` (9 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:50 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Fix xfs/122 to work properly with metadirino.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122.out |    1 +
 1 file changed, 1 insertion(+)


diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 5d14386518..430b805792 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -35,6 +35,7 @@ offsetof(xfs_sb_t, sb_logsunit) = 196
 offsetof(xfs_sb_t, sb_lsn) = 240
 offsetof(xfs_sb_t, sb_magicnum) = 0
 offsetof(xfs_sb_t, sb_meta_uuid) = 248
+offsetof(xfs_sb_t, sb_metadirino) = 264
 offsetof(xfs_sb_t, sb_pquotino) = 232
 offsetof(xfs_sb_t, sb_qflags) = 176
 offsetof(xfs_sb_t, sb_rblocks) = 16


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/11] various: fix finding metadata inode numbers when metadir is enabled
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 01/11] xfs/122: fix metadirino Darrick J. Wong
@ 2023-12-27 13:50   ` Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 03/11] xfs/{030,033,178}: forcibly disable metadata directory trees Darrick J. Wong
                     ` (8 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:50 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

There are a number of tests that use xfs_db to examine the contents of
metadata inodes to check correct functioning.  The logic is scattered
everywhere and won't work with metadata directory trees, so make a
shared helper to find these inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs    |   32 ++++++++++++++++++++++++++++++--
 tests/xfs/007 |   16 +++++++++-------
 tests/xfs/529 |    5 ++---
 tests/xfs/530 |    6 ++----
 tests/xfs/739 |    9 ++-------
 tests/xfs/740 |    9 ++-------
 tests/xfs/741 |    9 ++-------
 tests/xfs/742 |    9 ++-------
 tests/xfs/743 |    9 ++-------
 tests/xfs/744 |    9 ++-------
 tests/xfs/745 |    9 ++-------
 tests/xfs/746 |    9 ++-------
 12 files changed, 59 insertions(+), 72 deletions(-)


diff --git a/common/xfs b/common/xfs
index f77d4639b9..48643b7c18 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1422,7 +1422,7 @@ _scratch_get_bmx_prefix() {
 
 _scratch_get_iext_count()
 {
-	local ino=$1
+	local selector=$1
 	local whichfork=$2
 	local field=""
 
@@ -1437,7 +1437,7 @@ _scratch_get_iext_count()
 			return 1
 	esac
 
-	_scratch_xfs_get_metadata_field $field "inode $ino"
+	_scratch_xfs_get_metadata_field $field "$selector"
 }
 
 #
@@ -1843,3 +1843,31 @@ _require_xfs_parent()
 		|| _notrun "kernel does not support parent pointers"
 	_scratch_unmount
 }
+
+# Find a metadata file within an xfs filesystem.  The sole argument is the
+# name of the field within the superblock.
+_scratch_xfs_find_metafile()
+{
+	local metafile="$1"
+	local selector=
+
+	if ! _check_scratch_xfs_features METADIR > /dev/null; then
+		sb_field="$(_scratch_xfs_get_sb_field "$metafile")"
+		if echo "$sb_field" | grep -q -w 'not found'; then
+			return 1
+		fi
+		selector="inode $sb_field"
+	else
+		case "${metafile}" in
+		"rootino")	selector="path /";;
+		"uquotino")	selector="path -m /quota/user";;
+		"gquotino")	selector="path -m /quota/group";;
+		"pquotino")	selector="path -m /quota/project";;
+		"rbmino")	selector="path -m /realtime/bitmap";;
+		"rsumino")	selector="path -m /realtime/summary";;
+		esac
+	fi
+
+	echo "${selector}"
+	return 0
+}
diff --git a/tests/xfs/007 b/tests/xfs/007
index 4f864100fd..6d6d828b13 100755
--- a/tests/xfs/007
+++ b/tests/xfs/007
@@ -22,6 +22,11 @@ _require_xfs_quota
 _scratch_mkfs_xfs | _filter_mkfs > /dev/null 2> $tmp.mkfs
 . $tmp.mkfs
 
+get_qfile_nblocks() {
+	local selector="$(_scratch_xfs_find_metafile "$1")"
+	_scratch_xfs_db -c "$selector" -c "p core.nblocks"
+}
+
 do_test()
 {
 	qino_1=$1
@@ -31,12 +36,9 @@ do_test()
 	echo "*** umount"
 	_scratch_unmount
 
-	QINO_1=`_scratch_xfs_get_sb_field $qino_1`
-	QINO_2=`_scratch_xfs_get_sb_field $qino_2`
-
 	echo "*** Usage before quotarm ***"
-	_scratch_xfs_db -c "inode $QINO_1" -c "p core.nblocks"
-	_scratch_xfs_db -c "inode $QINO_2" -c "p core.nblocks"
+	get_qfile_nblocks $qino_1
+	get_qfile_nblocks $qino_2
 
 	_qmount
 	echo "*** turn off $off_opts quotas"
@@ -66,8 +68,8 @@ do_test()
 	_scratch_unmount
 
 	echo "*** Usage after quotarm ***"
-	_scratch_xfs_db -c "inode $QINO_1" -c "p core.nblocks"
-	_scratch_xfs_db -c "inode $QINO_2" -c "p core.nblocks"
+	get_qfile_nblocks $qino_1
+	get_qfile_nblocks $qino_2
 }
 
 # Test user and group first
diff --git a/tests/xfs/529 b/tests/xfs/529
index cd176877f5..43199a37ff 100755
--- a/tests/xfs/529
+++ b/tests/xfs/529
@@ -163,9 +163,8 @@ done
 _scratch_unmount >> $seqres.full
 
 echo "Verify uquota inode's extent count"
-uquotino=$(_scratch_xfs_get_metadata_field 'uquotino' 'sb 0')
-
-nextents=$(_scratch_get_iext_count $uquotino data || \
+selector="$(_scratch_xfs_find_metafile uquotino)"
+nextents=$(_scratch_get_iext_count "$selector" data || \
 		   _fail "Unable to obtain inode fork's extent count")
 if (( $nextents > 10 )); then
 	echo "Extent count overflow check failed: nextents = $nextents"
diff --git a/tests/xfs/530 b/tests/xfs/530
index 56f5e7ebdb..cb8c2e3978 100755
--- a/tests/xfs/530
+++ b/tests/xfs/530
@@ -104,10 +104,8 @@ _scratch_unmount >> $seqres.full
 
 echo "Verify rbmino's and rsumino's extent count"
 for rtino in rbmino rsumino; do
-	ino=$(_scratch_xfs_get_metadata_field $rtino "sb 0")
-	echo "$rtino = $ino" >> $seqres.full
-
-	nextents=$(_scratch_get_iext_count $ino data || \
+	selector="$(_scratch_xfs_find_metafile "$rtino")"
+	nextents=$(_scratch_get_iext_count "$selector" data || \
 			_fail "Unable to obtain inode fork's extent count")
 	if (( $nextents > 10 )); then
 		echo "Extent count overflow check failed: nextents = $nextents"
diff --git a/tests/xfs/739 b/tests/xfs/739
index fb7fe2e643..b6bbaf7c1d 100755
--- a/tests/xfs/739
+++ b/tests/xfs/739
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtbitmap"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.bitmap')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.bitmap')
-else
-	path=('sb' 'addr rbmino')
-fi
-_scratch_xfs_fuzz_metadata '' 'online' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rbmino)"
+_scratch_xfs_fuzz_metadata '' 'online' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtbitmap"
 
 # success, all done
diff --git a/tests/xfs/740 b/tests/xfs/740
index a59fa37ecf..7cee2b110f 100755
--- a/tests/xfs/740
+++ b/tests/xfs/740
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtsummary"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.summary')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.summary')
-else
-	path=('sb' 'addr rsumino')
-fi
-_scratch_xfs_fuzz_metadata '' 'online' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rsumino)"
+_scratch_xfs_fuzz_metadata '' 'online' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtsummary"
 
 # success, all done
diff --git a/tests/xfs/741 b/tests/xfs/741
index 957bed791b..1d18f4d9e4 100755
--- a/tests/xfs/741
+++ b/tests/xfs/741
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtbitmap"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.bitmap')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.bitmap')
-else
-	path=('sb' 'addr rbmino')
-fi
-_scratch_xfs_fuzz_metadata '' 'offline' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rbmino)"
+_scratch_xfs_fuzz_metadata '' 'offline' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtbitmap"
 
 # success, all done
diff --git a/tests/xfs/742 b/tests/xfs/742
index d911748443..96ef89275c 100755
--- a/tests/xfs/742
+++ b/tests/xfs/742
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtsummary"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.summary')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.summary')
-else
-	path=('sb' 'addr rsumino')
-fi
-_scratch_xfs_fuzz_metadata '' 'offline' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rsumino)"
+_scratch_xfs_fuzz_metadata '' 'offline' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtsummary"
 
 # success, all done
diff --git a/tests/xfs/743 b/tests/xfs/743
index 69e865a438..29f21e32c6 100755
--- a/tests/xfs/743
+++ b/tests/xfs/743
@@ -28,13 +28,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtbitmap"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.bitmap')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.bitmap')
-else
-	path=('sb' 'addr rbmino')
-fi
-_scratch_xfs_fuzz_metadata '' 'both' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rbmino)"
+_scratch_xfs_fuzz_metadata '' 'both' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtbitmap"
 
 # success, all done
diff --git a/tests/xfs/744 b/tests/xfs/744
index ea490b524e..72ae8f8593 100755
--- a/tests/xfs/744
+++ b/tests/xfs/744
@@ -28,13 +28,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtsummary"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.summary')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.summary')
-else
-	path=('sb' 'addr rsumino')
-fi
-_scratch_xfs_fuzz_metadata '' 'both' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rsumino)"
+_scratch_xfs_fuzz_metadata '' 'both' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtsummary"
 
 # success, all done
diff --git a/tests/xfs/745 b/tests/xfs/745
index 7621d45744..caecaa4edd 100755
--- a/tests/xfs/745
+++ b/tests/xfs/745
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtbitmap"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.bitmap')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.bitmap')
-else
-	path=('sb' 'addr rbmino')
-fi
-_scratch_xfs_fuzz_metadata '' 'none' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rbmino)"
+_scratch_xfs_fuzz_metadata '' 'none' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtbitmap"
 
 # success, all done
diff --git a/tests/xfs/746 b/tests/xfs/746
index 401233162c..f005893965 100755
--- a/tests/xfs/746
+++ b/tests/xfs/746
@@ -27,13 +27,8 @@ echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
 echo "Fuzz rtsummary"
-is_metadir=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime/0.summary')
-if [ -n "$is_metadir" ]; then
-	path=('path -m /realtime/0.summary')
-else
-	path=('sb' 'addr rsumino')
-fi
-_scratch_xfs_fuzz_metadata '' 'none' "${path[@]}" 'dblock 0' >> $seqres.full
+path="$(_scratch_xfs_find_metafile rsumino)"
+_scratch_xfs_fuzz_metadata '' 'none' "$path" 'dblock 0' >> $seqres.full
 echo "Done fuzzing rtsummary"
 
 # success, all done


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/11] xfs/{030,033,178}: forcibly disable metadata directory trees
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 01/11] xfs/122: fix metadirino Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 02/11] various: fix finding metadata inode numbers when metadir is enabled Darrick J. Wong
@ 2023-12-27 13:50   ` Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 04/11] common/repair: patch up repair sb inode value complaints Darrick J. Wong
                     ` (7 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:50 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The golden output for thests tests encode the xfs_repair output when we
fuzz various parts of the filesystem.  With metadata directory trees
enabled, however, the golden output changes dramatically to reflect
reconstruction of the metadata directory tree.

To avoid regressions, add a helper to force metadata directories off via
MKFS_OPTIONS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs    |   13 +++++++++++++
 tests/xfs/030 |    1 +
 tests/xfs/033 |    1 +
 tests/xfs/178 |    1 +
 4 files changed, 16 insertions(+)


diff --git a/common/xfs b/common/xfs
index 48643b7c18..007c8704ce 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1871,3 +1871,16 @@ _scratch_xfs_find_metafile()
 	echo "${selector}"
 	return 0
 }
+
+# Force metadata directories off.
+_scratch_xfs_force_no_metadir()
+{
+	if echo "$MKFS_OPTIONS" | grep -q 'metadir='; then
+		MKFS_OPTIONS="$(echo "$MKFS_OPTIONS" | sed -e 's/metadir=\([01]\)/metadir=0/g')"
+		return
+	fi
+
+	if grep -q 'metadir=' $MKFS_XFS_PROG; then
+		MKFS_OPTIONS="-m metadir=0 $MKFS_OPTIONS"
+	fi
+}
diff --git a/tests/xfs/030 b/tests/xfs/030
index 201a901579..a62ea4fab3 100755
--- a/tests/xfs/030
+++ b/tests/xfs/030
@@ -50,6 +50,7 @@ _supported_fs xfs
 
 _require_scratch
 _require_no_large_scratch_dev
+_scratch_xfs_force_no_metadir
 
 DSIZE="-dsize=100m,agcount=6"
 
diff --git a/tests/xfs/033 b/tests/xfs/033
index ef5dc4fa36..e886c15082 100755
--- a/tests/xfs/033
+++ b/tests/xfs/033
@@ -53,6 +53,7 @@ _supported_fs xfs
 
 _require_scratch
 _require_no_large_scratch_dev
+_scratch_xfs_force_no_metadir
 
 # devzero blows away 512byte blocks, so make 512byte inodes (at least)
 _scratch_mkfs_xfs | _filter_mkfs 2>$tmp.mkfs >/dev/null
diff --git a/tests/xfs/178 b/tests/xfs/178
index fee1e92bf3..4e39cc364c 100755
--- a/tests/xfs/178
+++ b/tests/xfs/178
@@ -52,6 +52,7 @@ _supported_fs xfs
 #             fix filesystem, new mkfs.xfs will be fine.
 
 _require_scratch
+_scratch_xfs_force_no_metadir
 _scratch_mkfs_xfs | _filter_mkfs 2>$tmp.mkfs
 test "${PIPESTATUS[0]}" -eq 0 || _fail "mkfs failed!"
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/11] common/repair: patch up repair sb inode value complaints
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:50   ` [PATCH 03/11] xfs/{030,033,178}: forcibly disable metadata directory trees Darrick J. Wong
@ 2023-12-27 13:50   ` Darrick J. Wong
  2023-12-27 13:51   ` [PATCH 05/11] xfs/206: update for metadata directory support Darrick J. Wong
                     ` (6 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:50 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Now that we've refactored xfs_repair to be more consistent in how it
reports unexpected superblock inode pointer values, we have to fix up
the fstests repair filters to emulate the old golden output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/repair |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/common/repair b/common/repair
index 8945d0028c..c3afcfb3e6 100644
--- a/common/repair
+++ b/common/repair
@@ -28,6 +28,10 @@ _filter_repair()
 	perl -ne '
 # for sb
 /- agno = / && next;	# remove each AG line (variable number)
+s/realtime bitmap inode pointer/realtime bitmap ino pointer/;
+s/sb realtime bitmap inode value/sb realtime bitmap inode/;
+s/realtime summary inode pointer/realtime summary ino pointer/;
+s/sb realtime summary inode value/sb realtime summary inode/;
 s/(pointer to) (\d+)/\1 INO/;
 # Changed inode output in 5.5.0
 s/sb root inode value /sb root inode /;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/11] xfs/206: update for metadata directory support
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:50   ` [PATCH 04/11] common/repair: patch up repair sb inode value complaints Darrick J. Wong
@ 2023-12-27 13:51   ` Darrick J. Wong
  2023-12-27 13:51   ` [PATCH 06/11] xfs/{050,144,153,299,330}: update quota reports to handle metadir trees Darrick J. Wong
                     ` (5 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:51 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Filter 'metadir=' out of the golden output so that metadata directories
don't cause this test to regress.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/206 |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/206 b/tests/xfs/206
index f7f12ff1f9..9fd9400daf 100755
--- a/tests/xfs/206
+++ b/tests/xfs/206
@@ -65,7 +65,8 @@ mkfs_filter()
 	    -e "s/, lazy-count=[0-9]//" \
 	    -e "/.*crc=/d" \
 	    -e "/^Default configuration/d" \
-	    -e 's/, parent=[01]//'
+	    -e 's/, parent=[01]//' \
+	    -e '/metadir=.*/d'
 }
 
 # mkfs slightly smaller than that, small log for speed.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/11] xfs/{050,144,153,299,330}: update quota reports to handle metadir trees
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:51   ` [PATCH 05/11] xfs/206: update for metadata directory support Darrick J. Wong
@ 2023-12-27 13:51   ` Darrick J. Wong
  2023-12-27 13:51   ` [PATCH 07/11] xfs/856: add metadir upgrade to test matrix Darrick J. Wong
                     ` (4 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:51 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Prior to the new metadir feature in XFS, the rtbitmap and rtsummary
files were included in icount, though their bcount contribution is zero
due to rt and quota not being supported together.  With the new metadir
feature in XFS, no files in the metadata directory tree are counted in
quota.

Hence we must adjust the icount of any quota report down by two to avoid
breaking golden outputs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/filter |    7 +++++--
 common/xfs    |   29 +++++++++++++++++++++++++++++
 tests/xfs/050 |    5 +++++
 tests/xfs/153 |    5 +++++
 tests/xfs/299 |    1 +
 tests/xfs/330 |    6 +++++-
 6 files changed, 50 insertions(+), 3 deletions(-)


diff --git a/common/filter b/common/filter
index 509ee95039..904263b3e7 100644
--- a/common/filter
+++ b/common/filter
@@ -618,11 +618,14 @@ _filter_getcap()
 
 # Filter user/group/project id numbers out of quota reports, and standardize
 # the block counts to use filesystem block size.  Callers must set the id and
-# bsize variables before calling this function.
+# bsize variables before calling this function.  The HIDDEN_QUOTA_FILES variable
+# (by default zero) is the number of root files to filter out of the inode
+# count part of the quota report.
 _filter_quota_report()
 {
 	test -n "$id" || echo "id must be set"
 	test -n "$bsize" || echo "block size must be set"
+	test -n "$HIDDEN_QUOTA_FILES" || HIDDEN_QUOTA_FILES=0
 
 	tr -s '[:space:]' | \
 	perl -npe '
@@ -630,7 +633,7 @@ _filter_quota_report()
 		s/^\#0 \d+ /[ROOT] 0 /g;
 		s/6 days/7 days/g' |
 	perl -npe '
-		$val = 0;
+		$val = '"$HIDDEN_QUOTA_FILES"';
 		if ($ENV{'LARGE_SCRATCH_DEV'}) {
 			$val = $ENV{'NUM_SPACE_FILES'};
 		}
diff --git a/common/xfs b/common/xfs
index 007c8704ce..c6a60119f9 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1884,3 +1884,32 @@ _scratch_xfs_force_no_metadir()
 		MKFS_OPTIONS="-m metadir=0 $MKFS_OPTIONS"
 	fi
 }
+
+# Decide if a mount filesystem has metadata directory trees.
+_xfs_mount_has_metadir() {
+	local mount="$1"
+
+	# spaceman (and its info command) predate metadir
+	test ! -e "$XFS_SPACEMAN_PROG" && return 1
+	$XFS_SPACEMAN_PROG -c "info" "$mount" | grep -q 'metadir=1'
+}
+
+# Compute the number of files that are not counted in quotas.
+_xfs_calc_hidden_quota_files() {
+	local mount="$1"
+
+	if _xfs_mount_has_metadir "$mount"; then
+		# Prior to the metadir feature, the realtime bitmap and summary
+		# file were "owned" by root and hence accounted to the root
+		# dquots.  The metadata directory feature stopped accounting
+		# metadata files to quotas, so we must subtract 2 inodes from
+		# the repquota golden outputs to keep the tests going.
+		#
+		# We needn't adjust the block counts because the kernel doesn't
+		# support rt quota and hence the rt metadata files will always
+		# be zero length.
+		echo -2
+	else
+		echo 0
+	fi
+}
diff --git a/tests/xfs/050 b/tests/xfs/050
index 2220e47016..10294e3f6d 100755
--- a/tests/xfs/050
+++ b/tests/xfs/050
@@ -32,9 +32,14 @@ _require_scratch
 _require_xfs_quota
 
 _scratch_mkfs >/dev/null 2>&1
+orig_mntopts="$MOUNT_OPTIONS"
+_qmount_option "uquota"
 _scratch_mount
 bsize=$(_get_file_block_size $SCRATCH_MNT)
+# needs quota enabled to compute the number of metadata dir files
+HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
 _scratch_unmount
+MOUNT_OPTIONS="$orig_mntopts"
 
 bsoft=$(( 200 * $bsize ))
 bhard=$(( 1000 * $bsize ))
diff --git a/tests/xfs/153 b/tests/xfs/153
index dbe26b6803..9def579bba 100755
--- a/tests/xfs/153
+++ b/tests/xfs/153
@@ -37,9 +37,14 @@ _require_idmapped_mounts
 _require_test_program "vfs/mount-idmapped"
 
 _scratch_mkfs >/dev/null 2>&1
+orig_mntopts="$MOUNT_OPTIONS"
+_qmount_option "uquota"
 _scratch_mount
 bsize=$(_get_file_block_size $SCRATCH_MNT)
+# needs quota enabled to compute the number of metadata dir files
+HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
 _scratch_unmount
+MOUNT_OPTIONS="$orig_mntopts"
 
 bsoft=$(( 200 * $bsize ))
 bhard=$(( 1000 * $bsize ))
diff --git a/tests/xfs/299 b/tests/xfs/299
index 4b9df3c6aa..49a6527255 100755
--- a/tests/xfs/299
+++ b/tests/xfs/299
@@ -159,6 +159,7 @@ _qmount_option "uquota,gquota,pquota"
 _qmount
 
 bsize=$(_get_file_block_size $SCRATCH_MNT)
+HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
 
 bsoft=$(( 100 * $bsize ))
 bhard=$(( 500 * $bsize ))
diff --git a/tests/xfs/330 b/tests/xfs/330
index c6e74e67e8..7ebbffff2a 100755
--- a/tests/xfs/330
+++ b/tests/xfs/330
@@ -26,7 +26,10 @@ _require_nobody
 
 do_repquota()
 {
-	repquota $SCRATCH_MNT | grep -E '^(fsgqa|root|nobody)' | sort -r
+	repquota $SCRATCH_MNT | grep -E '^(fsgqa|root|nobody)' | sort -r | \
+	perl -npe '
+		$val = '"$HIDDEN_QUOTA_FILES"';
+		s/(^root\s+--\s+\S+\s+\S+\s+\S+\s+)(\S+)/$1@{[$2 - $val]}/g'
 }
 
 rm -f "$seqres.full"
@@ -35,6 +38,7 @@ echo "Format and mount"
 _scratch_mkfs > "$seqres.full" 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> "$seqres.full" 2>&1
+HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/11] xfs/856: add metadir upgrade to test matrix
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:51   ` [PATCH 06/11] xfs/{050,144,153,299,330}: update quota reports to handle metadir trees Darrick J. Wong
@ 2023-12-27 13:51   ` Darrick J. Wong
  2023-12-27 13:51   ` [PATCH 08/11] xfs/509: adjust inumbers accounting for metadata directories Darrick J. Wong
                     ` (3 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:51 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Add metadata directory trees to the features that this test will try to
upgrade.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1856 |    2 ++
 1 file changed, 2 insertions(+)


diff --git a/tests/xfs/1856 b/tests/xfs/1856
index 84e72d7c81..93bdbaa531 100755
--- a/tests/xfs/1856
+++ b/tests/xfs/1856
@@ -173,12 +173,14 @@ if rt_configured; then
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade inobtcount && FEATURES+=("inobtcount")
 	check_repair_upgrade bigtime && FEATURES+=("bigtime")
+	check_repair_upgrade metadir && FEATURES+=("metadir")
 else
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade rmapbt && FEATURES+=("rmapbt")
 	check_repair_upgrade reflink && FEATURES+=("reflink")
 	check_repair_upgrade inobtcount && FEATURES+=("inobtcount")
 	check_repair_upgrade bigtime && FEATURES+=("bigtime")
+	check_repair_upgrade metadir && FEATURES+=("metadir")
 fi
 
 test "${#FEATURES[@]}" -eq 0 && \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/11] xfs/509: adjust inumbers accounting for metadata directories
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:51   ` [PATCH 07/11] xfs/856: add metadir upgrade to test matrix Darrick J. Wong
@ 2023-12-27 13:51   ` Darrick J. Wong
  2023-12-27 13:52   ` [PATCH 09/11] xfs: create fuzz tests " Darrick J. Wong
                     ` (2 subsequent siblings)
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:51 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The INUMBERS ioctl exports data from the inode btree directly -- the
number of inodes it reports is taken from ir_freemask and includes all
the files in the metadata directory tree.  BULKSTAT, on the other hand,
only reports non-metadata files.  When metadir is enabled, this will
(eventually) cause a discrepancy in the inode counts that is large
enough to exceed the tolerances, thereby causing a test failure.

Correct this by counting the files in the metadata directory and
subtracting that from the INUMBERS totals.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/509 |   21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)


diff --git a/tests/xfs/509 b/tests/xfs/509
index d04dfbbfba..b87abef964 100755
--- a/tests/xfs/509
+++ b/tests/xfs/509
@@ -91,13 +91,13 @@ inumbers_count()
 	bstat_versions | while read v_tag v_flag; do
 		echo -n "inumbers all($v_tag): "
 		nr=$(inumbers_fs $SCRATCH_MNT $v_flag)
-		_within_tolerance "inumbers" $nr $expect $tolerance -v
+		_within_tolerance "inumbers" $((nr - METADATA_FILES)) $expect $tolerance -v
 
 		local agcount=$(_xfs_mount_agcount $SCRATCH_MNT)
 		for batchsize in 71 2 1; do
 			echo -n "inumbers $batchsize($v_tag): "
 			nr=$(inumbers_ag $agcount $batchsize $SCRATCH_MNT $v_flag)
-			_within_tolerance "inumbers" $nr $expect $tolerance -v
+			_within_tolerance "inumbers" $((nr - METADATA_FILES)) $expect $tolerance -v
 		done
 	done
 }
@@ -143,9 +143,26 @@ _supported_fs xfs
 DIRCOUNT=8
 INOCOUNT=$((2048 / DIRCOUNT))
 
+# Count everything in the metadata directory tree.
+count_metadir_files() {
+	local metadirs=('/realtime' '/quota')
+	local db_args=('-f')
+
+	for m in "${metadirs[@]}"; do
+		db_args+=('-c' "ls -m $m")
+	done
+
+	local ret=$(_scratch_xfs_db "${db_args[@]}" 2>/dev/null | grep regular | wc -l)
+	test -z "$ret" && ret=0
+	echo $ret
+}
+
 _scratch_mkfs "-d agcount=$DIRCOUNT" >> $seqres.full 2>&1 || _fail "mkfs failed"
 _scratch_mount
 
+METADATA_FILES=$(count_metadir_files)
+echo "found $METADATA_FILES metadata files" >> $seqres.full
+
 # Figure out if we have v5 bulkstat/inumbers ioctls.
 has_v5=
 bs_root_out="$($XFS_IO_PROG -c 'bulkstat_single root' $SCRATCH_MNT 2>>$seqres.full)"


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/11] xfs: create fuzz tests for metadata directories
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:51   ` [PATCH 08/11] xfs/509: adjust inumbers accounting for metadata directories Darrick J. Wong
@ 2023-12-27 13:52   ` Darrick J. Wong
  2023-12-27 13:52   ` [PATCH 10/11] xfs: baseline golden output for metadata directory fuzz tests Darrick J. Wong
  2023-12-27 13:52   ` [PATCH 11/11] xfs: test metapath repairs Darrick J. Wong
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:52 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Create fuzz tests to make sure that all the validation works for
metadata directories and subdirectories.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs         |   22 ++++++++++++++++++++++
 tests/xfs/1546     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1546.out |    4 ++++
 tests/xfs/1547     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1547.out |    4 ++++
 tests/xfs/1548     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1548.out |    4 ++++
 tests/xfs/1549     |   38 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1549.out |    4 ++++
 tests/xfs/1550     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1550.out |    4 ++++
 tests/xfs/1551     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1551.out |    4 ++++
 tests/xfs/1552     |   37 +++++++++++++++++++++++++++++++++++++
 tests/xfs/1552.out |    4 ++++
 tests/xfs/1553     |   38 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1553.out |    4 ++++
 17 files changed, 352 insertions(+)
 create mode 100755 tests/xfs/1546
 create mode 100644 tests/xfs/1546.out
 create mode 100755 tests/xfs/1547
 create mode 100644 tests/xfs/1547.out
 create mode 100755 tests/xfs/1548
 create mode 100644 tests/xfs/1548.out
 create mode 100755 tests/xfs/1549
 create mode 100644 tests/xfs/1549.out
 create mode 100755 tests/xfs/1550
 create mode 100644 tests/xfs/1550.out
 create mode 100755 tests/xfs/1551
 create mode 100644 tests/xfs/1551.out
 create mode 100755 tests/xfs/1552
 create mode 100644 tests/xfs/1552.out
 create mode 100755 tests/xfs/1553
 create mode 100644 tests/xfs/1553.out


diff --git a/common/xfs b/common/xfs
index c6a60119f9..b88491666d 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1913,3 +1913,25 @@ _xfs_calc_hidden_quota_files() {
 		echo 0
 	fi
 }
+
+_require_xfs_mkfs_metadir()
+{
+	_scratch_mkfs_xfs_supported -m metadir=1 >/dev/null 2>&1 || \
+		_notrun "mkfs.xfs doesn't have metadir features"
+}
+
+_require_xfs_scratch_metadir()
+{
+	_require_xfs_mkfs_metadir
+	_require_scratch
+
+	_scratch_mkfs -m metadir=1 &> /dev/null
+	_require_scratch_xfs_features METADIR
+	_try_scratch_mount
+	res=$?
+	if [ $res -ne 0 ]; then
+		_notrun "mounting with metadir not supported by filesystem type: $FSTYP"
+	else
+		_scratch_unmount
+	fi
+}
diff --git a/tests/xfs/1546 b/tests/xfs/1546
new file mode 100755
index 0000000000..a370eb7abf
--- /dev/null
+++ b/tests/xfs/1546
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1546
+#
+# Populate a XFS filesystem and fuzz every metadir root field.
+# Use xfs_scrub to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_online_repair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /')
+
+echo "Fuzz metadir root"
+_scratch_xfs_fuzz_metadata '' 'online' 'path -m /' >> $seqres.full
+echo "Done fuzzing metadir root"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1546.out b/tests/xfs/1546.out
new file mode 100644
index 0000000000..b72891a758
--- /dev/null
+++ b/tests/xfs/1546.out
@@ -0,0 +1,4 @@
+QA output created by 1546
+Format and populate
+Fuzz metadir root
+Done fuzzing metadir root
diff --git a/tests/xfs/1547 b/tests/xfs/1547
new file mode 100755
index 0000000000..0083924b7b
--- /dev/null
+++ b/tests/xfs/1547
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1547
+#
+# Populate a XFS filesystem and fuzz every metadir root field.
+# Use xfs_repair to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_repair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /')
+
+echo "Fuzz metadir root"
+_scratch_xfs_fuzz_metadata '' 'offline' 'path -m /' >> $seqres.full
+echo "Done fuzzing metadir root"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1547.out b/tests/xfs/1547.out
new file mode 100644
index 0000000000..983cc01343
--- /dev/null
+++ b/tests/xfs/1547.out
@@ -0,0 +1,4 @@
+QA output created by 1547
+Format and populate
+Fuzz metadir root
+Done fuzzing metadir root
diff --git a/tests/xfs/1548 b/tests/xfs/1548
new file mode 100755
index 0000000000..d81c524b07
--- /dev/null
+++ b/tests/xfs/1548
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1548
+#
+# Populate a XFS filesystem and fuzz every metadir root field.
+# Do not fix the filesystem, to test metadata verifiers.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_norepair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /')
+
+echo "Fuzz metadir root"
+_scratch_xfs_fuzz_metadata '' 'none' 'path -m /' >> $seqres.full
+echo "Done fuzzing metadir root"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1548.out b/tests/xfs/1548.out
new file mode 100644
index 0000000000..9e395bb059
--- /dev/null
+++ b/tests/xfs/1548.out
@@ -0,0 +1,4 @@
+QA output created by 1548
+Format and populate
+Fuzz metadir root
+Done fuzzing metadir root
diff --git a/tests/xfs/1549 b/tests/xfs/1549
new file mode 100755
index 0000000000..487c73e9e9
--- /dev/null
+++ b/tests/xfs/1549
@@ -0,0 +1,38 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1549
+#
+# Populate a XFS filesystem and fuzz every metadir root field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /')
+
+echo "Fuzz metadir root"
+_scratch_xfs_fuzz_metadata '' 'both' 'path -m /' >> $seqres.full
+echo "Done fuzzing metadir root"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1549.out b/tests/xfs/1549.out
new file mode 100644
index 0000000000..22b3d215e3
--- /dev/null
+++ b/tests/xfs/1549.out
@@ -0,0 +1,4 @@
+QA output created by 1549
+Format and populate
+Fuzz metadir root
+Done fuzzing metadir root
diff --git a/tests/xfs/1550 b/tests/xfs/1550
new file mode 100755
index 0000000000..8b5b42710c
--- /dev/null
+++ b/tests/xfs/1550
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1550
+#
+# Populate a XFS filesystem and fuzz every metadir subdir field.
+# Use xfs_scrub to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_online_repair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime')
+
+echo "Fuzz metadir subdir"
+_scratch_xfs_fuzz_metadata '' 'online' 'path -m /realtime' >> $seqres.full
+echo "Done fuzzing metadir subdir"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1550.out b/tests/xfs/1550.out
new file mode 100644
index 0000000000..7694cd670b
--- /dev/null
+++ b/tests/xfs/1550.out
@@ -0,0 +1,4 @@
+QA output created by 1550
+Format and populate
+Fuzz metadir subdir
+Done fuzzing metadir subdir
diff --git a/tests/xfs/1551 b/tests/xfs/1551
new file mode 100755
index 0000000000..05b0328ef0
--- /dev/null
+++ b/tests/xfs/1551
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1551
+#
+# Populate a XFS filesystem and fuzz every metadir subdir field.
+# Use xfs_repair to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_repair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime')
+
+echo "Fuzz metadir subdir"
+_scratch_xfs_fuzz_metadata '' 'offline' 'path -m /realtime' >> $seqres.full
+echo "Done fuzzing metadir subdir"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1551.out b/tests/xfs/1551.out
new file mode 100644
index 0000000000..4c3360d08b
--- /dev/null
+++ b/tests/xfs/1551.out
@@ -0,0 +1,4 @@
+QA output created by 1551
+Format and populate
+Fuzz metadir subdir
+Done fuzzing metadir subdir
diff --git a/tests/xfs/1552 b/tests/xfs/1552
new file mode 100755
index 0000000000..0870837f9c
--- /dev/null
+++ b/tests/xfs/1552
@@ -0,0 +1,37 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1552
+#
+# Populate a XFS filesystem and fuzz every metadir subdir field.
+# Do not fix the filesystem, to test metadata verifiers.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_norepair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime')
+
+echo "Fuzz metadir subdir"
+_scratch_xfs_fuzz_metadata '' 'none' 'path -m /realtime' >> $seqres.full
+echo "Done fuzzing metadir subdir"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1552.out b/tests/xfs/1552.out
new file mode 100644
index 0000000000..6636b1b656
--- /dev/null
+++ b/tests/xfs/1552.out
@@ -0,0 +1,4 @@
+QA output created by 1552
+Format and populate
+Fuzz metadir subdir
+Done fuzzing metadir subdir
diff --git a/tests/xfs/1553 b/tests/xfs/1553
new file mode 100755
index 0000000000..ab15e53e27
--- /dev/null
+++ b/tests/xfs/1553
@@ -0,0 +1,38 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1553
+#
+# Populate a XFS filesystem and fuzz every metadir subdir field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_scratch_metadir
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'path -m /realtime')
+
+echo "Fuzz metadir subdir"
+_scratch_xfs_fuzz_metadata '' 'both' 'path -m /realtime' >> $seqres.full
+echo "Done fuzzing metadir subdir"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1553.out b/tests/xfs/1553.out
new file mode 100644
index 0000000000..0298fcfddb
--- /dev/null
+++ b/tests/xfs/1553.out
@@ -0,0 +1,4 @@
+QA output created by 1553
+Format and populate
+Fuzz metadir subdir
+Done fuzzing metadir subdir


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/11] xfs: baseline golden output for metadata directory fuzz tests
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 13:52   ` [PATCH 09/11] xfs: create fuzz tests " Darrick J. Wong
@ 2023-12-27 13:52   ` Darrick J. Wong
  2023-12-27 13:52   ` [PATCH 11/11] xfs: test metapath repairs Darrick J. Wong
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:52 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Record all the currently known failures of the xfs_scrub check and
repair code for the metadata directory fuzz tests.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1546.out |  312 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1547.out |   32 ++++
 tests/xfs/1548.out |  314 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1549.out |  326 +++++++++++++++++++++++++++++++++++++++
 tests/xfs/1550.out |  430 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1551.out |   31 ++++
 tests/xfs/1552.out |  339 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1553.out |  366 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2150 insertions(+)


diff --git a/tests/xfs/1546.out b/tests/xfs/1546.out
index b72891a758..cdfffd1609 100644
--- a/tests/xfs/1546.out
+++ b/tests/xfs/1546.out
@@ -1,4 +1,316 @@
 QA output created by 1546
 Format and populate
 Fuzz metadir root
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: online scrub didn't fail.
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: online scrub didn't fail.
+core.aformat = zeroes: offline re-scrub failed (1).
+core.aformat = zeroes: offline post-mod scrub failed (1).
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = add: offline re-scrub failed (1).
+next_unlinked = add: offline post-mod scrub failed (1).
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = middlebit: offline re-scrub failed (1).
+v3.flags2 = middlebit: offline post-mod scrub failed (1).
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].name = zeroes: online repair failed (4).
+u3.sfdir3.list[1].name = zeroes: online re-scrub failed (4).
+u3.sfdir3.list[1].name = zeroes: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = zeroes: pre-mod mount failed (32).
+u3.sfdir3.list[1].name = ones: online repair failed (4).
+u3.sfdir3.list[1].name = ones: online re-scrub failed (4).
+u3.sfdir3.list[1].name = ones: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = ones: pre-mod mount failed (32).
+u3.sfdir3.list[1].name = firstbit: online repair failed (4).
+u3.sfdir3.list[1].name = firstbit: online re-scrub failed (4).
+u3.sfdir3.list[1].name = firstbit: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = firstbit: pre-mod mount failed (32).
+u3.sfdir3.list[1].name = lastbit: online repair failed (4).
+u3.sfdir3.list[1].name = lastbit: online re-scrub failed (4).
+u3.sfdir3.list[1].name = lastbit: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = lastbit: pre-mod mount failed (32).
+u3.sfdir3.list[1].name = add: online repair failed (4).
+u3.sfdir3.list[1].name = add: online re-scrub failed (4).
+u3.sfdir3.list[1].name = add: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = add: pre-mod mount failed (32).
+u3.sfdir3.list[1].name = sub: online repair failed (4).
+u3.sfdir3.list[1].name = sub: online re-scrub failed (4).
+u3.sfdir3.list[1].name = sub: offline re-scrub failed (1).
+u3.sfdir3.list[1].name = sub: pre-mod mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
 Done fuzzing metadir root
diff --git a/tests/xfs/1547.out b/tests/xfs/1547.out
index 983cc01343..662d04321f 100644
--- a/tests/xfs/1547.out
+++ b/tests/xfs/1547.out
@@ -1,4 +1,36 @@
 QA output created by 1547
 Format and populate
 Fuzz metadir root
+core.forkoff = middlebit: offline scrub didn't fail.
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = middlebit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = lastbit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = add: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
 Done fuzzing metadir root
diff --git a/tests/xfs/1548.out b/tests/xfs/1548.out
index 9e395bb059..48ee6c02fb 100644
--- a/tests/xfs/1548.out
+++ b/tests/xfs/1548.out
@@ -1,4 +1,318 @@
 QA output created by 1548
 Format and populate
 Fuzz metadir root
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: offline scrub didn't fail.
+core.forkoff = middlebit: online scrub didn't fail.
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: online scrub didn't fail.
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = middlebit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = lastbit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = add: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
 Done fuzzing metadir root
diff --git a/tests/xfs/1549.out b/tests/xfs/1549.out
index 22b3d215e3..c35f47c31a 100644
--- a/tests/xfs/1549.out
+++ b/tests/xfs/1549.out
@@ -1,4 +1,330 @@
 QA output created by 1549
 Format and populate
 Fuzz metadir root
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: offline scrub didn't fail.
+core.forkoff = middlebit: online scrub didn't fail.
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: online scrub didn't fail.
+core.aformat = zeroes: offline re-scrub failed (1).
+core.aformat = zeroes: offline post-mod scrub failed (1).
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = add: offline re-scrub failed (1).
+next_unlinked = add: offline post-mod scrub failed (1).
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = middlebit: offline re-scrub failed (1).
+v3.flags2 = middlebit: offline post-mod scrub failed (1).
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = middlebit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = lastbit: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = add: offline scrub didn't fail.
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].name = zeroes: online repair failed (4).
+u3.sfdir3.list[1].name = firstbit: online repair failed (4).
+u3.sfdir3.list[1].name = middlebit: online repair failed (4).
+u3.sfdir3.list[1].name = lastbit: online repair failed (4).
+u3.sfdir3.list[1].name = add: online repair failed (4).
+u3.sfdir3.list[1].name = sub: online repair failed (4).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
 Done fuzzing metadir root
diff --git a/tests/xfs/1550.out b/tests/xfs/1550.out
index 7694cd670b..166e165538 100644
--- a/tests/xfs/1550.out
+++ b/tests/xfs/1550.out
@@ -1,4 +1,434 @@
 QA output created by 1550
 Format and populate
 Fuzz metadir subdir
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = zeroes: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: mount failed (32).
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: mount failed (32).
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = add: offline re-scrub failed (1).
+next_unlinked = add: offline post-mod scrub failed (1).
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = middlebit: offline re-scrub failed (1).
+v3.flags2 = middlebit: offline post-mod scrub failed (1).
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].name = zeroes: mount failed (32).
+u3.sfdir3.list[1].name = ones: mount failed (32).
+u3.sfdir3.list[1].name = firstbit: mount failed (32).
+u3.sfdir3.list[1].name = middlebit: mount failed (32).
+u3.sfdir3.list[1].name = lastbit: mount failed (32).
+u3.sfdir3.list[1].name = add: mount failed (32).
+u3.sfdir3.list[1].name = sub: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
+a.sfattr.hdr.totsize = zeroes: mount failed (32).
+a.sfattr.hdr.totsize = ones: mount failed (32).
+a.sfattr.hdr.totsize = firstbit: mount failed (32).
+a.sfattr.hdr.totsize = middlebit: mount failed (32).
+a.sfattr.hdr.totsize = lastbit: mount failed (32).
+a.sfattr.hdr.totsize = add: mount failed (32).
+a.sfattr.hdr.totsize = sub: mount failed (32).
+a.sfattr.hdr.count = zeroes: mount failed (32).
+a.sfattr.hdr.count = ones: mount failed (32).
+a.sfattr.hdr.count = firstbit: mount failed (32).
+a.sfattr.hdr.count = middlebit: mount failed (32).
+a.sfattr.hdr.count = lastbit: mount failed (32).
+a.sfattr.hdr.count = add: mount failed (32).
+a.sfattr.hdr.count = sub: mount failed (32).
+a.sfattr.list[0].namelen = zeroes: mount failed (32).
+a.sfattr.list[0].namelen = ones: mount failed (32).
+a.sfattr.list[0].namelen = firstbit: mount failed (32).
+a.sfattr.list[0].namelen = middlebit: mount failed (32).
+a.sfattr.list[0].namelen = lastbit: mount failed (32).
+a.sfattr.list[0].namelen = add: mount failed (32).
+a.sfattr.list[0].namelen = sub: mount failed (32).
+a.sfattr.list[0].parent = zeroes: online repair failed (1).
+a.sfattr.list[0].parent = zeroes: online re-scrub failed (5).
+a.sfattr.list[0].parent = zeroes: offline re-scrub failed (1).
+a.sfattr.list[0].parent = zeroes: online post-mod scrub failed (1).
+a.sfattr.list[0].parent = zeroes: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent = lastbit: online repair failed (1).
+a.sfattr.list[0].parent = lastbit: online re-scrub failed (5).
+a.sfattr.list[0].parent = lastbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent = lastbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent = lastbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent = sub: online repair failed (1).
+a.sfattr.list[0].parent = sub: online re-scrub failed (5).
+a.sfattr.list[0].parent = sub: offline re-scrub failed (1).
+a.sfattr.list[0].parent = sub: online post-mod scrub failed (1).
+a.sfattr.list[0].parent = sub: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_ino = lastbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_ino = lastbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_ino = lastbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = lastbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = add: online repair failed (1).
+a.sfattr.list[0].parent_ino = add: online re-scrub failed (5).
+a.sfattr.list[0].parent_ino = add: offline re-scrub failed (1).
+a.sfattr.list[0].parent_ino = add: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = add: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = sub: online repair failed (1).
+a.sfattr.list[0].parent_ino = sub: online re-scrub failed (5).
+a.sfattr.list[0].parent_ino = sub: offline re-scrub failed (1).
+a.sfattr.list[0].parent_ino = sub: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_ino = sub: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = ones: online repair failed (1).
+a.sfattr.list[0].parent_gen = ones: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = ones: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = ones: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = ones: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_gen = firstbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = firstbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = firstbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = firstbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_gen = middlebit: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = middlebit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = middlebit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = middlebit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_gen = lastbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = lastbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = lastbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = lastbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = add: online repair failed (1).
+a.sfattr.list[0].parent_gen = add: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = add: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = add: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = add: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = sub: online repair failed (1).
+a.sfattr.list[0].parent_gen = sub: online re-scrub failed (5).
+a.sfattr.list[0].parent_gen = sub: offline re-scrub failed (1).
+a.sfattr.list[0].parent_gen = sub: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_gen = sub: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = zeroes: online repair failed (1).
+a.sfattr.list[0].parent_namehash = zeroes: online re-scrub failed (5).
+a.sfattr.list[0].parent_namehash = zeroes: offline re-scrub failed (1).
+a.sfattr.list[0].parent_namehash = zeroes: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = zeroes: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_namehash = firstbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_namehash = firstbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_namehash = firstbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = firstbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_namehash = lastbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_namehash = lastbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_namehash = lastbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = lastbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = add: online repair failed (1).
+a.sfattr.list[0].parent_namehash = add: online re-scrub failed (5).
+a.sfattr.list[0].parent_namehash = add: offline re-scrub failed (1).
+a.sfattr.list[0].parent_namehash = add: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = add: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = sub: online repair failed (1).
+a.sfattr.list[0].parent_namehash = sub: online re-scrub failed (5).
+a.sfattr.list[0].parent_namehash = sub: offline re-scrub failed (1).
+a.sfattr.list[0].parent_namehash = sub: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_namehash = sub: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = ones: online repair failed (1).
+a.sfattr.list[0].parent_name = ones: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = ones: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = ones: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = ones: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_name = firstbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = firstbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = firstbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = firstbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_name = middlebit: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = middlebit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = middlebit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = middlebit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_name = lastbit: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = lastbit: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = lastbit: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = lastbit: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = add: online repair failed (1).
+a.sfattr.list[0].parent_name = add: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = add: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = add: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = add: offline post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = sub: online repair failed (1).
+a.sfattr.list[0].parent_name = sub: online re-scrub failed (5).
+a.sfattr.list[0].parent_name = sub: offline re-scrub failed (1).
+a.sfattr.list[0].parent_name = sub: online post-mod scrub failed (1).
+a.sfattr.list[0].parent_name = sub: offline post-mod scrub failed (1).
 Done fuzzing metadir subdir
diff --git a/tests/xfs/1551.out b/tests/xfs/1551.out
index 4c3360d08b..fedbe7ad69 100644
--- a/tests/xfs/1551.out
+++ b/tests/xfs/1551.out
@@ -1,4 +1,35 @@
 QA output created by 1551
 Format and populate
 Fuzz metadir subdir
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+u3.sfdir3.list[0].inumber.i4 = lastbit: offline repair failed (1).
+u3.sfdir3.list[0].inumber.i4 = lastbit: offline re-scrub failed (1).
+u3.sfdir3.list[0].inumber.i4 = lastbit: pre-mod mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
 Done fuzzing metadir subdir
diff --git a/tests/xfs/1552.out b/tests/xfs/1552.out
index 6636b1b656..35cc60d261 100644
--- a/tests/xfs/1552.out
+++ b/tests/xfs/1552.out
@@ -1,4 +1,343 @@
 QA output created by 1552
 Format and populate
 Fuzz metadir subdir
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = zeroes: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: mount failed (32).
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: mount failed (32).
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].name = zeroes: mount failed (32).
+u3.sfdir3.list[1].name = ones: mount failed (32).
+u3.sfdir3.list[1].name = firstbit: mount failed (32).
+u3.sfdir3.list[1].name = middlebit: mount failed (32).
+u3.sfdir3.list[1].name = lastbit: mount failed (32).
+u3.sfdir3.list[1].name = add: mount failed (32).
+u3.sfdir3.list[1].name = sub: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
+a.sfattr.hdr.totsize = zeroes: mount failed (32).
+a.sfattr.hdr.totsize = ones: mount failed (32).
+a.sfattr.hdr.totsize = firstbit: mount failed (32).
+a.sfattr.hdr.totsize = middlebit: mount failed (32).
+a.sfattr.hdr.totsize = lastbit: mount failed (32).
+a.sfattr.hdr.totsize = add: mount failed (32).
+a.sfattr.hdr.totsize = sub: mount failed (32).
+a.sfattr.hdr.count = zeroes: mount failed (32).
+a.sfattr.hdr.count = ones: mount failed (32).
+a.sfattr.hdr.count = firstbit: mount failed (32).
+a.sfattr.hdr.count = middlebit: mount failed (32).
+a.sfattr.hdr.count = lastbit: mount failed (32).
+a.sfattr.hdr.count = add: mount failed (32).
+a.sfattr.hdr.count = sub: mount failed (32).
+a.sfattr.list[0].namelen = zeroes: mount failed (32).
+a.sfattr.list[0].namelen = ones: mount failed (32).
+a.sfattr.list[0].namelen = firstbit: mount failed (32).
+a.sfattr.list[0].namelen = middlebit: mount failed (32).
+a.sfattr.list[0].namelen = lastbit: mount failed (32).
+a.sfattr.list[0].namelen = add: mount failed (32).
+a.sfattr.list[0].namelen = sub: mount failed (32).
 Done fuzzing metadir subdir
diff --git a/tests/xfs/1553.out b/tests/xfs/1553.out
index 0298fcfddb..e406b598b9 100644
--- a/tests/xfs/1553.out
+++ b/tests/xfs/1553.out
@@ -1,4 +1,370 @@
 QA output created by 1553
 Format and populate
 Fuzz metadir subdir
+core.magic = zeroes: mount failed (32).
+core.magic = ones: mount failed (32).
+core.magic = firstbit: mount failed (32).
+core.magic = middlebit: mount failed (32).
+core.magic = lastbit: mount failed (32).
+core.magic = add: mount failed (32).
+core.magic = sub: mount failed (32).
+core.mode = zeroes: mount failed (32).
+core.mode = ones: mount failed (32).
+core.mode = firstbit: mount failed (32).
+core.mode = middlebit: mount failed (32).
+core.mode = lastbit: mount failed (32).
+core.mode = add: mount failed (32).
+core.mode = sub: mount failed (32).
+core.version = zeroes: mount failed (32).
+core.version = ones: mount failed (32).
+core.version = firstbit: mount failed (32).
+core.version = middlebit: mount failed (32).
+core.version = lastbit: mount failed (32).
+core.version = add: mount failed (32).
+core.version = sub: mount failed (32).
+core.format = zeroes: mount failed (32).
+core.format = ones: mount failed (32).
+core.format = firstbit: mount failed (32).
+core.format = middlebit: mount failed (32).
+core.format = lastbit: mount failed (32).
+core.format = add: mount failed (32).
+core.format = sub: mount failed (32).
+core.onlink = ones: mount failed (32).
+core.onlink = firstbit: mount failed (32).
+core.onlink = middlebit: mount failed (32).
+core.onlink = lastbit: mount failed (32).
+core.onlink = add: mount failed (32).
+core.onlink = sub: mount failed (32).
+core.uid = ones: mount failed (32).
+core.uid = firstbit: mount failed (32).
+core.uid = middlebit: mount failed (32).
+core.uid = lastbit: mount failed (32).
+core.uid = add: mount failed (32).
+core.uid = sub: mount failed (32).
+core.gid = ones: mount failed (32).
+core.gid = firstbit: mount failed (32).
+core.gid = middlebit: mount failed (32).
+core.gid = lastbit: mount failed (32).
+core.gid = add: mount failed (32).
+core.gid = sub: mount failed (32).
+core.nlinkv2 = zeroes: mount failed (32).
+core.projid_lo = ones: mount failed (32).
+core.projid_lo = firstbit: mount failed (32).
+core.projid_lo = middlebit: mount failed (32).
+core.projid_lo = lastbit: mount failed (32).
+core.projid_lo = add: mount failed (32).
+core.projid_lo = sub: mount failed (32).
+core.projid_hi = ones: mount failed (32).
+core.projid_hi = firstbit: mount failed (32).
+core.projid_hi = middlebit: mount failed (32).
+core.projid_hi = lastbit: mount failed (32).
+core.projid_hi = add: mount failed (32).
+core.projid_hi = sub: mount failed (32).
+core.nextents = ones: mount failed (32).
+core.nextents = firstbit: mount failed (32).
+core.nextents = middlebit: mount failed (32).
+core.nextents = lastbit: mount failed (32).
+core.nextents = add: mount failed (32).
+core.nextents = sub: mount failed (32).
+core.size = zeroes: mount failed (32).
+core.size = ones: mount failed (32).
+core.size = firstbit: mount failed (32).
+core.size = middlebit: mount failed (32).
+core.size = lastbit: mount failed (32).
+core.size = add: mount failed (32).
+core.size = sub: mount failed (32).
+core.extsize = ones: mount failed (32).
+core.extsize = firstbit: mount failed (32).
+core.extsize = middlebit: mount failed (32).
+core.extsize = lastbit: mount failed (32).
+core.extsize = add: mount failed (32).
+core.extsize = sub: mount failed (32).
+core.naextents = ones: mount failed (32).
+core.naextents = firstbit: mount failed (32).
+core.naextents = middlebit: mount failed (32).
+core.naextents = lastbit: mount failed (32).
+core.naextents = add: mount failed (32).
+core.naextents = sub: mount failed (32).
+core.forkoff = zeroes: mount failed (32).
+core.forkoff = ones: mount failed (32).
+core.forkoff = firstbit: mount failed (32).
+core.forkoff = middlebit: mount failed (32).
+core.forkoff = lastbit: mount failed (32).
+core.forkoff = add: mount failed (32).
+core.forkoff = sub: mount failed (32).
+core.aformat = zeroes: mount failed (32).
+core.aformat = ones: mount failed (32).
+core.aformat = firstbit: mount failed (32).
+core.aformat = middlebit: mount failed (32).
+core.aformat = lastbit: mount failed (32).
+core.aformat = add: mount failed (32).
+core.aformat = sub: mount failed (32).
+core.realtime = ones: mount failed (32).
+core.realtime = firstbit: mount failed (32).
+core.realtime = middlebit: mount failed (32).
+core.realtime = lastbit: mount failed (32).
+core.realtime = add: mount failed (32).
+core.realtime = sub: mount failed (32).
+core.rtinherit = ones: offline scrub didn't fail.
+core.rtinherit = ones: online scrub didn't fail.
+core.rtinherit = firstbit: offline scrub didn't fail.
+core.rtinherit = firstbit: online scrub didn't fail.
+core.rtinherit = middlebit: offline scrub didn't fail.
+core.rtinherit = middlebit: online scrub didn't fail.
+core.rtinherit = lastbit: offline scrub didn't fail.
+core.rtinherit = lastbit: online scrub didn't fail.
+core.rtinherit = add: offline scrub didn't fail.
+core.rtinherit = add: online scrub didn't fail.
+core.rtinherit = sub: offline scrub didn't fail.
+core.rtinherit = sub: online scrub didn't fail.
+core.projinherit = ones: offline scrub didn't fail.
+core.projinherit = ones: online scrub didn't fail.
+core.projinherit = firstbit: offline scrub didn't fail.
+core.projinherit = firstbit: online scrub didn't fail.
+core.projinherit = middlebit: offline scrub didn't fail.
+core.projinherit = middlebit: online scrub didn't fail.
+core.projinherit = lastbit: offline scrub didn't fail.
+core.projinherit = lastbit: online scrub didn't fail.
+core.projinherit = add: offline scrub didn't fail.
+core.projinherit = add: online scrub didn't fail.
+core.projinherit = sub: offline scrub didn't fail.
+core.projinherit = sub: online scrub didn't fail.
+core.nosymlinks = zeroes: mount failed (32).
+core.nosymlinks = firstbit: mount failed (32).
+core.nosymlinks = middlebit: mount failed (32).
+core.nosymlinks = lastbit: mount failed (32).
+core.nosymlinks = add: mount failed (32).
+core.nosymlinks = sub: mount failed (32).
+core.extsz = ones: mount failed (32).
+core.extsz = firstbit: mount failed (32).
+core.extsz = middlebit: mount failed (32).
+core.extsz = lastbit: mount failed (32).
+core.extsz = add: mount failed (32).
+core.extsz = sub: mount failed (32).
+core.extszinherit = ones: mount failed (32).
+core.extszinherit = firstbit: mount failed (32).
+core.extszinherit = middlebit: mount failed (32).
+core.extszinherit = lastbit: mount failed (32).
+core.extszinherit = add: mount failed (32).
+core.extszinherit = sub: mount failed (32).
+next_unlinked = zeroes: mount failed (32).
+next_unlinked = firstbit: mount failed (32).
+next_unlinked = middlebit: mount failed (32).
+next_unlinked = lastbit: mount failed (32).
+next_unlinked = add: online scrub didn't fail.
+next_unlinked = add: offline re-scrub failed (1).
+next_unlinked = add: offline post-mod scrub failed (1).
+next_unlinked = sub: mount failed (32).
+v3.crc = zeroes: mount failed (32).
+v3.crc = ones: mount failed (32).
+v3.crc = firstbit: mount failed (32).
+v3.crc = middlebit: mount failed (32).
+v3.crc = lastbit: mount failed (32).
+v3.crc = add: mount failed (32).
+v3.crc = sub: mount failed (32).
+v3.change_count = zeroes: offline scrub didn't fail.
+v3.change_count = zeroes: online scrub didn't fail.
+v3.change_count = ones: offline scrub didn't fail.
+v3.change_count = ones: online scrub didn't fail.
+v3.change_count = firstbit: offline scrub didn't fail.
+v3.change_count = firstbit: online scrub didn't fail.
+v3.change_count = middlebit: offline scrub didn't fail.
+v3.change_count = middlebit: online scrub didn't fail.
+v3.change_count = lastbit: offline scrub didn't fail.
+v3.change_count = lastbit: online scrub didn't fail.
+v3.change_count = add: offline scrub didn't fail.
+v3.change_count = add: online scrub didn't fail.
+v3.change_count = sub: offline scrub didn't fail.
+v3.change_count = sub: online scrub didn't fail.
+v3.flags2 = zeroes: mount failed (32).
+v3.flags2 = ones: mount failed (32).
+v3.flags2 = firstbit: mount failed (32).
+v3.flags2 = middlebit: online scrub didn't fail.
+v3.flags2 = middlebit: offline re-scrub failed (1).
+v3.flags2 = middlebit: offline post-mod scrub failed (1).
+v3.flags2 = lastbit: mount failed (32).
+v3.flags2 = add: mount failed (32).
+v3.flags2 = sub: mount failed (32).
+v3.cowextsize = ones: mount failed (32).
+v3.cowextsize = firstbit: mount failed (32).
+v3.cowextsize = middlebit: mount failed (32).
+v3.cowextsize = lastbit: mount failed (32).
+v3.cowextsize = add: mount failed (32).
+v3.cowextsize = sub: mount failed (32).
+v3.inumber = zeroes: mount failed (32).
+v3.inumber = ones: mount failed (32).
+v3.inumber = firstbit: mount failed (32).
+v3.inumber = middlebit: mount failed (32).
+v3.inumber = lastbit: mount failed (32).
+v3.inumber = add: mount failed (32).
+v3.inumber = sub: mount failed (32).
+v3.uuid = zeroes: mount failed (32).
+v3.uuid = ones: mount failed (32).
+v3.uuid = firstbit: mount failed (32).
+v3.uuid = middlebit: mount failed (32).
+v3.uuid = lastbit: mount failed (32).
+v3.reflink = ones: mount failed (32).
+v3.reflink = firstbit: mount failed (32).
+v3.reflink = middlebit: mount failed (32).
+v3.reflink = lastbit: mount failed (32).
+v3.reflink = add: mount failed (32).
+v3.reflink = sub: mount failed (32).
+v3.cowextsz = ones: mount failed (32).
+v3.cowextsz = firstbit: mount failed (32).
+v3.cowextsz = middlebit: mount failed (32).
+v3.cowextsz = lastbit: mount failed (32).
+v3.cowextsz = add: mount failed (32).
+v3.cowextsz = sub: mount failed (32).
+v3.nrext64 = zeroes: offline scrub didn't fail.
+v3.nrext64 = zeroes: online scrub didn't fail.
+v3.nrext64 = firstbit: offline scrub didn't fail.
+v3.nrext64 = firstbit: online scrub didn't fail.
+v3.nrext64 = middlebit: offline scrub didn't fail.
+v3.nrext64 = middlebit: online scrub didn't fail.
+v3.nrext64 = lastbit: offline scrub didn't fail.
+v3.nrext64 = lastbit: online scrub didn't fail.
+v3.nrext64 = add: offline scrub didn't fail.
+v3.nrext64 = add: online scrub didn't fail.
+v3.nrext64 = sub: offline scrub didn't fail.
+v3.nrext64 = sub: online scrub didn't fail.
+v3.metadir = zeroes: mount failed (32).
+v3.metadir = firstbit: mount failed (32).
+v3.metadir = middlebit: mount failed (32).
+v3.metadir = lastbit: mount failed (32).
+v3.metadir = add: mount failed (32).
+v3.metadir = sub: mount failed (32).
+u3.sfdir3.hdr.count = zeroes: mount failed (32).
+u3.sfdir3.hdr.count = ones: mount failed (32).
+u3.sfdir3.hdr.count = firstbit: mount failed (32).
+u3.sfdir3.hdr.count = middlebit: mount failed (32).
+u3.sfdir3.hdr.count = lastbit: mount failed (32).
+u3.sfdir3.hdr.count = add: mount failed (32).
+u3.sfdir3.hdr.count = sub: mount failed (32).
+u3.sfdir3.hdr.i8count = ones: mount failed (32).
+u3.sfdir3.hdr.i8count = firstbit: mount failed (32).
+u3.sfdir3.hdr.i8count = middlebit: mount failed (32).
+u3.sfdir3.hdr.i8count = lastbit: mount failed (32).
+u3.sfdir3.hdr.i8count = add: mount failed (32).
+u3.sfdir3.hdr.i8count = sub: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = zeroes: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = ones: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = firstbit: mount failed (32).
+u3.sfdir3.hdr.parent.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[0].namelen = ones: mount failed (32).
+u3.sfdir3.list[0].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[0].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[0].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[0].namelen = add: mount failed (32).
+u3.sfdir3.list[0].namelen = sub: mount failed (32).
+u3.sfdir3.list[0].offset = zeroes: mount failed (32).
+u3.sfdir3.list[0].offset = ones: mount failed (32).
+u3.sfdir3.list[0].offset = firstbit: mount failed (32).
+u3.sfdir3.list[0].offset = middlebit: mount failed (32).
+u3.sfdir3.list[0].offset = lastbit: mount failed (32).
+u3.sfdir3.list[0].offset = add: mount failed (32).
+u3.sfdir3.list[0].offset = sub: mount failed (32).
+u3.sfdir3.list[0].name = zeroes: mount failed (32).
+u3.sfdir3.list[0].name = ones: mount failed (32).
+u3.sfdir3.list[0].name = firstbit: mount failed (32).
+u3.sfdir3.list[0].name = middlebit: mount failed (32).
+u3.sfdir3.list[0].name = lastbit: mount failed (32).
+u3.sfdir3.list[0].name = add: mount failed (32).
+u3.sfdir3.list[0].name = sub: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = lastbit: offline repair failed (1).
+u3.sfdir3.list[0].inumber.i4 = lastbit: offline re-scrub failed (1).
+u3.sfdir3.list[0].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[0].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[0].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[0].filetype = ones: mount failed (32).
+u3.sfdir3.list[0].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[0].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[0].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[0].filetype = add: mount failed (32).
+u3.sfdir3.list[0].filetype = sub: mount failed (32).
+u3.sfdir3.list[1].namelen = zeroes: mount failed (32).
+u3.sfdir3.list[1].namelen = ones: mount failed (32).
+u3.sfdir3.list[1].namelen = firstbit: mount failed (32).
+u3.sfdir3.list[1].namelen = middlebit: mount failed (32).
+u3.sfdir3.list[1].namelen = lastbit: mount failed (32).
+u3.sfdir3.list[1].namelen = add: mount failed (32).
+u3.sfdir3.list[1].namelen = sub: mount failed (32).
+u3.sfdir3.list[1].offset = zeroes: mount failed (32).
+u3.sfdir3.list[1].offset = ones: mount failed (32).
+u3.sfdir3.list[1].offset = firstbit: mount failed (32).
+u3.sfdir3.list[1].offset = middlebit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = middlebit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = lastbit: online scrub didn't fail.
+u3.sfdir3.list[1].offset = add: offline scrub didn't fail.
+u3.sfdir3.list[1].offset = add: online scrub didn't fail.
+u3.sfdir3.list[1].offset = sub: mount failed (32).
+u3.sfdir3.list[1].name = zeroes: mount failed (32).
+u3.sfdir3.list[1].name = ones: mount failed (32).
+u3.sfdir3.list[1].name = firstbit: mount failed (32).
+u3.sfdir3.list[1].name = middlebit: mount failed (32).
+u3.sfdir3.list[1].name = lastbit: mount failed (32).
+u3.sfdir3.list[1].name = add: mount failed (32).
+u3.sfdir3.list[1].name = sub: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = zeroes: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = ones: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = firstbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = middlebit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = lastbit: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = add: mount failed (32).
+u3.sfdir3.list[1].inumber.i4 = sub: mount failed (32).
+u3.sfdir3.list[1].filetype = zeroes: mount failed (32).
+u3.sfdir3.list[1].filetype = ones: mount failed (32).
+u3.sfdir3.list[1].filetype = firstbit: mount failed (32).
+u3.sfdir3.list[1].filetype = middlebit: mount failed (32).
+u3.sfdir3.list[1].filetype = lastbit: mount failed (32).
+u3.sfdir3.list[1].filetype = add: mount failed (32).
+u3.sfdir3.list[1].filetype = sub: mount failed (32).
+a.sfattr.hdr.totsize = zeroes: mount failed (32).
+a.sfattr.hdr.totsize = ones: mount failed (32).
+a.sfattr.hdr.totsize = firstbit: mount failed (32).
+a.sfattr.hdr.totsize = middlebit: mount failed (32).
+a.sfattr.hdr.totsize = lastbit: mount failed (32).
+a.sfattr.hdr.totsize = add: mount failed (32).
+a.sfattr.hdr.totsize = sub: mount failed (32).
+a.sfattr.hdr.count = zeroes: mount failed (32).
+a.sfattr.hdr.count = ones: mount failed (32).
+a.sfattr.hdr.count = firstbit: mount failed (32).
+a.sfattr.hdr.count = middlebit: mount failed (32).
+a.sfattr.hdr.count = lastbit: mount failed (32).
+a.sfattr.hdr.count = add: mount failed (32).
+a.sfattr.hdr.count = sub: mount failed (32).
+a.sfattr.list[0].namelen = zeroes: mount failed (32).
+a.sfattr.list[0].namelen = ones: mount failed (32).
+a.sfattr.list[0].namelen = firstbit: mount failed (32).
+a.sfattr.list[0].namelen = middlebit: mount failed (32).
+a.sfattr.list[0].namelen = lastbit: mount failed (32).
+a.sfattr.list[0].namelen = add: mount failed (32).
+a.sfattr.list[0].namelen = sub: mount failed (32).
+a.sfattr.list[0].parent = zeroes: online repair failed (1).
+a.sfattr.list[0].parent = add: online repair failed (1).
+a.sfattr.list[0].parent_ino = zeroes: online repair failed (1).
+a.sfattr.list[0].parent_ino = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_ino = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_ino = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_ino = add: online repair failed (1).
+a.sfattr.list[0].parent_gen = ones: online repair failed (1).
+a.sfattr.list[0].parent_gen = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_gen = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_gen = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_gen = add: online repair failed (1).
+a.sfattr.list[0].parent_namehash = zeroes: online repair failed (1).
+a.sfattr.list[0].parent_namehash = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_namehash = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_namehash = lastbit: online repair failed (1).
+a.sfattr.list[0].parent_name = firstbit: online repair failed (1).
+a.sfattr.list[0].parent_name = middlebit: online repair failed (1).
+a.sfattr.list[0].parent_name = add: online repair failed (1).
+a.sfattr.list[0].parent_name = sub: online repair failed (1).
 Done fuzzing metadir subdir


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/11] xfs: test metapath repairs
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 13:52   ` [PATCH 10/11] xfs: baseline golden output for metadata directory fuzz tests Darrick J. Wong
@ 2023-12-27 13:52   ` Darrick J. Wong
  10 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:52 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Functional testing for metadir path checking and repairs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1874     |  132 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1874.out |   16 ++++++
 2 files changed, 148 insertions(+)
 create mode 100755 tests/xfs/1874
 create mode 100644 tests/xfs/1874.out


diff --git a/tests/xfs/1874 b/tests/xfs/1874
new file mode 100755
index 0000000000..a00b58773f
--- /dev/null
+++ b/tests/xfs/1874
@@ -0,0 +1,132 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test 1874
+#
+# Functional test of using online repair to fix metadir paths.
+#
+. ./common/preamble
+_begin_fstest auto online_repair
+
+# Override the default cleanup function.
+# _cleanup()
+# {
+# 	cd /
+# 	rm -r -f $tmp.*
+# }
+
+# Import common functions.
+source ./common/filter
+source ./common/inject
+source ./common/fuzzy
+source ./common/quota
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs xfs
+_require_xfs_db_command "link"
+_require_xfs_db_command "unlink"
+_require_scratch
+_require_xfs_stress_online_repair
+
+prepare_fs() {
+	# Format filesystem
+	_scratch_mkfs | _filter_mkfs 2> $tmp.mkfs >> $seqres.full
+	_scratch_mount
+
+	$XFS_INFO_PROG $SCRATCH_MNT | grep -q 'metadir=1' || \
+		_notrun "metadata directories must be enabled"
+
+	$XFS_INFO_PROG $SCRATCH_MNT | grep -q 'parent=1' || \
+		_notrun "parent pointers must be enabled"
+
+	root_inum="$(stat -c '%i' $SCRATCH_MNT)"
+	__stress_scrub_check_commands "%dir%" '' '' 'scrub metapath'
+	_scratch_unmount
+
+	# Stash the /realtime inode number and gen
+	rt_metadir_inum=$(_scratch_xfs_get_metadata_field v3.inumber 'path -m /realtime')
+	rt_metadir_gen=$(_scratch_xfs_get_metadata_field core.gen 'path -m /realtime')
+
+	# Stash the /realtime/bitmap inode number and gen
+	rbm_inum=$(_scratch_xfs_get_metadata_field v3.inumber 'path -m /realtime/bitmap')
+	rbm_gen=$(_scratch_xfs_get_metadata_field core.gen 'path -m /realtime/bitmap')
+
+	# Fuzz parent pointer in rtbitmap file
+	_scratch_xfs_db -x \
+		-c 'path -m /realtime/bitmap' \
+		-c "write -d a.sfattr.list[0].parent_ino $root_inum" >> $seqres.full.
+}
+
+simple_online_repair() {
+	echo "check /realtime dir" | _tee_kernlog
+	$XFS_IO_PROG -c "scrub directory $rt_metadir_inum $rt_metadir_gen" $SCRATCH_MNT
+
+	echo "check /realtime/bitmap pptr" | _tee_kernlog
+	$XFS_IO_PROG -c "scrub parent $rbm_inum $rbm_gen" $SCRATCH_MNT
+
+	echo "check /realtime/bitmap metapath" | _tee_kernlog
+	$XFS_IO_PROG -c "scrub metapath rtbitmap" $SCRATCH_MNT
+
+	echo "check nlinks" | _tee_kernlog
+	$XFS_IO_PROG -c "scrub nlinks" $SCRATCH_MNT
+
+	# Destroying a metadir path (e.g. /realtime/bitmap) cannot be done
+	# offline because then the mount will fail.  Hence we must use a
+	# specific sequence of online repairs to remove the metadir path link.
+	# Only then can we use the metapath scrubber to restore the link.
+
+	# Force repair the parent directory.  Since /realtime/bitmap has a bad
+	# parent pointer, the "bitmap" entry in /realtime will not be created.
+	echo "fix /realtime dir" | _tee_kernlog
+	$XFS_IO_PROG -x -c "repair -R directory $rt_metadir_inum $rt_metadir_gen" $SCRATCH_MNT
+
+	# Force repair the parent pointer.  Since the "bitmap" entry in
+	# /realtime no longer exists and no other directories count the
+	# rtbitmap as a parent, this will fail cross-referencing after the
+	# repair.
+	echo "fix /realtime/bitmap pptr" | _tee_kernlog
+	$XFS_IO_PROG -x -c "repair -R parent $rbm_inum $rbm_gen" $SCRATCH_MNT
+
+	# Now that we've completely erased the /realtime/bitmap path, check
+	# that the link is indeed lost, and restore the link.
+	echo "fix /realtime/bitmap metapath" | _tee_kernlog
+	$XFS_IO_PROG -x -c "repair metapath rtbitmap" $SCRATCH_MNT
+
+	# Make sure we're not missing any link count
+	echo "fix nlinks" | _tee_kernlog
+	$XFS_IO_PROG -x -c "repair nlinks" $SCRATCH_MNT
+}
+
+# Part 1: Use raw ioctls to detect the error and fix it.
+prepare_fs
+_scratch_mount
+simple_online_repair
+_check_scratch_fs
+_scratch_unmount
+
+# Part 2: Use xfs_scrub to detect the error and fix it.
+prepare_fs
+_scratch_mount
+echo "fix with xfs_scrub" | _tee_kernlog
+_scratch_scrub &>> $seqres.full
+echo "xfs_scrub returned $?" >> $seqres.full
+_check_scratch_fs
+_scratch_unmount
+
+# Part 3: Use xfs_repair to detect the error and fix it.
+prepare_fs
+echo "fix with xfs_repair" | _tee_kernlog
+echo repair?? >> $seqres.full
+_scratch_xfs_repair &>> $seqres.full
+echo "xfs_repair returned $?" >> $seqres.full
+_scratch_mount
+_check_scratch_fs
+_scratch_unmount
+
+echo "done with test" | _tee_kernlog
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1874.out b/tests/xfs/1874.out
new file mode 100644
index 0000000000..1c29698283
--- /dev/null
+++ b/tests/xfs/1874.out
@@ -0,0 +1,16 @@
+QA output created by 1874
+check /realtime dir
+Corruption detected during cross-referencing.
+check /realtime/bitmap pptr
+Corruption detected during cross-referencing.
+check /realtime/bitmap metapath
+check nlinks
+fix /realtime dir
+fix /realtime/bitmap pptr
+Corruption remains.
+Corruption still detected during cross-referencing.
+fix /realtime/bitmap metapath
+fix nlinks
+fix with xfs_scrub
+fix with xfs_repair
+done with test


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/17] common/populate: refactor caching of metadumps to a helper
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-27 13:53   ` Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 02/17] common/xfs: wipe external logs during mdrestore operations Darrick J. Wong
                     ` (15 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:53 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Hoist out of _scratch_populate_cached all the code that we use to save a
metadump of the populated filesystem.  We're going to make this more
involved for XFS in the next few patches so that we can take advantage
of the new support for external devices in metadump/mdrestore.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/populate |   37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)


diff --git a/common/populate b/common/populate
index 83cd0eb5db..72c88e0651 100644
--- a/common/populate
+++ b/common/populate
@@ -1047,6 +1047,31 @@ _scratch_populate_restore_cached() {
 	return 1
 }
 
+# Take a metadump of the scratch filesystem and cache it for later.
+_scratch_populate_save_metadump()
+{
+	local metadump_file="$1"
+
+	case "${FSTYP}" in
+	"xfs")
+		local logdev=none
+		[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
+			logdev=$SCRATCH_LOGDEV
+
+		_xfs_metadump "$metadump_file" "$SCRATCH_DEV" "$logdev" \
+				compress
+		res=$?
+		;;
+	"ext2"|"ext3"|"ext4")
+		_ext4_metadump "${SCRATCH_DEV}" "${metadump_file}" compress
+		res=$?
+		;;
+	*)
+		_fail "Don't know how to save a ${FSTYP} filesystem."
+	esac
+	return $res
+}
+
 # Populate a scratch FS from scratch or from a cached image.
 _scratch_populate_cached() {
 	local meta_descr="$(_scratch_populate_cache_tag "$@")"
@@ -1070,26 +1095,20 @@ _scratch_populate_cached() {
 
 	# Oh well, just create one from scratch
 	_scratch_mkfs
-	echo "${meta_descr}" > "${populate_metadump_descr}"
 	case "${FSTYP}" in
 	"xfs")
 		_scratch_xfs_populate $@
 		_scratch_xfs_populate_check
-
-		local logdev=none
-		[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
-			logdev=$SCRATCH_LOGDEV
-
-		_xfs_metadump "$POPULATE_METADUMP" "$SCRATCH_DEV" "$logdev" \
-			compress
 		;;
 	"ext2"|"ext3"|"ext4")
 		_scratch_ext4_populate $@
 		_scratch_ext4_populate_check
-		_ext4_metadump "${SCRATCH_DEV}" "${POPULATE_METADUMP}" compress
 		;;
 	*)
 		_fail "Don't know how to populate a ${FSTYP} filesystem."
 		;;
 	esac
+
+	_scratch_populate_save_metadump "${POPULATE_METADUMP}" && \
+			echo "${meta_descr}" > "${populate_metadump_descr}"
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/17] common/xfs: wipe external logs during mdrestore operations
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 01/17] common/populate: refactor caching of metadumps to a helper Darrick J. Wong
@ 2023-12-27 13:53   ` Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 03/17] common/ext4: reformat " Darrick J. Wong
                     ` (14 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:53 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The XFS metadump file format doesn't support the capture of external log
devices, which means that mdrestore ought to wipe the external log and
run xfs_repair to rewrite the log device as needed to get the restored
filesystem to work again.  The common/populate code could already do
this, so push it to the common xfs helper.

While we're at it, fix the uncareful usage of SCRATCH_LOGDEV in the
populate code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/fuzzy    |    7 ++++++-
 common/populate |   19 ++++++-------------
 common/xfs      |   21 +++++++++++++++++++--
 3 files changed, 31 insertions(+), 16 deletions(-)


diff --git a/common/fuzzy b/common/fuzzy
index b72b4a9fe7..b72ee3f67f 100644
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -304,7 +304,12 @@ __scratch_xfs_fuzz_unmount()
 __scratch_xfs_fuzz_mdrestore()
 {
 	__scratch_xfs_fuzz_unmount
-	_xfs_mdrestore "${POPULATE_METADUMP}" "${SCRATCH_DEV}" || \
+
+	local logdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
+		logdev=$SCRATCH_LOGDEV
+
+	_xfs_mdrestore "${POPULATE_METADUMP}" "${SCRATCH_DEV}" "${logdev}" || \
 		_fail "${POPULATE_METADUMP}: Could not find metadump to restore?"
 }
 
diff --git a/common/populate b/common/populate
index 72c88e0651..92a3c5e354 100644
--- a/common/populate
+++ b/common/populate
@@ -1011,21 +1011,14 @@ _scratch_populate_cache_tag() {
 _scratch_populate_restore_cached() {
 	local metadump="$1"
 
+	local logdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
+		logdev=$SCRATCH_LOGDEV
+
 	case "${FSTYP}" in
 	"xfs")
-		_xfs_mdrestore "${metadump}" "${SCRATCH_DEV}"
-		res=$?
-		test $res -ne 0 && return $res
-
-		# Cached images should have been unmounted cleanly, so if
-		# there's an external log we need to wipe it and run repair to
-		# format it to match this filesystem.
-		if [ -n "${SCRATCH_LOGDEV}" ]; then
-			$WIPEFS_PROG -a "${SCRATCH_LOGDEV}"
-			_scratch_xfs_repair
-			res=$?
-		fi
-		return $res
+		_xfs_mdrestore "${metadump}" "${SCRATCH_DEV}" "${logdev}"
+		return $?
 		;;
 	"ext2"|"ext3"|"ext4")
 		_ext4_mdrestore "${metadump}" "${SCRATCH_DEV}"
diff --git a/common/xfs b/common/xfs
index b88491666d..77ba786ece 100644
--- a/common/xfs
+++ b/common/xfs
@@ -683,7 +683,8 @@ _xfs_metadump() {
 _xfs_mdrestore() {
 	local metadump="$1"
 	local device="$2"
-	shift; shift
+	local logdev="$3"
+	shift; shift; shift
 	local options="$@"
 
 	# If we're configured for compressed dumps and there isn't already an
@@ -697,6 +698,18 @@ _xfs_mdrestore() {
 	test -r "$metadump" || return 1
 
 	$XFS_MDRESTORE_PROG $options "${metadump}" "${device}"
+	res=$?
+	test $res -ne 0 && return $res
+
+	# mdrestore does not know how to restore an external log.  If there is
+	# one, we need to erase the log header and run xfs_repair to format a
+	# new log header onto the log device.
+	if [ "$logdev" != "none" ]; then
+		$XFS_IO_PROG -d -c 'pwrite -S 0 -q 0 1m' "$logdev"
+		_scratch_xfs_repair >> $seqres.full 2>&1
+		res=$?
+	fi
+	return $res
 }
 
 # Snapshot the metadata on the scratch device
@@ -718,7 +731,11 @@ _scratch_xfs_mdrestore()
 	local metadump=$1
 	shift
 
-	_xfs_mdrestore "$metadump" "$SCRATCH_DEV" "$@"
+	local logdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
+		logdev=$SCRATCH_LOGDEV
+
+	_xfs_mdrestore "$metadump" "$SCRATCH_DEV" "$logdev" "$@"
 }
 
 # Do not use xfs_repair (offline fsck) to rebuild the filesystem


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/17] common/ext4: reformat external logs during mdrestore operations
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 01/17] common/populate: refactor caching of metadumps to a helper Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 02/17] common/xfs: wipe external logs during mdrestore operations Darrick J. Wong
@ 2023-12-27 13:53   ` Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 04/17] xfs: use metadump v2 format by default Darrick J. Wong
                     ` (13 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:53 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The e2image file format doesn't support the capture of external log
devices, which means that mdrestore ought to reformat the external log
to get the restored filesystem to work again.  The common/populate code
could already do this, so push it to the common ext4 helper.

While we're at it, fix the uncareful usage of SCRATCH_LOGDEV in the
populate code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/ext4     |   17 ++++++++++++++++-
 common/populate |   16 ++--------------
 2 files changed, 18 insertions(+), 15 deletions(-)


diff --git a/common/ext4 b/common/ext4
index 3dcbfe17c9..5171b8df68 100644
--- a/common/ext4
+++ b/common/ext4
@@ -134,7 +134,8 @@ _ext4_mdrestore()
 {
 	local metadump="$1"
 	local device="$2"
-	shift; shift
+	local logdev="$3"
+	shift; shift; shift
 	local options="$@"
 
 	# If we're configured for compressed dumps and there isn't already an
@@ -148,6 +149,20 @@ _ext4_mdrestore()
 	test -r "$metadump" || return 1
 
 	$E2IMAGE_PROG $options -r "${metadump}" "${SCRATCH_DEV}"
+	res=$?
+	test $res -ne 0 && return $res
+
+	# ext4 cannot e2image external logs, so we have to reformat the log
+	# device to match the restored fs
+	if [ "${logdev}" != "none" ]; then
+		local fsuuid="$($DUMPE2FS_PROG -h "${SCRATCH_DEV}" 2>/dev/null | \
+				grep 'Journal UUID:' | \
+				sed -e 's/Journal UUID:[[:space:]]*//g')"
+		$MKFS_EXT4_PROG -O journal_dev "${logdev}" \
+				-F -U "${fsuuid}"
+		res=$?
+	fi
+	return $res
 }
 
 # this test requires the ext4 kernel support crc feature on scratch device
diff --git a/common/populate b/common/populate
index 92a3c5e354..450b024bfc 100644
--- a/common/populate
+++ b/common/populate
@@ -1021,20 +1021,8 @@ _scratch_populate_restore_cached() {
 		return $?
 		;;
 	"ext2"|"ext3"|"ext4")
-		_ext4_mdrestore "${metadump}" "${SCRATCH_DEV}"
-		ret=$?
-		test $ret -ne 0 && return $ret
-
-		# ext4 cannot e2image external logs, so we have to reformat
-		# the scratch device to match the restored fs
-		if [ -n "${SCRATCH_LOGDEV}" ]; then
-			local fsuuid="$($DUMPE2FS_PROG -h "${SCRATCH_DEV}" 2>/dev/null | \
-					grep 'Journal UUID:' | \
-					sed -e 's/Journal UUID:[[:space:]]*//g')"
-			$MKFS_EXT4_PROG -O journal_dev "${SCRATCH_LOGDEV}" \
-					-F -U "${fsuuid}"
-		fi
-		return 0
+		_ext4_mdrestore "${metadump}" "${SCRATCH_DEV}" "${logdev}"
+		return $?
 		;;
 	esac
 	return 1


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/17] xfs: use metadump v2 format by default
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:53   ` [PATCH 03/17] common/ext4: reformat " Darrick J. Wong
@ 2023-12-27 13:53   ` Darrick J. Wong
  2023-12-27 13:54   ` [PATCH 05/17] common/xfs: capture external logs during metadump/mdrestore Darrick J. Wong
                     ` (12 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:53 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Create metadump v2 files by default.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |    8 ++++++++
 1 file changed, 8 insertions(+)


diff --git a/common/xfs b/common/xfs
index 77ba786ece..4e8630b3ab 100644
--- a/common/xfs
+++ b/common/xfs
@@ -669,6 +669,14 @@ _xfs_metadump() {
 	local options="$@"
 	test -z "$options" && options="-a -o"
 
+	# Use metadump v2 format unless the user gave us a specific version
+	$XFS_METADUMP_PROG --help 2>&1 | grep -q -- '-v version' && \
+			metadump_has_v2=1
+
+	if ! echo "$options" | grep -q -- '-v' && [ -n "$metadump_has_v2" ]; then
+		options="$options -v 2"
+	fi
+
 	if [ "$logdev" != "none" ]; then
 		options="$options -l $logdev"
 	fi


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/17] common/xfs: capture external logs during metadump/mdrestore
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:53   ` [PATCH 04/17] xfs: use metadump v2 format by default Darrick J. Wong
@ 2023-12-27 13:54   ` Darrick J. Wong
  2023-12-27 13:54   ` [PATCH 06/17] xfs/122: update for rtgroups Darrick J. Wong
                     ` (11 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:54 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

If xfs_mdrestore supports the -l switch and there's an external scratch
log, pass the option so that we can restore log contents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |   40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)


diff --git a/common/xfs b/common/xfs
index 4e8630b3ab..175de69c63 100644
--- a/common/xfs
+++ b/common/xfs
@@ -668,6 +668,7 @@ _xfs_metadump() {
 	shift; shift; shift; shift
 	local options="$@"
 	test -z "$options" && options="-a -o"
+	local metadump_has_dash_x
 
 	# Use metadump v2 format unless the user gave us a specific version
 	$XFS_METADUMP_PROG --help 2>&1 | grep -q -- '-v version' && \
@@ -694,6 +695,12 @@ _xfs_mdrestore() {
 	local logdev="$3"
 	shift; shift; shift
 	local options="$@"
+	local need_repair
+	local mdrestore_has_dash_l
+
+	# Does mdrestore support restoring to external devices?
+	$XFS_MDRESTORE_PROG --help 2>&1 | grep -q -- '-l logdev' &&
+			mdrestore_has_dash_l=1
 
 	# If we're configured for compressed dumps and there isn't already an
 	# uncompressed dump, see if we can use DUMP_COMPRESSOR to decompress
@@ -705,19 +712,38 @@ _xfs_mdrestore() {
 	fi
 	test -r "$metadump" || return 1
 
+	if [ "$logdev" != "none" ]; then
+		# We have an external log device.  If mdrestore supports
+		# restoring to it, configure ourselves to do that.
+		if [ -n "$mdrestore_has_dash_l" ]; then
+			options="$options -l $logdev"
+		fi
+
+		# Wipe the log device.  If mdrestore doesn't support restoring
+		# to external log devices or the metadump file doesn't capture
+		# the log contents, this is our only chance to signal that the
+		# log header needs to be rewritten.
+		$XFS_IO_PROG -d -c 'pwrite -S 0 -q 0 1m' "$logdev"
+	fi
+
 	$XFS_MDRESTORE_PROG $options "${metadump}" "${device}"
 	res=$?
 	test $res -ne 0 && return $res
 
-	# mdrestore does not know how to restore an external log.  If there is
-	# one, we need to erase the log header and run xfs_repair to format a
-	# new log header onto the log device.
+	# If there's an external log, check to see if the restore rewrote the
+	# log header.  If not, we need to run xfs_repair to format a new log
+	# header onto the log device.
 	if [ "$logdev" != "none" ]; then
-		$XFS_IO_PROG -d -c 'pwrite -S 0 -q 0 1m' "$logdev"
-		_scratch_xfs_repair >> $seqres.full 2>&1
-		res=$?
+		magic="$($XFS_IO_PROG -c 'pread -q -v 0 4' "$logdev")"
+		if [ "$magic" = "00000000:  00 00 00 00  ...." ]; then
+			need_repair=1
+		fi
 	fi
-	return $res
+
+	test -z "$need_repair" && return 0
+
+	echo "repairing fs to fix uncaptured parts of fs." >> $seqres.full
+	_scratch_xfs_repair >> $seqres.full 2>&1
 }
 
 # Snapshot the metadata on the scratch device


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/17] xfs/122: update for rtgroups
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:54   ` [PATCH 05/17] common/xfs: capture external logs during metadump/mdrestore Darrick J. Wong
@ 2023-12-27 13:54   ` Darrick J. Wong
  2023-12-27 13:54   ` [PATCH 07/17] punch-alternating: detect xfs realtime files with large allocation units Darrick J. Wong
                     ` (10 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:54 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Add our new metadata for realtime allocation groups to the ondisk checking.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122.out |    5 +++++
 1 file changed, 5 insertions(+)


diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 430b805792..21b139d838 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -44,6 +44,9 @@ offsetof(xfs_sb_t, sb_rbmino) = 64
 offsetof(xfs_sb_t, sb_rextents) = 24
 offsetof(xfs_sb_t, sb_rextsize) = 80
 offsetof(xfs_sb_t, sb_rextslog) = 125
+offsetof(xfs_sb_t, sb_rgblklog) = 280
+offsetof(xfs_sb_t, sb_rgblocks) = 272
+offsetof(xfs_sb_t, sb_rgcount) = 276
 offsetof(xfs_sb_t, sb_rootino) = 56
 offsetof(xfs_sb_t, sb_rrmapino) = 264
 offsetof(xfs_sb_t, sb_rsumino) = 72
@@ -116,9 +119,11 @@ sizeof(struct xfs_refcount_key) = 4
 sizeof(struct xfs_refcount_rec) = 12
 sizeof(struct xfs_rmap_key) = 20
 sizeof(struct xfs_rmap_rec) = 24
+sizeof(struct xfs_rtgroup_geometry) = 128
 sizeof(struct xfs_rtrmap_key) = 24
 sizeof(struct xfs_rtrmap_rec) = 32
 sizeof(struct xfs_rtrmap_root) = 4
+sizeof(struct xfs_rtsb) = 104
 sizeof(struct xfs_rud_log_format) = 16
 sizeof(struct xfs_rui_log_format) = 16
 sizeof(struct xfs_scrub_metadata) = 64


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/17] punch-alternating: detect xfs realtime files with large allocation units
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:54   ` [PATCH 06/17] xfs/122: update for rtgroups Darrick J. Wong
@ 2023-12-27 13:54   ` Darrick J. Wong
  2023-12-27 13:54   ` [PATCH 08/17] xfs/206: update mkfs filtering for rt groups feature Darrick J. Wong
                     ` (9 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:54 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

For files on the XFS realtime volume, it's possible that the file
allocation unit (aka the minimum size we have to punch to deallocate
file blocks) could be greater than a single fs block.  This utility
assumed that it's always possible to punch a single fs block, but for
these types of files, all that does is zeroes the page cache.  While
that's what most *user applications* want, fstests uses punching to
fragment file mapping metadata and/or fragment free space, so adapt this
test for that purpose by detecting realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 src/punch-alternating.c |   28 +++++++++++++++++++++++++++-
 tests/xfs/114           |    4 ++++
 tests/xfs/146           |    2 +-
 tests/xfs/187           |    3 ++-
 tests/xfs/341           |    4 ++--
 5 files changed, 36 insertions(+), 5 deletions(-)


diff --git a/src/punch-alternating.c b/src/punch-alternating.c
index 18dd215197..d2bb4b6a22 100644
--- a/src/punch-alternating.c
+++ b/src/punch-alternating.c
@@ -20,6 +20,28 @@ void usage(char *cmd)
 	exit(1);
 }
 
+/* Compute the file allocation unit size for an XFS file. */
+static int detect_xfs_alloc_unit(int fd)
+{
+	struct fsxattr fsx;
+	struct xfs_fsop_geom fsgeom;
+	int ret;
+
+	ret = ioctl(fd, XFS_IOC_FSGEOMETRY, &fsgeom);
+	if (ret)
+		return -1;
+
+	ret = ioctl(fd, XFS_IOC_FSGETXATTR, &fsx);
+	if (ret)
+		return -1;
+
+	ret = fsgeom.blocksize;
+	if (fsx.fsx_xflags & XFS_XFLAG_REALTIME)
+		ret *= fsgeom.rtextsize;
+
+	return ret;
+}
+
 int main(int argc, char *argv[])
 {
 	struct stat	s;
@@ -82,7 +104,11 @@ int main(int argc, char *argv[])
 		goto err;
 
 	sz = s.st_size;
-	blksz = sf.f_bsize;
+	c = detect_xfs_alloc_unit(fd);
+	if (c > 0)
+		blksz = c;
+	else
+		blksz = sf.f_bsize;
 
 	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 	for (offset = start_offset * blksz;
diff --git a/tests/xfs/114 b/tests/xfs/114
index 0e8a0529ab..7ecb4d217c 100755
--- a/tests/xfs/114
+++ b/tests/xfs/114
@@ -49,6 +49,10 @@ $XFS_IO_PROG -f \
 	-c "pwrite -S 0x68 -b 1048576 0 $len2" \
 	$SCRATCH_MNT/f2 >> $seqres.full
 
+# The arguments to punch-alternating must be specified in units of file
+# allocation units, so we divide the argument by $file_blksz.  We already
+# verified that $blksz is congruent with $file_blksz, so the fpunch parameters
+# will always align with the file allocation unit.
 $here/src/punch-alternating -o $((16 * blksz / file_blksz)) \
 	-s $((blksz / file_blksz)) \
 	-i $((blksz * 2 / file_blksz)) \
diff --git a/tests/xfs/146 b/tests/xfs/146
index 5090435976..5d30cd8123 100755
--- a/tests/xfs/146
+++ b/tests/xfs/146
@@ -69,7 +69,7 @@ _xfs_force_bdev realtime $SCRATCH_MNT
 # Allocate some stuff at the start, to force the first falloc of the ouch file
 # to happen somewhere in the middle of the rt volume
 $XFS_IO_PROG -f -c 'falloc 0 64m' "$SCRATCH_MNT/b"
-$here/src/punch-alternating -i $((rextblks * 2)) -s $((rextblks)) "$SCRATCH_MNT/b"
+$here/src/punch-alternating "$SCRATCH_MNT/b"
 
 avail="$(df -P "$SCRATCH_MNT" | awk 'END {print $4}')"1
 toobig="$((avail * 2))"
diff --git a/tests/xfs/187 b/tests/xfs/187
index 7c34d8e630..14c3b37670 100755
--- a/tests/xfs/187
+++ b/tests/xfs/187
@@ -132,7 +132,8 @@ $XFS_IO_PROG -f -c "truncate $required_sz" -c "falloc 0 $remap_sz" $SCRATCH_MNT/
 # Punch out every other extent of the last two sections, to fragment free space.
 frag_sz=$((remap_sz * 3))
 punch_off=$((bigfile_sz - frag_sz))
-$here/src/punch-alternating $SCRATCH_MNT/bigfile -o $((punch_off / fsbsize)) -i $((rtextsize_blks * 2)) -s $rtextsize_blks
+rtextsize_bytes=$((fsbsize * rtextsize_blks))
+$here/src/punch-alternating $SCRATCH_MNT/bigfile -o $((punch_off / rtextsize_bytes))
 
 # Make sure we have some free rtextents.
 free_rtx=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep statfs.f_bavail | awk '{print $3}')
diff --git a/tests/xfs/341 b/tests/xfs/341
index 1f734c9015..7d2842b579 100755
--- a/tests/xfs/341
+++ b/tests/xfs/341
@@ -43,8 +43,8 @@ len=$((blocks * rtextsz))
 echo "Create some files"
 $XFS_IO_PROG -f -R -c "falloc 0 $len" -c "pwrite -S 0x68 -b 1048576 0 $len" $SCRATCH_MNT/f1 >> $seqres.full
 $XFS_IO_PROG -f -R -c "falloc 0 $len" -c "pwrite -S 0x68 -b 1048576 0 $len" $SCRATCH_MNT/f2 >> $seqres.full
-$here/src/punch-alternating -i $((2 * rtextsz_blks)) -s $rtextsz_blks $SCRATCH_MNT/f1 >> "$seqres.full"
-$here/src/punch-alternating -i $((2 * rtextsz_blks)) -s $rtextsz_blks $SCRATCH_MNT/f2 >> "$seqres.full"
+$here/src/punch-alternating $SCRATCH_MNT/f1 >> "$seqres.full"
+$here/src/punch-alternating $SCRATCH_MNT/f2 >> "$seqres.full"
 echo garbage > $SCRATCH_MNT/f3
 ino=$(stat -c '%i' $SCRATCH_MNT/f3)
 _scratch_unmount


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/17] xfs/206: update mkfs filtering for rt groups feature
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:54   ` [PATCH 07/17] punch-alternating: detect xfs realtime files with large allocation units Darrick J. Wong
@ 2023-12-27 13:54   ` Darrick J. Wong
  2023-12-27 13:55   ` [PATCH 09/17] common: pass the realtime device to xfs_db when possible Darrick J. Wong
                     ` (8 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:54 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Filter out the new mkfs lines that show the rtgroup information, since
this test is heavily dependent on old mkfs output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/206 |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/206 b/tests/xfs/206
index 9fd9400daf..4aceb12715 100755
--- a/tests/xfs/206
+++ b/tests/xfs/206
@@ -66,7 +66,8 @@ mkfs_filter()
 	    -e "/.*crc=/d" \
 	    -e "/^Default configuration/d" \
 	    -e 's/, parent=[01]//' \
-	    -e '/metadir=.*/d'
+	    -e '/metadir=.*/d' \
+	    -e '/rgcount=/d'
 }
 
 # mkfs slightly smaller than that, small log for speed.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/17] common: pass the realtime device to xfs_db when possible
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:54   ` [PATCH 08/17] xfs/206: update mkfs filtering for rt groups feature Darrick J. Wong
@ 2023-12-27 13:55   ` Darrick J. Wong
  2023-12-27 13:55   ` [PATCH 10/17] common: filter rtgroups when we're disabling metadir Darrick J. Wong
                     ` (7 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:55 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Teach xfstests to pass the realtime device to xfs_db when it supports
that option.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |   14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)


diff --git a/common/xfs b/common/xfs
index 175de69c63..aa1c23dd54 100644
--- a/common/xfs
+++ b/common/xfs
@@ -282,10 +282,10 @@ _xfs_check()
 {
 	OPTS=" "
 	DBOPTS=" "
-	USAGE="Usage: xfs_check [-fsvV] [-l logdev] [-i ino]... [-b bno]... special"
+	USAGE="Usage: xfs_check [-fsvV] [-l logdev] [-r rtdev] [-i ino]... [-b bno]... special"
 
 	OPTIND=1
-	while getopts "b:fi:l:stvV" c; do
+	while getopts "b:fi:l:stvVR:" c; do
 		case $c in
 			s) OPTS=$OPTS"-s ";;
 			t) OPTS=$OPTS"-t ";;
@@ -297,6 +297,7 @@ _xfs_check()
 			V) $XFS_DB_PROG -p xfs_check -V
 			   return $?
 			   ;;
+			r) DBOPTS="$DBOPTS -R $OPTARG";;
 		esac
 	done
 	set -- extra $@
@@ -340,6 +341,10 @@ _scratch_xfs_db_options()
 	SCRATCH_OPTIONS=""
 	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
 		SCRATCH_OPTIONS="-l$SCRATCH_LOGDEV"
+	if [ "$USE_EXTERNAL" = yes ] && [ ! -z "$SCRATCH_RTDEV" ]; then
+		$XFS_DB_PROG --help 2>&1 | grep -q -- '-R rtdev' && \
+			SCRATCH_OPTIONS="$SCRATCH_OPTIONS -R$SCRATCH_RTDEV"
+	fi
 	echo $SCRATCH_OPTIONS $* $SCRATCH_DEV
 }
 
@@ -404,6 +409,11 @@ _scratch_xfs_check()
 		SCRATCH_OPTIONS="-l $SCRATCH_LOGDEV"
 	[ "$LARGE_SCRATCH_DEV" = yes ] && \
 		SCRATCH_OPTIONS=$SCRATCH_OPTIONS" -t"
+	if [ "$USE_EXTERNAL" = yes ] && [ ! -z "$SCRATCH_RTDEV" ]; then
+		$XFS_DB_PROG --help 2>&1 | grep -q -- '-R rtdev' || \
+			_notrun 'xfs_db does not support rt devices'
+		SCRATCH_OPTIONS="$SCRATCH_OPTIONS -R$SCRATCH_RTDEV"
+	fi
 	_xfs_check $SCRATCH_OPTIONS $* $SCRATCH_DEV
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/17] common: filter rtgroups when we're disabling metadir
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 13:55   ` [PATCH 09/17] common: pass the realtime device to xfs_db when possible Darrick J. Wong
@ 2023-12-27 13:55   ` Darrick J. Wong
  2023-12-27 13:55   ` [PATCH 11/17] xfs/185: update for rtgroups Darrick J. Wong
                     ` (6 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:55 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

If we're forcing a filesystem to be created without the metadir feature,
we should forcibly disable rtgroups as well.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/common/xfs b/common/xfs
index aa1c23dd54..e66e11f15d 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1936,6 +1936,10 @@ _scratch_xfs_find_metafile()
 # Force metadata directories off.
 _scratch_xfs_force_no_metadir()
 {
+	if echo "$MKFS_OPTIONS" | grep -q 'rtgroups='; then
+		MKFS_OPTIONS="$(echo "$MKFS_OPTIONS" | sed -e 's/rtgroups=\([01]\)/rtgroups=0/g')"
+	fi
+
 	if echo "$MKFS_OPTIONS" | grep -q 'metadir='; then
 		MKFS_OPTIONS="$(echo "$MKFS_OPTIONS" | sed -e 's/metadir=\([01]\)/metadir=0/g')"
 		return


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/17] xfs/185: update for rtgroups
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 13:55   ` [PATCH 10/17] common: filter rtgroups when we're disabling metadir Darrick J. Wong
@ 2023-12-27 13:55   ` Darrick J. Wong
  2023-12-27 13:55   ` [PATCH 12/17] xfs/449: update test to know about xfs_db -R Darrick J. Wong
                     ` (5 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:55 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Send the fallocate results to seqres.full, since it doesn't matter if
the call fails as long as we get the layout that we wanted.  This test
already has code to check the layout, so there's no point in failing on
random ENOSPC errors.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/185 |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


diff --git a/tests/xfs/185 b/tests/xfs/185
index abeb052580..04770fd6c9 100755
--- a/tests/xfs/185
+++ b/tests/xfs/185
@@ -100,7 +100,7 @@ test "$ddbytes" -lt "$((rtbytes + (10 * rtextsize) ))" || \
 # easy because fallocate for the first rt file always starts allocating at
 # physical offset zero.
 alloc_rtx="$((rtbytes / rtextsize))"
-$XFS_IO_PROG -c "falloc 0 $((alloc_rtx * rtextsize))" $rtfile
+$XFS_IO_PROG -c "falloc 0 $((alloc_rtx * rtextsize))" $rtfile &>> $seqres.full
 
 expected_end="$(( (alloc_rtx * rtextsize - 1) / 512 ))"
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/17] xfs/449: update test to know about xfs_db -R
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-27 13:55   ` [PATCH 11/17] xfs/185: update for rtgroups Darrick J. Wong
@ 2023-12-27 13:55   ` Darrick J. Wong
  2023-12-27 13:56   ` [PATCH 13/17] xfs/122: update for rtbitmap headers Darrick J. Wong
                     ` (4 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:55 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The realtime groups feature added a -R flag to xfs_db so that users can
pass in the realtime device.  Since we've now modified the
_scratch_xfs_db to use this facility, we can update the test to do exact
comparisons of the xfs_db info command against the mkfs output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/449 |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/449 b/tests/xfs/449
index 5374bf2f85..66c443e994 100755
--- a/tests/xfs/449
+++ b/tests/xfs/449
@@ -32,7 +32,11 @@ echo DB >> $seqres.full
 cat $tmp.dbinfo >> $seqres.full
 # xfs_db doesn't take a rtdev argument, so it reports "realtime=external".
 # mkfs does, so make a quick substitution
-diff -u <(cat $tmp.mkfs | sed -e 's/realtime =\/.*extsz=/realtime =external               extsz=/g') $tmp.dbinfo
+if $XFS_DB_PROG --help 2>&1 | grep -q -- '-R rtdev'; then
+	diff -u $tmp.mkfs $tmp.dbinfo
+else
+	diff -u <(cat $tmp.mkfs | sed -e 's/realtime =\/.*extsz=/realtime =external               extsz=/g') $tmp.dbinfo
+fi
 
 _scratch_mount
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/17] xfs/122: update for rtbitmap headers
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-27 13:55   ` [PATCH 12/17] xfs/449: update test to know about xfs_db -R Darrick J. Wong
@ 2023-12-27 13:56   ` Darrick J. Wong
  2023-12-27 13:56   ` [PATCH 14/17] xfs/122: udpate test to pick up rtword/suminfo ondisk unions Darrick J. Wong
                     ` (3 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:56 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122.out |    1 +
 1 file changed, 1 insertion(+)


diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 21b139d838..8bb79ba959 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -119,6 +119,7 @@ sizeof(struct xfs_refcount_key) = 4
 sizeof(struct xfs_refcount_rec) = 12
 sizeof(struct xfs_rmap_key) = 20
 sizeof(struct xfs_rmap_rec) = 24
+sizeof(struct xfs_rtbuf_blkinfo) = 48
 sizeof(struct xfs_rtgroup_geometry) = 128
 sizeof(struct xfs_rtrmap_key) = 24
 sizeof(struct xfs_rtrmap_rec) = 32


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/17] xfs/122: udpate test to pick up rtword/suminfo ondisk unions
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-27 13:56   ` [PATCH 13/17] xfs/122: update for rtbitmap headers Darrick J. Wong
@ 2023-12-27 13:56   ` Darrick J. Wong
  2023-12-27 13:56   ` [PATCH 15/17] xfs/27[46],xfs/556: fix tests to deal with rtgroups output in bmap/fsmap commands Darrick J. Wong
                     ` (2 subsequent siblings)
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:56 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Update this test to check that the ondisk unions for rt bitmap word and
rt summary counts are always the correct size.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122     |    2 +-
 tests/xfs/122.out |    2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/122 b/tests/xfs/122
index ba927c77c4..4e5ba1dfee 100755
--- a/tests/xfs/122
+++ b/tests/xfs/122
@@ -195,7 +195,7 @@ echo 'int main(int argc, char *argv[]) {' >>$cprog
 #
 cat /usr/include/xfs/xfs*.h | indent |\
 _attribute_filter |\
-grep -E '(} *xfs_.*_t|^struct xfs_[a-z0-9_]*$)' |\
+grep -E '(} *xfs_.*_t|^(union|struct) xfs_[a-z0-9_]*$)' |\
 grep -E -v -f $tmp.ignore |\
 sed -e 's/^.*}[[:space:]]*//g' -e 's/;.*$//g' -e 's/_t, /_t\n/g' |\
 sort | uniq |\
diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 8bb79ba959..6b03b90c2d 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -134,6 +134,8 @@ sizeof(struct xfs_swap_extent) = 64
 sizeof(struct xfs_sxd_log_format) = 16
 sizeof(struct xfs_sxi_log_format) = 80
 sizeof(struct xfs_unmount_log_format) = 8
+sizeof(union xfs_rtword_raw) = 4
+sizeof(union xfs_suminfo_raw) = 4
 sizeof(xfs_agf_t) = 224
 sizeof(xfs_agfl_t) = 36
 sizeof(xfs_agi_t) = 344


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/17] xfs/27[46],xfs/556: fix tests to deal with rtgroups output in bmap/fsmap commands
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-27 13:56   ` [PATCH 14/17] xfs/122: udpate test to pick up rtword/suminfo ondisk unions Darrick J. Wong
@ 2023-12-27 13:56   ` Darrick J. Wong
  2023-12-27 13:56   ` [PATCH 16/17] common/xfs: capture realtime devices during metadump/mdrestore Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 17/17] common/fuzzy: adapt the scrub stress tests to support rtgroups Darrick J. Wong
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:56 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Fix these tests to deal with the xfs_io bmap and fsmap commands printing
out realtime group numbers if the feature is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs    |    7 +++++++
 tests/xfs/271 |    3 ++-
 tests/xfs/556 |   16 ++++++++++------
 3 files changed, 19 insertions(+), 7 deletions(-)


diff --git a/common/xfs b/common/xfs
index e66e11f15d..1136d685e7 100644
--- a/common/xfs
+++ b/common/xfs
@@ -485,6 +485,13 @@ _xfs_has_feature()
 		feat="rtextents"
 		feat_regex="[1-9][0-9]*"
 		;;
+	"rtgroups")
+		# any fs with rtgroups enabled will have a nonzero rt group
+		# size, even if there is no rt device (and hence zero actual
+		# groups)
+		feat="rgsize"
+		feat_regex="[1-9][0-9]*"
+		;;
 	esac
 
 	local answer="$($XFS_INFO_PROG "$fs" 2>&1 | grep -E -w -c "$feat=$feat_regex")"
diff --git a/tests/xfs/271 b/tests/xfs/271
index d67ac4d6c1..74e2c822c1 100755
--- a/tests/xfs/271
+++ b/tests/xfs/271
@@ -31,6 +31,7 @@ _scratch_mkfs > "$seqres.full" 2>&1
 _scratch_mount
 
 agcount=$(_xfs_mount_agcount $SCRATCH_MNT)
+rgcount=$(_xfs_mount_rgcount $SCRATCH_MNT)
 
 # mkfs lays out btree root blocks in the order bnobt, cntbt, inobt, finobt,
 # rmapbt, refcountbt, and then allocates AGFL blocks.  Since GETFSMAP has the
@@ -48,7 +49,7 @@ cat $TEST_DIR/fsmap >> $seqres.full
 
 echo "Check AG header" | tee -a $seqres.full
 grep 'static fs metadata[[:space:]]*[0-9]*[[:space:]]*(0\.\.' $TEST_DIR/fsmap | tee -a $seqres.full > $TEST_DIR/testout
-_within_tolerance "AG header count" $(wc -l < $TEST_DIR/testout) $agcount 0 -v
+_within_tolerance "AG header count" $(wc -l < $TEST_DIR/testout) $((agcount + rgcount)) 0 -v
 
 echo "Check freesp/rmap btrees" | tee -a $seqres.full
 grep 'per-AG metadata[[:space:]]*[0-9]*[[:space:]]*([0-9]*\.\.' $TEST_DIR/fsmap | tee -a $seqres.full > $TEST_DIR/testout
diff --git a/tests/xfs/556 b/tests/xfs/556
index 061d8d5723..5940226223 100755
--- a/tests/xfs/556
+++ b/tests/xfs/556
@@ -47,16 +47,20 @@ victim=$SCRATCH_MNT/a
 file_blksz=$(_get_file_block_size $SCRATCH_MNT)
 $XFS_IO_PROG -f -c "pwrite -S 0x58 0 $((4 * file_blksz))" -c "fsync" $victim >> $seqres.full
 unset errordev
-_xfs_is_realtime_file $victim && errordev="RT"
+
+awk_len_prog='{print $6}'
+if _xfs_is_realtime_file $victim; then
+	if ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+		awk_len_prog='{print $4}'
+	fi
+	errordev="RT"
+fi
 bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
 echo "$errordev:$bmap_str" >> $seqres.full
 
 phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
-if [ "$errordev" = "RT" ]; then
-	len="$(echo "$bmap_str" | $AWK_PROG '{print $4}')"
-else
-	len="$(echo "$bmap_str" | $AWK_PROG '{print $6}')"
-fi
+len="$(echo "$bmap_str" | $AWK_PROG "$awk_len_prog")"
+
 fs_blksz=$(_get_block_size $SCRATCH_MNT)
 echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
 kernel_sectors_per_fs_block=$((fs_blksz / 512))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/17] common/xfs: capture realtime devices during metadump/mdrestore
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-27 13:56   ` [PATCH 15/17] xfs/27[46],xfs/556: fix tests to deal with rtgroups output in bmap/fsmap commands Darrick J. Wong
@ 2023-12-27 13:56   ` Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 17/17] common/fuzzy: adapt the scrub stress tests to support rtgroups Darrick J. Wong
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:56 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

If xfs_metadump supports the -r switch to capture the contents of
realtime devices and there is a realtime device, add the option to the
command line to enable preservation.

Similarly, if xfs_mdrestore supports the -r switch and there's an
external scratch rtdev, pass the option so that we can restore rtdev
contents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/fuzzy    |    6 ++++-
 common/populate |   12 ++++++++--
 common/xfs      |   65 +++++++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 71 insertions(+), 12 deletions(-)


diff --git a/common/fuzzy b/common/fuzzy
index b72ee3f67f..d504f0854e 100644
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -309,7 +309,11 @@ __scratch_xfs_fuzz_mdrestore()
 	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
 		logdev=$SCRATCH_LOGDEV
 
-	_xfs_mdrestore "${POPULATE_METADUMP}" "${SCRATCH_DEV}" "${logdev}" || \
+	local rtdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ] && \
+		rtdev=$SCRATCH_RTDEV
+
+	_xfs_mdrestore "${POPULATE_METADUMP}" "${SCRATCH_DEV}" "${logdev}" "${rtdev}" || \
 		_fail "${POPULATE_METADUMP}: Could not find metadump to restore?"
 }
 
diff --git a/common/populate b/common/populate
index 450b024bfc..dc89eee70e 100644
--- a/common/populate
+++ b/common/populate
@@ -1017,7 +1017,11 @@ _scratch_populate_restore_cached() {
 
 	case "${FSTYP}" in
 	"xfs")
-		_xfs_mdrestore "${metadump}" "${SCRATCH_DEV}" "${logdev}"
+		local rtdev=none
+		[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ] && \
+			rtdev=$SCRATCH_RTDEV
+
+		_xfs_mdrestore "${metadump}" "${SCRATCH_DEV}" "${logdev}" "${rtdev}"
 		return $?
 		;;
 	"ext2"|"ext3"|"ext4")
@@ -1039,8 +1043,12 @@ _scratch_populate_save_metadump()
 		[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
 			logdev=$SCRATCH_LOGDEV
 
+		local rtdev=none
+		[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ] && \
+			rtdev=$SCRATCH_RTDEV
+
 		_xfs_metadump "$metadump_file" "$SCRATCH_DEV" "$logdev" \
-				compress
+				"$rtdev" compress
 		res=$?
 		;;
 	"ext2"|"ext3"|"ext4")
diff --git a/common/xfs b/common/xfs
index 1136d685e7..1ff81f4cc2 100644
--- a/common/xfs
+++ b/common/xfs
@@ -681,11 +681,17 @@ _xfs_metadump() {
 	local metadump="$1"
 	local device="$2"
 	local logdev="$3"
-	local compressopt="$4"
-	shift; shift; shift; shift
+	local rtdev="$4"
+	local compressopt="$5"
+	shift; shift; shift; shift; shift
 	local options="$@"
 	test -z "$options" && options="-a -o"
 	local metadump_has_dash_x
+	local metadump_has_dash_r
+
+	# Does metadump support capturing realtime devices?
+	$XFS_METADUMP_PROG --help 2>&1 | grep -q -- '-r rtdev' && \
+			metadump_has_dash_r=1
 
 	# Use metadump v2 format unless the user gave us a specific version
 	$XFS_METADUMP_PROG --help 2>&1 | grep -q -- '-v version' && \
@@ -699,6 +705,10 @@ _xfs_metadump() {
 		options="$options -l $logdev"
 	fi
 
+	if [ "$rtdev" != "none" ] && [ -n "$metadump_has_dash_r" ]; then
+		options="$options -r $rtdev"
+	fi
+
 	$XFS_METADUMP_PROG $options "$device" "$metadump"
 	res=$?
 	[ "$compressopt" = "compress" ] && [ -n "$DUMP_COMPRESSOR" ] &&
@@ -710,14 +720,19 @@ _xfs_mdrestore() {
 	local metadump="$1"
 	local device="$2"
 	local logdev="$3"
-	shift; shift; shift
+	local rtdev="$4"
+	shift; shift; shift; shift
 	local options="$@"
 	local need_repair
 	local mdrestore_has_dash_l
+	local mdrestore_has_dash_r
 
 	# Does mdrestore support restoring to external devices?
 	$XFS_MDRESTORE_PROG --help 2>&1 | grep -q -- '-l logdev' &&
 			mdrestore_has_dash_l=1
+	# Does mdrestore support restoring to realtime devices?
+	$XFS_MDRESTORE_PROG --help 2>&1 | grep -q -- '-r rtdev' &&
+			mdrestore_has_dash_r=1
 
 	# If we're configured for compressed dumps and there isn't already an
 	# uncompressed dump, see if we can use DUMP_COMPRESSOR to decompress
@@ -743,6 +758,20 @@ _xfs_mdrestore() {
 		$XFS_IO_PROG -d -c 'pwrite -S 0 -q 0 1m' "$logdev"
 	fi
 
+	if [ "$rtdev" != "none" ]; then
+		# We have a realtime device.  If mdrestore supports restoring
+		# to it, configure ourselves to do that.
+		if [ -n "$mdrestore_has_dash_r" ]; then
+			options="$options -r $rtdev"
+		fi
+
+		# Wipe the realtime device.  If mdrestore doesn't support
+		# restoring to realtime devices or the metadump file doesn't
+		# capture the realtime group headers, this is our only chance
+		# to signal that the log header needs to be rewritten.
+		$XFS_IO_PROG -d -c 'pwrite -S 0 -q 0 1m' "$rtdev"
+	fi
+
 	$XFS_MDRESTORE_PROG $options "${metadump}" "${device}"
 	res=$?
 	test $res -ne 0 && return $res
@@ -757,6 +786,16 @@ _xfs_mdrestore() {
 		fi
 	fi
 
+	# If there's a realtime device, check to see if the restore rewrote the
+	# rt group headers.  If not, we need to run xfs_repair to format new
+	# group headers onto the realtime device.
+	if [ "$rtdev" != "none" ] && [ -z "$need_repair" ]; then
+		magic="$($XFS_IO_PROG -c 'pread -q -v 0 4' "$rtdev")"
+		if [ "$magic" = "00000000:  00 00 00 00  ...." ]; then
+			need_repair=1
+		fi
+	fi
+
 	test -z "$need_repair" && return 0
 
 	echo "repairing fs to fix uncaptured parts of fs." >> $seqres.full
@@ -768,12 +807,16 @@ _scratch_xfs_metadump()
 {
 	local metadump=$1
 	shift
+
 	local logdev=none
-
 	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
 		logdev=$SCRATCH_LOGDEV
 
-	_xfs_metadump "$metadump" "$SCRATCH_DEV" "$logdev" nocompress "$@"
+	local rtdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ] && \
+		rtdev=$SCRATCH_RTDEV
+
+	_xfs_metadump "$metadump" "$SCRATCH_DEV" "$logdev" "$rtdev" nocompress "$@"
 }
 
 # Restore snapshotted metadata on the scratch device
@@ -786,7 +829,11 @@ _scratch_xfs_mdrestore()
 	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
 		logdev=$SCRATCH_LOGDEV
 
-	_xfs_mdrestore "$metadump" "$SCRATCH_DEV" "$logdev" "$@"
+	local rtdev=none
+	[ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ] && \
+		rtdev=$SCRATCH_RTDEV
+
+	_xfs_mdrestore "$metadump" "$SCRATCH_DEV" "$logdev" "$rtdev" "$@"
 }
 
 # Do not use xfs_repair (offline fsck) to rebuild the filesystem
@@ -923,7 +970,7 @@ _check_xfs_filesystem()
 	if [ "$ok" -ne 1 ] && [ "$DUMP_CORRUPT_FS" = "1" ]; then
 		local flatdev="$(basename "$device")"
 		_xfs_metadump "$seqres.$flatdev.check.md" "$device" "$logdev" \
-			compress >> $seqres.full
+			"$rtdev" compress >> $seqres.full
 	fi
 
 	# Optionally test the index rebuilding behavior.
@@ -956,7 +1003,7 @@ _check_xfs_filesystem()
 		if [ "$rebuild_ok" -ne 1 ] && [ "$DUMP_CORRUPT_FS" = "1" ]; then
 			local flatdev="$(basename "$device")"
 			_xfs_metadump "$seqres.$flatdev.rebuild.md" "$device" \
-				"$logdev" compress >> $seqres.full
+				"$logdev" "$rtdev" compress >> $seqres.full
 		fi
 	fi
 
@@ -1040,7 +1087,7 @@ _check_xfs_filesystem()
 		if [ "$orebuild_ok" -ne 1 ] && [ "$DUMP_CORRUPT_FS" = "1" ]; then
 			local flatdev="$(basename "$device")"
 			_xfs_metadump "$seqres.$flatdev.orebuild.md" "$device" \
-				"$logdev" compress >> $seqres.full
+				"$logdev" "$rtdev" compress >> $seqres.full
 		fi
 	fi
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/17] common/fuzzy: adapt the scrub stress tests to support rtgroups
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-27 13:56   ` [PATCH 16/17] common/xfs: capture realtime devices during metadump/mdrestore Darrick J. Wong
@ 2023-12-27 13:57   ` Darrick J. Wong
  16 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:57 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Adapt the scrub stress testing framework to support checking realtime
group metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/fuzzy  |   27 ++++++++++++++++++++++-----
 common/xfs    |    9 +++++++++
 tests/xfs/581 |    2 +-
 tests/xfs/720 |    2 +-
 tests/xfs/795 |    2 +-
 5 files changed, 34 insertions(+), 8 deletions(-)


diff --git a/common/fuzzy b/common/fuzzy
index d504f0854e..2de06622e5 100644
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -847,8 +847,10 @@ __stress_one_scrub_loop() {
 	local scrub_tgt="$3"
 	local scrub_startat="$4"
 	local start_agno="$5"
-	shift; shift; shift; shift; shift
+	local start_rgno="$6"
+	shift; shift; shift; shift; shift; shift
 	local agcount="$(_xfs_mount_agcount $SCRATCH_MNT)"
+	local rgcount="$(_xfs_mount_rgcount $SCRATCH_MNT)"
 
 	local xfs_io_args=()
 	for arg in "$@"; do
@@ -861,6 +863,12 @@ __stress_one_scrub_loop() {
 				local ag_arg="$(echo "$arg" | sed -e "s|%agno%|$agno|g")"
 				xfs_io_args+=('-c' "$ag_arg")
 			done
+		elif echo "$arg" | grep -q -w '%rgno%'; then
+			# Substitute the rtgroup number
+			for ((rgno = start_rgno; rgno < rgcount; rgno++)); do
+				local rg_arg="$(echo "$arg" | sed -e "s|%rgno%|$rgno|g")"
+				xfs_io_args+=('-c' "$rg_arg")
+			done
 		else
 			xfs_io_args+=('-c' "$arg")
 		fi
@@ -1245,7 +1253,9 @@ _scratch_xfs_stress_scrub_cleanup() {
 __stress_scrub_check_commands() {
 	local scrub_tgt="$1"
 	local start_agno="$2"
-	shift; shift
+	local start_rgno="$3"
+	shift; shift; shift
+	local rgcount="$(_xfs_mount_rgcount $SCRATCH_MNT)"
 
 	local cooked_tgt="$scrub_tgt"
 	case "$scrub_tgt" in
@@ -1275,6 +1285,10 @@ __stress_scrub_check_commands() {
 			cooked_arg="$(echo "$cooked_arg" | sed -e 's/^repair/repair -R/g')"
 		fi
 		cooked_arg="$(echo "$cooked_arg" | sed -e "s/%agno%/$start_agno/g")"
+		if echo "$cooked_arg" | grep -q -w '%rgno%'; then
+			test "$rgcount" -eq 0 && continue
+			cooked_arg="$(echo "$cooked_arg" | sed -e "s/%rgno%/$start_rgno/g")"
+		fi
 		testio=`$XFS_IO_PROG -x -c "$cooked_arg" "$cooked_tgt" 2>&1`
 		echo $testio | grep -q "Unknown type" && \
 			_notrun "xfs_io scrub subcommand support is missing"
@@ -1300,6 +1314,7 @@ __stress_scrub_check_commands() {
 #	in a separate loop.  If zero -i options are specified, do not run.
 #	Callers must check each of these commands (via _require_xfs_io_command)
 #	before calling here.
+# -R	For %rgno% substitution, start with this rtgroup instead of rtgroup 0.
 # -r	Run fsstress for this amount of time, then remount the fs ro or rw.
 #	The default is to run fsstress continuously with no remount, unless
 #	XFS_SCRUB_STRESS_REMOUNT_PERIOD is set.
@@ -1346,6 +1361,7 @@ _scratch_xfs_stress_scrub() {
 	local remount_period="${XFS_SCRUB_STRESS_REMOUNT_PERIOD}"
 	local stress_tgt="${XFS_SCRUB_STRESS_TARGET:-default}"
 	local start_agno=0
+	local start_rgno=0
 
 	__SCRUB_STRESS_FREEZE_PID=""
 	__SCRUB_STRESS_REMOUNT_LOOP=""
@@ -1353,12 +1369,13 @@ _scratch_xfs_stress_scrub() {
 	touch "$runningfile"
 
 	OPTIND=1
-	while getopts "a:fi:r:s:S:t:w:x:X:" c; do
+	while getopts "a:fi:r:R:s:S:t:w:x:X:" c; do
 		case "$c" in
 			a) start_agno="$OPTARG";;
 			f) freeze=yes;;
 			i) io_args+=("$OPTARG");;
 			r) remount_period="$OPTARG";;
+			R) start_rgno="$OPTARG";;
 			s) one_scrub_args+=("$OPTARG");;
 			S) xfs_scrub_args+=("$OPTARG");;
 			t) scrub_tgt="$OPTARG";;
@@ -1369,7 +1386,7 @@ _scratch_xfs_stress_scrub() {
 		esac
 	done
 
-	__stress_scrub_check_commands "$scrub_tgt" "$start_agno" \
+	__stress_scrub_check_commands "$scrub_tgt" "$start_agno" "$start_rgno" \
 			"${one_scrub_args[@]}"
 
 	if ! command -v "__stress_scrub_${exerciser}_loop" &>/dev/null; then
@@ -1425,7 +1442,7 @@ _scratch_xfs_stress_scrub() {
 
 	if [ "${#one_scrub_args[@]}" -gt 0 ]; then
 		__stress_one_scrub_loop "$end" "$runningfile" "$scrub_tgt" \
-				"$scrub_startat" "$start_agno" \
+				"$scrub_startat" "$start_agno" "$start_rgno" \
 				"${one_scrub_args[@]}" &
 	fi
 
diff --git a/common/xfs b/common/xfs
index 1ff81f4cc2..313b7045bd 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1584,6 +1584,15 @@ _xfs_mount_agcount()
 	$XFS_INFO_PROG "$1" | sed -n "s/^.*agcount=\([[:digit:]]*\).*/\1/p"
 }
 
+# Find rtgroup count of mounted filesystem
+_xfs_mount_rgcount()
+{
+	local rtgroups="$($XFS_INFO_PROG "$1" | grep rgcount= | sed -e 's/^.*rgcount=\([0-9]*\).*$/\1/g')"
+
+	test -z "$rtgroups" && rtgroups=0
+	echo "$rtgroups"
+}
+
 # Wipe the superblock of each XFS AGs
 _try_wipe_scratch_xfs()
 {
diff --git a/tests/xfs/581 b/tests/xfs/581
index 1d08bc7df3..353421130e 100755
--- a/tests/xfs/581
+++ b/tests/xfs/581
@@ -32,7 +32,7 @@ _require_xfs_stress_scrub
 _scratch_mkfs > "$seqres.full" 2>&1
 _scratch_mount
 _require_xfs_has_feature "$SCRATCH_MNT" realtime
-_scratch_xfs_stress_scrub -s "scrub rtbitmap"
+_scratch_xfs_stress_scrub -s "scrub rtbitmap"  -s "scrub rgbitmap %rgno%"
 
 # success, all done
 echo Silence is golden
diff --git a/tests/xfs/720 b/tests/xfs/720
index 2b6406da6e..3242a19b02 100755
--- a/tests/xfs/720
+++ b/tests/xfs/720
@@ -39,7 +39,7 @@ alloc_unit=$(_get_file_block_size $SCRATCH_MNT)
 scratchfile=$SCRATCH_MNT/file
 touch $scratchfile
 $XFS_IO_PROG -x -c 'inject force_repair' $SCRATCH_MNT
-__stress_scrub_check_commands "$scratchfile" "" 'repair bmapbtd'
+__stress_scrub_check_commands "$scratchfile" "" "" 'repair bmapbtd'
 
 # Compute the number of extent records needed to guarantee btree format,
 # assuming 16 bytes for each ondisk extent record
diff --git a/tests/xfs/795 b/tests/xfs/795
index a381db320f..2ce2ec5365 100755
--- a/tests/xfs/795
+++ b/tests/xfs/795
@@ -39,7 +39,7 @@ scratchfile=$SCRATCH_MNT/file
 mkdir $scratchdir
 touch $scratchfile
 $XFS_IO_PROG -x -c 'inject force_repair' $SCRATCH_MNT
-__stress_scrub_check_commands "$scratchdir" "" 'repair directory'
+__stress_scrub_check_commands "$scratchdir" "" "" 'repair directory'
 
 # Create a 2-dirblock directory
 total_size=$((alloc_unit * 2))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] xfs: refactor statfs field extraction
  2023-12-31 20:00 ` [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section Darrick J. Wong
@ 2023-12-27 13:57   ` Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 2/2] common/xfs: FITRIM now supports realtime volumes Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:57 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Prepare for the next patch by refactoring the open-coded bits that call
statfs on a mounted xfs filesystem to extract a status field.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs    |    6 ++++++
 tests/xfs/176 |    4 ++--
 tests/xfs/187 |    6 +++---
 tests/xfs/541 |    6 ++----
 4 files changed, 13 insertions(+), 9 deletions(-)


diff --git a/common/xfs b/common/xfs
index 313b7045bd..d9aa242ec7 100644
--- a/common/xfs
+++ b/common/xfs
@@ -2063,3 +2063,9 @@ _require_xfs_scratch_metadir()
 		_scratch_unmount
 	fi
 }
+
+# Extract a statfs attribute of the given mounted XFS filesystem.
+_xfs_statfs_field()
+{
+	$XFS_IO_PROG -c 'statfs' "$1" | grep -E "$2" | cut -d ' ' -f 3
+}
diff --git a/tests/xfs/176 b/tests/xfs/176
index 5231b888ba..0af81fcce3 100755
--- a/tests/xfs/176
+++ b/tests/xfs/176
@@ -49,7 +49,7 @@ fi
 
 _scratch_mount
 _xfs_force_bdev data $SCRATCH_MNT
-old_dblocks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.datablocks)
+old_dblocks=$(_xfs_statfs_field "$SCRATCH_MNT" geom.datablocks)
 
 mkdir $SCRATCH_MNT/save/
 sino=$(stat -c '%i' $SCRATCH_MNT/save)
@@ -172,7 +172,7 @@ for ((ino = target_ino; ino >= icluster_ino; ino--)); do
 	res=$?
 
 	# Make sure shrink did not work
-	new_dblocks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.datablocks)
+	new_dblocks=$(_xfs_statfs_field "$SCRATCH_MNT" geom.datablocks)
 	if [ "$new_dblocks" != "$old_dblocks" ]; then
 		echo "should not have shrank $old_dblocks -> $new_dblocks"
 		break
diff --git a/tests/xfs/187 b/tests/xfs/187
index 14c3b37670..03d92d0890 100755
--- a/tests/xfs/187
+++ b/tests/xfs/187
@@ -79,8 +79,8 @@ _xfs_force_bdev realtime $SCRATCH_MNT
 
 # Set the extent size hint larger than the realtime extent size.  This is
 # necessary to exercise the minlen constraints on the realtime allocator.
-fsbsize=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.bsize | awk '{print $3}')
-rtextsize_blks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.rtextsize | awk '{print $3}')
+fsbsize=$(_xfs_statfs_field "$SCRATCH_MNT" geom.bsize)
+rtextsize_blks=$(_xfs_statfs_field "$SCRATCH_MNT" geom.rtextsize)
 extsize=$((2 * rtextsize_blks * fsbsize))
 
 echo "rtextsize_blks=$rtextsize_blks extsize=$extsize" >> $seqres.full
@@ -136,7 +136,7 @@ rtextsize_bytes=$((fsbsize * rtextsize_blks))
 $here/src/punch-alternating $SCRATCH_MNT/bigfile -o $((punch_off / rtextsize_bytes))
 
 # Make sure we have some free rtextents.
-free_rtx=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep statfs.f_bavail | awk '{print $3}')
+free_rtx=$(_xfs_statfs_field "$SCRATCH_MNT" statfs.f_bavail)
 if [ $free_rtx -eq 0 ]; then
 	echo "Expected fragmented free rt space, found none."
 fi
diff --git a/tests/xfs/541 b/tests/xfs/541
index ae2fd819d5..3a0a9d5390 100755
--- a/tests/xfs/541
+++ b/tests/xfs/541
@@ -83,13 +83,11 @@ test $grow_extszhint -eq 0 || \
 	echo "expected post-grow extszhint 0, got $grow_extszhint"
 
 # Check that we now have rt extents.
-rtextents=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | \
-	grep 'geom.rtextents' | cut -d ' ' -f 3)
+rtextents=$(_xfs_statfs_field "$SCRATCH_MNT" geom.rtextents)
 test $rtextents -gt 0 || echo "expected rtextents > 0"
 
 # Check the new rt extent size.
-after_rtextsz_blocks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | \
-	grep 'geom.rtextsize' | cut -d ' ' -f 3)
+after_rtextsz_blocks=$(_xfs_statfs_field "$SCRATCH_MNT" geom.rtextsize)
 test $after_rtextsz_blocks -eq $new_rtextsz_blocks || \
 	echo "expected rtextsize $new_rtextsz_blocks, got $after_rtextsz_blocks"
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] common/xfs: FITRIM now supports realtime volumes
  2023-12-31 20:00 ` [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 1/2] xfs: refactor statfs field extraction Darrick J. Wong
@ 2023-12-27 13:57   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:57 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

XFS now supports FITRIM to the realtime volume.  Detect this support and
enable it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |   40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)


diff --git a/common/xfs b/common/xfs
index d9aa242ec7..75f1bbcc3d 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1937,8 +1937,44 @@ _require_xfs_scratch_atomicswap()
 # of 1024 byte blocks.
 _xfs_discard_max_offset_kb()
 {
-	$XFS_IO_PROG -c 'statfs' "$1" | \
-		awk '{g[$1] = $3} END {print (g["geom.bsize"] * g["geom.datablocks"] / 1024)}'
+	local statfs
+
+	# Use awk to read the statfs output for the XFS filesystem, compute
+	# the two possible FITRIM offset maximums, and then use some horrid
+	# bash magic to import the five numbers as an indexed array.  There's
+	# no better way to do this in bash since you can't readarray to build
+	# an associative array.  Elements are as follows:
+	#
+	# 0: fsblock size in bytes
+	# 1: Data volume size in fsblocks.
+	# 2: Realtime volume size in fsblocks.
+	# 3: Max FITRIM offset if we can only trim the data volume
+	# 4: Max FITRIM offset if we can trim the data and rt volumes
+	readarray -t statfs < <($XFS_IO_PROG -c 'statfs' "$1" | \
+		awk '{g[$1] = $3} END {printf("%d\n%d\n%d\n%d\n%d\n",
+			g["geom.bsize"],
+			g["geom.datablocks"],
+			g["geom.rtblocks"],
+			g["geom.bsize"] * g["geom.datablocks"] / 1024,
+			g["geom.bsize"] * (g["geom.datablocks"] + g["geom.rtblocks"]) / 1024);}')
+
+	# If the kernel supports discarding the realtime volume, then it will
+	# not reject a FITRIM for fsblock dblks+1, even if the len/minlen
+	# arguments are absurd.
+	if [ "${statfs[2]}" -gt 0 ]; then
+		if $FSTRIM_PROG -o "$((statfs[0] * statfs[1]))" \
+				-l "${statfs[0]}" \
+				-m "$((statfs[0] * 2))" "$1" &>/dev/null; then
+			# The kernel supports discarding the rt volume, so
+			# print out the second answer from above.
+			echo "${statfs[4]}"
+			return
+		fi
+	fi
+
+	# The kernel does not support discarding the rt volume or there is no
+	# rt volume.  Print out the first answer from above.
+	echo "${statfs[3]}"
 }
 
 # Adjust MKFS_OPTIONS as necessary to avoid having parent pointers formatted


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/13] xfs: fix tests that try to access the realtime rmap inode
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
@ 2023-12-27 13:57   ` Darrick J. Wong
  2023-12-27 13:58   ` [PATCH 02/13] fuzz: for fuzzing the rtrmapbt, find the path to the rt rmap btree file Darrick J. Wong
                     ` (11 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:57 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The realtime rmap tests were added to fstests a long time ago.  Since
they were added, we decided to create a metadata file directory
structure instead of adding more fields to the superblock.  Therefore,
fix all the tests that try to access these paths.

While we're at it, fix xfs/409 to run the *online* scrub program like
it's supposed to.  xfs/408 is the fuzzer for xfs_repair testing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs        |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/122.out |    1 -
 tests/xfs/333     |   45 ---------------------------------------------
 tests/xfs/333.out |    6 ------
 tests/xfs/337     |    2 +-
 tests/xfs/338     |   30 +++++++++++++++++++++++++-----
 tests/xfs/339     |    5 +++--
 tests/xfs/340     |   25 ++++++++++++++++++++-----
 tests/xfs/341     |    2 +-
 tests/xfs/342     |    4 ++--
 10 files changed, 93 insertions(+), 68 deletions(-)
 delete mode 100755 tests/xfs/333
 delete mode 100644 tests/xfs/333.out


diff --git a/common/xfs b/common/xfs
index 75f1bbcc3d..d3baa7a7c3 100644
--- a/common/xfs
+++ b/common/xfs
@@ -2105,3 +2105,44 @@ _xfs_statfs_field()
 {
 	$XFS_IO_PROG -c 'statfs' "$1" | grep -E "$2" | cut -d ' ' -f 3
 }
+
+# Resolve a metadata directory tree path and return the inode number.
+_scratch_metadir_lookup() {
+	local res="$(_scratch_xfs_db -c "ls -i -m $1")"
+	test "${PIPESTATUS[0]}" -eq 0 && echo "$res"
+}
+
+# Figure out which directory entry we have to change to update the rtrmap
+# inode pointer.  The last line of output is inumber field within a metadata
+# object; any previous lines are the accessor commands that must be fed to
+# xfs_db to get to the correct directory block.
+_scratch_find_rt_metadir_entry() {
+	local sfkey="$(_scratch_xfs_db -c 'path -m /realtime' -c print | \
+		grep "\"$1\"" | \
+		sed -e 's/.name.*$//g' -e 's/\[/\\[/g' -e 's/\]/\\]/g' )"
+	if [ -n "$sfkey" ]; then
+		echo 'path -m /realtime'
+		_scratch_xfs_db -c 'path -m /realtime' -c print | \
+			grep "${sfkey}.inumber" | awk '{print $1}'
+		return 0
+	fi
+
+	local size=$(_scratch_xfs_db -c 'path -m /realtime' -c 'print core.size' | awk '{print $3}')
+	local blksz=$(_scratch_xfs_db -c 'sb 0' -c 'print blocksize' | awk '{print $3}')
+	local dirblklog=$(_scratch_xfs_db -c 'sb 0' -c 'print dirblklog' | awk '{print $3}')
+	local dirblksz=$((blksz << dirblklog ))
+	for ((fileoff = 0; fileoff < (size / dirblksz); fileoff++)); do
+		local dbkey="$(_scratch_xfs_db -c 'path -m /realtime' -c "dblock $fileoff" -c 'print' | \
+			grep "\"$1\"" | \
+			sed -e 's/.name.*$//g' -e 's/\[/\\[/g' -e 's/\]/\\]/g' )"
+		if [ -n "$dbkey" ]; then
+			echo 'path -m /realtime'
+			echo "dblock $fileoff"
+			_scratch_xfs_db -c 'path -m /realtime' -c "dblock $fileoff" -c print | \
+				grep "${dbkey}.inumber" | awk '{print $1}'
+			return 0
+		fi
+	done
+
+	return 1
+}
diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 6b03b90c2d..837286a9d3 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -48,7 +48,6 @@ offsetof(xfs_sb_t, sb_rgblklog) = 280
 offsetof(xfs_sb_t, sb_rgblocks) = 272
 offsetof(xfs_sb_t, sb_rgcount) = 276
 offsetof(xfs_sb_t, sb_rootino) = 56
-offsetof(xfs_sb_t, sb_rrmapino) = 264
 offsetof(xfs_sb_t, sb_rsumino) = 72
 offsetof(xfs_sb_t, sb_sectlog) = 121
 offsetof(xfs_sb_t, sb_sectsize) = 102
diff --git a/tests/xfs/333 b/tests/xfs/333
deleted file mode 100755
index 728c518402..0000000000
--- a/tests/xfs/333
+++ /dev/null
@@ -1,45 +0,0 @@
-#! /bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# Copyright (c) 2016, Oracle and/or its affiliates.  All Rights Reserved.
-#
-# FS QA Test No. 333
-#
-# Set rrmapino to another inode on an non-rt rmap fs and see if repair fixes it.
-#
-. ./common/preamble
-_begin_fstest auto quick rmap realtime
-
-# Import common functions.
-. ./common/filter
-
-# real QA test starts here
-_supported_fs xfs
-_require_xfs_scratch_rmapbt
-_disable_dmesg_check
-
-rm -f "$seqres.full"
-
-unset SCRATCH_RTDEV
-
-echo "Format and mount"
-_scratch_mkfs > "$seqres.full" 2>&1
-rrmapino="$(_scratch_xfs_db -c 'sb 0' -c 'p rrmapino' 2>&1)"
-test "${rrmapino}" = "field rrmapino not found" && _notrun "realtime rmapbt not supported"
-_scratch_mount
-
-echo "Create some files"
-$XFS_IO_PROG -f -c "pwrite -S 0x68 0 9999" $SCRATCH_MNT/f1 >> $seqres.full
-$XFS_IO_PROG -f -c "pwrite -S 0x68 0 9999" $SCRATCH_MNT/f2 >> $seqres.full
-echo garbage > $SCRATCH_MNT/f3
-ino=$(stat -c '%i' $SCRATCH_MNT/f3)
-_scratch_unmount
-
-echo "Corrupt fs"
-_scratch_xfs_db -x -c 'sb 0' -c "write rrmapino $ino" >> $seqres.full
-_try_scratch_mount 2>&1 | _filter_error_mount
-
-echo "Test done, mount should have failed"
-
-# success, all done
-status=0
-exit
diff --git a/tests/xfs/333.out b/tests/xfs/333.out
deleted file mode 100644
index b3c698750f..0000000000
--- a/tests/xfs/333.out
+++ /dev/null
@@ -1,6 +0,0 @@
-QA output created by 333
-Format and mount
-Create some files
-Corrupt fs
-mount: Structure needs cleaning
-Test done, mount should have failed
diff --git a/tests/xfs/337 b/tests/xfs/337
index f74baae9b0..9ea8587b27 100755
--- a/tests/xfs/337
+++ b/tests/xfs/337
@@ -53,7 +53,7 @@ echo "+ check fs"
 _scratch_xfs_repair -n >> $seqres.full 2>&1 || echo "xfs_repair should not fail"
 
 echo "+ corrupt image"
-_scratch_xfs_db -x -c "sb" -c "addr rrmapino" -c "addr u3.rtrmapbt.ptrs[1]" \
+_scratch_xfs_db -x -c "path -m /realtime/0.rmap" -c "addr u3.rtrmapbt.ptrs[1]" \
 	-c "stack" -c "blocktrash -x 4096 -y 4096 -n 8 -3 -z" \
 	>> $seqres.full 2>&1
 
diff --git a/tests/xfs/338 b/tests/xfs/338
index 9f36150c7e..babadc4e1b 100755
--- a/tests/xfs/338
+++ b/tests/xfs/338
@@ -29,13 +29,33 @@ $XFS_IO_PROG -f -R -c "pwrite -S 0x68 0 9999" $SCRATCH_MNT/f2 >> $seqres.full
 _scratch_unmount
 
 echo "Corrupt fs"
-_scratch_xfs_db -x -c 'sb 0' -c 'addr rrmapino' \
-	-c 'write core.nlinkv2 0' -c 'write core.mode 0' -c 'sb 0' \
-	-c 'write rrmapino 0' >> $seqres.full
-_try_scratch_mount >> $seqres.full 2>&1 && echo "mount should have failed"
+readarray -t rtrmap_path < <(_scratch_find_rt_metadir_entry 0.rmap)
+
+rtrmap_accessors=()
+rtrmap_path_len="${#rtrmap_path[@]}"
+for ((i = 0; i < rtrmap_path_len - 1; i++)); do
+	rtrmap_accessors+=(-c "${rtrmap_path[i]}")
+done
+rtrmap_entry="${rtrmap_path[rtrmap_path_len - 1]}"
+test -n "$rtrmap_entry" || _fail "Could not find rtrmap metadir entry?"
+
+_scratch_xfs_db -x -c 'path -m /realtime/0.rmap' \
+	-c 'write core.nlinkv2 0' -c 'write core.mode 0' \
+	"${rtrmap_accessors[@]}" \
+	-c "print $rtrmap_entry" \
+	-c "write -d $rtrmap_entry 0" >> $seqres.full
+if _try_scratch_mount >> $seqres.full 2>&1; then
+       echo "mount should have failed"
+       _scratch_unmount
+else
+	# If the verifiers are working properly, the mount will fail because
+	# we fuzzed the metadata root directory.  This causes loud complaints
+	# to dmesg, so we want to ignore those.
+	_disable_dmesg_check
+fi
 
 echo "Repair fs"
-_scratch_unmount 2>&1 | _filter_scratch
+_scratch_unmount 2>&1 | _filter_scratch | _filter_ending_dot
 _repair_scratch_fs >> $seqres.full 2>&1
 
 echo "Try to create more files (again)"
diff --git a/tests/xfs/339 b/tests/xfs/339
index 3e0b4d97ab..24a90d0ba3 100755
--- a/tests/xfs/339
+++ b/tests/xfs/339
@@ -31,7 +31,8 @@ ln $SCRATCH_MNT/f3 $SCRATCH_MNT/f4
 _scratch_unmount
 
 echo "Corrupt fs"
-rrmapino=`_scratch_xfs_get_sb_field rrmapino`
+rrmapino=$(_scratch_metadir_lookup /realtime/0.rmap)
+test -n "$rrmapino" || _fail "Could not find rtrmap inode?"
 _scratch_xfs_set_metadata_field "u3.sfdir3.list[3].inumber.i4" $rrmapino \
 	'sb 0' 'addr rootino' >> $seqres.full
 _scratch_mount
@@ -43,7 +44,7 @@ echo "Try to create more files"
 $XFS_IO_PROG -f -R -c "pwrite -S 0x68 0 9999" $SCRATCH_MNT/f5 >> $seqres.full 2>&1
 
 echo "Repair fs"
-_scratch_unmount 2>&1 | _filter_scratch
+_scratch_unmount 2>&1 | _filter_scratch | _filter_ending_dot
 _repair_scratch_fs >> $seqres.full 2>&1
 
 echo "Try to create more files (again)"
diff --git a/tests/xfs/340 b/tests/xfs/340
index 2c0014513e..6ee818becb 100755
--- a/tests/xfs/340
+++ b/tests/xfs/340
@@ -31,16 +31,31 @@ ino=$(stat -c '%i' $SCRATCH_MNT/f3)
 _scratch_unmount
 
 echo "Corrupt fs"
-rrmapino=$(_scratch_xfs_get_sb_field rrmapino)
-_scratch_xfs_db -x -c "inode $rrmapino" \
+readarray -t rtrmap_path < <(_scratch_find_rt_metadir_entry 0.rmap)
+
+rtrmap_accessors=()
+rtrmap_path_len="${#rtrmap_path[@]}"
+for ((i = 0; i < rtrmap_path_len - 1; i++)); do
+	rtrmap_accessors+=(-c "${rtrmap_path[i]}")
+done
+rtrmap_entry="${rtrmap_path[rtrmap_path_len - 1]}"
+test -n "$rtrmap_entry" || _fail "Could not find rtrmap metadir entry?"
+
+rrmapino=$(_scratch_metadir_lookup /realtime/0.rmap)
+test -n "$rrmapino" || _fail "Could not find rtrmap inode?"
+_scratch_xfs_db -x -c "path -m /realtime/0.rmap" \
 	-c 'write core.format 2' -c 'write core.size 0' \
-	-c 'write core.nblocks 0' -c 'sb 0' -c 'addr rootino' \
+	-c 'write core.nblocks 0' \
+	-c 'sb 0' -c 'addr rootino' \
+	-c "print u3.sfdir3.list[2].inumber" \
 	-c "write u3.sfdir3.list[2].inumber.i4 $rrmapino" \
-	-c 'sb 0' -c "write rrmapino $ino" >> $seqres.full
+	"${rtrmap_accessors[@]}" \
+	-c "print $rtrmap_entry" \
+	-c "write $rtrmap_entry $ino" >> $seqres.full
 _try_scratch_mount >> $seqres.full 2>&1 && echo "mount should have failed"
 
 echo "Repair fs"
-_scratch_unmount 2>&1 | _filter_scratch
+_scratch_unmount 2>&1 | _filter_scratch | _filter_ending_dot
 _repair_scratch_fs >> $seqres.full 2>&1
 
 echo "Try to create more files (again)"
diff --git a/tests/xfs/341 b/tests/xfs/341
index 7d2842b579..8861e751a9 100755
--- a/tests/xfs/341
+++ b/tests/xfs/341
@@ -53,7 +53,7 @@ echo "Corrupt fs"
 fsbno=$(_scratch_xfs_db -c "inode $ino" -c 'bmap' | grep 'flag 0' | head -n 1 | \
 	sed -e 's/^.*startblock \([0-9]*\) .*$/\1/g')
 
-_scratch_xfs_db -x -c 'sb 0' -c 'addr rrmapino' \
+_scratch_xfs_db -x -c 'path -m /realtime/0.rmap' \
 	-c "write u3.rtrmapbt.ptrs[1] $fsbno" -c 'p' >> $seqres.full
 _scratch_mount
 
diff --git a/tests/xfs/342 b/tests/xfs/342
index 538c8987ef..f29bd874e9 100755
--- a/tests/xfs/342
+++ b/tests/xfs/342
@@ -47,9 +47,9 @@ ino=$(stat -c '%i' $SCRATCH_MNT/f3)
 _scratch_unmount
 
 echo "Corrupt fs"
-_scratch_xfs_db -c 'sb 0' -c 'addr rrmapino' -c 'p u3.rtrmapbt.ptrs[1]' >> $seqres.full
+_scratch_xfs_db -c 'path -m /realtime/0.rmap' -c 'p u3.rtrmapbt.ptrs[1]' >> $seqres.full
 
-fsbno=$(_scratch_xfs_db -c 'sb 0' -c 'addr rrmapino' \
+fsbno=$(_scratch_xfs_db -c 'path -m /realtime/0.rmap' \
 	-c 'p u3.rtrmapbt.ptrs[1]' | sed -e 's/^.*://g')
 _scratch_xfs_db -x -c "inode $ino" -c "write u3.bmx[0].startblock $fsbno" >> $seqres.full
 _scratch_mount


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/13] fuzz: for fuzzing the rtrmapbt, find the path to the rt rmap btree file
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 01/13] xfs: fix tests that try to access the realtime rmap inode Darrick J. Wong
@ 2023-12-27 13:58   ` Darrick J. Wong
  2023-12-27 13:58   ` [PATCH 03/13] xfs: race fsstress with realtime rmap btree scrub and repair Darrick J. Wong
                     ` (10 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:58 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The fs population code creates a realtime rmap btree in /some/ realtime
group with at least two levels.  This rmapbt file isn't necessarily the
one for group 0, so we need to find it programmatically.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs    |   33 +++++++++++++++++++++++++++++++++
 tests/xfs/406 |    6 ++++--
 tests/xfs/407 |    6 ++++--
 tests/xfs/408 |    7 +++++--
 tests/xfs/409 |    7 +++++--
 tests/xfs/481 |    6 ++++--
 tests/xfs/482 |    7 +++++--
 7 files changed, 60 insertions(+), 12 deletions(-)


diff --git a/common/xfs b/common/xfs
index d3baa7a7c3..b8dd8d4a40 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1914,6 +1914,39 @@ _scratch_xfs_find_agbtree_height() {
 	return 1
 }
 
+# Find us the path to the inode containing a realtime btree with a specific
+# height.
+_scratch_xfs_find_rgbtree_height() {
+	local bt_type="$1"
+	local bt_height="$2"
+	local rgcount=$(_xfs_mount_rgcount $SCRATCH_DEV)
+	local path
+	local path_format
+	local bt_prefix
+
+	case "${bt_type}" in
+	"rmap")
+		path_format="/realtime/%u.rmap"
+		bt_prefix="u3.rtrmapbt"
+		;;
+	*)
+		_fail "Don't know about rt btree ${bt_type}"
+		;;
+	esac
+
+	for ((rgno = 0; rgno < rgcount; rgno++)); do
+		path="$(printf "${path_format}" "${rgno}")"
+		bt_level=$(_scratch_xfs_db -c "path -m ${path}" -c "p ${bt_prefix}.level" | awk '{print $3}')
+		# "level" is the actual level within the btree
+		if [ "${bt_level}" -eq "$((bt_height - 1))" ]; then
+			echo "${path}"
+			return 0
+		fi
+	done
+
+	return 1
+}
+
 _require_xfs_mkfs_atomicswap()
 {
 	# atomicswap can be activated on rmap or reflink filesystems.
diff --git a/tests/xfs/406 b/tests/xfs/406
index 78db18077c..8c5570886b 100755
--- a/tests/xfs/406
+++ b/tests/xfs/406
@@ -26,10 +26,12 @@ _require_scratch_xfs_fuzz_fields
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
-inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'sb 0' 'addr rrmapino')
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
 
 echo "Fuzz rtrmapbt recs"
-_scratch_xfs_fuzz_metadata '' 'offline' 'sb 0' 'addr rrmapino' "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
+_scratch_xfs_fuzz_metadata '' 'offline' "path -m $path" "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
 echo "Done fuzzing rtrmapbt recs"
 
 # success, all done
diff --git a/tests/xfs/407 b/tests/xfs/407
index 5a43775b55..2460ea336c 100755
--- a/tests/xfs/407
+++ b/tests/xfs/407
@@ -26,10 +26,12 @@ _require_scratch_xfs_fuzz_fields
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
-inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'sb 0' 'addr rrmapino')
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 1)" || \
+	_fail "could not find two-level rtrmapbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
 
 echo "Fuzz rtrmapbt recs"
-_scratch_xfs_fuzz_metadata '' 'online' 'sb 0' 'addr rrmapino' "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
+_scratch_xfs_fuzz_metadata '' 'online' "path -m $path" "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
 echo "Done fuzzing rtrmapbt recs"
 
 # success, all done
diff --git a/tests/xfs/408 b/tests/xfs/408
index 8049d6bead..3bed3824e8 100755
--- a/tests/xfs/408
+++ b/tests/xfs/408
@@ -4,7 +4,7 @@
 #
 # FS QA Test No. 408
 #
-# Populate a XFS filesystem and fuzz every rtrmapbt keyptr field.
+# Populate a XFS filesystem and fuzz every rtrmapbt key/pointer field.
 # Use xfs_repair to fix the corruption.
 #
 . ./common/preamble
@@ -26,8 +26,11 @@ _require_scratch_xfs_fuzz_fields
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+
 echo "Fuzz rtrmapbt keyptrs"
-_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' 'sb 0' 'addr rrmapino' >> $seqres.full
+_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' "path -m $path" >> $seqres.full
 echo "Done fuzzing rtrmapbt keyptrs"
 
 # success, all done
diff --git a/tests/xfs/409 b/tests/xfs/409
index adac95fea8..ce66175c6e 100755
--- a/tests/xfs/409
+++ b/tests/xfs/409
@@ -4,7 +4,7 @@
 #
 # FS QA Test No. 409
 #
-# Populate a XFS filesystem and fuzz every rtrmapbt keyptr field.
+# Populate a XFS filesystem and fuzz every rtrmapbt key/pointer field.
 # Use xfs_scrub to fix the corruption.
 #
 . ./common/preamble
@@ -26,8 +26,11 @@ _require_scratch_xfs_fuzz_fields
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+
 echo "Fuzz rtrmapbt keyptrs"
-_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' 'sb 0' 'addr rrmapino' >> $seqres.full
+_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'online' "path -m $path" >> $seqres.full
 echo "Done fuzzing rtrmapbt keyptrs"
 
 # success, all done
diff --git a/tests/xfs/481 b/tests/xfs/481
index 48c7580ccb..d303f2c27d 100755
--- a/tests/xfs/481
+++ b/tests/xfs/481
@@ -27,10 +27,12 @@ _disable_dmesg_check
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
-inode_ver=$(_scratch_xfs_get_metadata_field "core.version" 'sb 0' 'addr rrmapino')
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
 
 echo "Fuzz rtrmapbt recs"
-_scratch_xfs_fuzz_metadata '' 'none' 'sb 0' 'addr rrmapino' "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
+_scratch_xfs_fuzz_metadata '' 'none' "path -m $path" "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
 echo "Done fuzzing rtrmapbt recs"
 
 # success, all done
diff --git a/tests/xfs/482 b/tests/xfs/482
index 0192b94cc0..32a3012154 100755
--- a/tests/xfs/482
+++ b/tests/xfs/482
@@ -4,7 +4,7 @@
 #
 # FS QA Test No. 482
 #
-# Populate a XFS filesystem and fuzz every rtrmapbt keyptr field.
+# Populate a XFS filesystem and fuzz every rtrmapbt key/pointer field.
 # Do not fix the filesystem, to test metadata verifiers.
 
 . ./common/preamble
@@ -27,8 +27,11 @@ _disable_dmesg_check
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+
 echo "Fuzz rtrmapbt keyptrs"
-_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' 'sb 0' 'addr rrmapino' >> $seqres.full
+_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' "path -m $path" >> $seqres.full
 echo "Done fuzzing rtrmapbt keyptrs"
 
 # success, all done


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/13] xfs: race fsstress with realtime rmap btree scrub and repair
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 01/13] xfs: fix tests that try to access the realtime rmap inode Darrick J. Wong
  2023-12-27 13:58   ` [PATCH 02/13] fuzz: for fuzzing the rtrmapbt, find the path to the rt rmap btree file Darrick J. Wong
@ 2023-12-27 13:58   ` Darrick J. Wong
  2023-12-27 13:58   ` [PATCH 04/13] xfs/856: add rtrmapbt upgrade to test matrix Darrick J. Wong
                     ` (9 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:58 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Race checking and rebuilding realtime rmap btrees with fsstress.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1817     |   42 ++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1817.out |    2 ++
 tests/xfs/1821     |   42 ++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1821.out |    2 ++
 tests/xfs/1857     |   42 ++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1857.out |    2 ++
 6 files changed, 132 insertions(+)
 create mode 100755 tests/xfs/1817
 create mode 100644 tests/xfs/1817.out
 create mode 100755 tests/xfs/1821
 create mode 100644 tests/xfs/1821.out
 create mode 100755 tests/xfs/1857
 create mode 100644 tests/xfs/1857.out


diff --git a/tests/xfs/1817 b/tests/xfs/1817
new file mode 100755
index 0000000000..56ce63e005
--- /dev/null
+++ b/tests/xfs/1817
@@ -0,0 +1,42 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1817
+#
+# Race fsstress and rtrmapbt scrub for a while to see if we crash or livelock.
+#
+. ./common/preamble
+_begin_fstest scrub dangerous_fsstress_scrub
+
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
+	cd /
+	rm -r -f $tmp.*
+}
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/fuzzy
+. ./common/inject
+. ./common/xfs
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch
+_require_xfs_stress_scrub
+
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount
+_require_xfs_has_feature "$SCRATCH_MNT" realtime
+_require_xfs_has_feature "$SCRATCH_MNT" rmapbt
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+_scratch_xfs_stress_scrub -s "scrub rtrmapbt %rgno%"
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1817.out b/tests/xfs/1817.out
new file mode 100644
index 0000000000..525ec14ed0
--- /dev/null
+++ b/tests/xfs/1817.out
@@ -0,0 +1,2 @@
+QA output created by 1817
+Silence is golden
diff --git a/tests/xfs/1821 b/tests/xfs/1821
new file mode 100755
index 0000000000..7dbac04f52
--- /dev/null
+++ b/tests/xfs/1821
@@ -0,0 +1,42 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1821
+#
+# Race fsstress and realtime bitmap repair for a while to see if we crash or
+# livelock.
+#
+. ./common/preamble
+_begin_fstest online_repair dangerous_fsstress_repair
+
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
+	cd /
+	rm -r -f $tmp.*
+}
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/fuzzy
+. ./common/inject
+. ./common/xfs
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch
+_require_xfs_stress_online_repair
+
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount
+_require_xfs_has_feature "$SCRATCH_MNT" realtime
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+_scratch_xfs_stress_online_repair -s "repair rtbitmap" -s "repair rgbitmap %rgno%"
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1821.out b/tests/xfs/1821.out
new file mode 100644
index 0000000000..0f18ad5d88
--- /dev/null
+++ b/tests/xfs/1821.out
@@ -0,0 +1,2 @@
+QA output created by 1821
+Silence is golden
diff --git a/tests/xfs/1857 b/tests/xfs/1857
new file mode 100755
index 0000000000..de10bbb5c4
--- /dev/null
+++ b/tests/xfs/1857
@@ -0,0 +1,42 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1857
+#
+# Race fsstress and rtrmapbt repair for a while to see if we crash or livelock.
+#
+. ./common/preamble
+_begin_fstest online_repair dangerous_fsstress_repair
+
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
+	cd /
+	rm -r -f $tmp.*
+}
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/fuzzy
+. ./common/inject
+. ./common/xfs
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch
+_require_xfs_stress_online_repair
+
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount
+_require_xfs_has_feature "$SCRATCH_MNT" realtime
+_require_xfs_has_feature "$SCRATCH_MNT" rmapbt
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+_scratch_xfs_stress_online_repair -s "repair rtrmapbt %rgno%"
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1857.out b/tests/xfs/1857.out
new file mode 100644
index 0000000000..b51ffd3f6c
--- /dev/null
+++ b/tests/xfs/1857.out
@@ -0,0 +1,2 @@
+QA output created by 1857
+Silence is golden


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/13] xfs/856: add rtrmapbt upgrade to test matrix
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 13:58   ` [PATCH 03/13] xfs: race fsstress with realtime rmap btree scrub and repair Darrick J. Wong
@ 2023-12-27 13:58   ` Darrick J. Wong
  2023-12-27 13:59   ` [PATCH 05/13] xfs/122: update for rtgroups-based realtime rmap btrees Darrick J. Wong
                     ` (8 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:58 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Add realtime reverse mapping btrees to the features that this test will
try to upgrade.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1856 |   39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/1856 b/tests/xfs/1856
index 93bdbaa531..8453d9bff2 100755
--- a/tests/xfs/1856
+++ b/tests/xfs/1856
@@ -34,11 +34,46 @@ rt_configured()
 	test "$USE_EXTERNAL" = "yes" && test -n "$SCRATCH_RTDEV"
 }
 
+# Does mkfs support rtgroups?
+supports_rtgroups()
+{
+	$MKFS_XFS_PROG 2>&1 | grep -q 'rtgroups='
+}
+
+# Do we need to enable rtgroups at mkfs time to support a feature upgrade test?
+need_rtgroups()
+{
+	local feat="$1"
+
+	# if realtime isn't configured, we don't need rt groups
+	rt_configured || return 1
+
+	# If we don't even know what realtime rmap is, we don't need rt groups
+	test -z "${FEATURE_STATE["rmapbt"]}" && return 1
+
+	# rt rmap btrees require rt groups but rt groups cannot be added to
+	# an existing filesystem, so we must force it on at mkfs time
+	test "${FEATURE_STATE["rmapbt"]}" -eq 1 && return 0
+	test "$feat" = "rmapbt" && return 0
+
+	return 1
+}
+
 # Compute the MKFS_OPTIONS string for a particular feature upgrade test
 compute_mkfs_options()
 {
+	local feat="$1"
 	local m_opts=""
 	local caller_options="$MKFS_OPTIONS"
+	local rtgroups
+
+	need_rtgroups "$feat" && rtgroups=1
+	if echo "$caller_options" | grep -q 'rtgroups='; then
+		test -z "$rtgroups" && rtgroups=0
+		caller_options="$(echo "$caller_options" | sed -e 's/rtgroups=*[0-9]*/rtgroups='$rtgroups'/g')"
+	elif [ -n "$rtgroups" ]; then
+		caller_options="$caller_options -r rtgroups=$rtgroups"
+	fi
 
 	for feat in "${FEATURES[@]}"; do
 		local feat_state="${FEATURE_STATE["${feat}"]}"
@@ -170,10 +205,12 @@ post_exercise()
 # upgrade don't spread failure to the rest of the tests.
 FEATURES=()
 if rt_configured; then
+	# rmap wasn't added to rt devices until after metadir and rtgroups
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade inobtcount && FEATURES+=("inobtcount")
 	check_repair_upgrade bigtime && FEATURES+=("bigtime")
 	check_repair_upgrade metadir && FEATURES+=("metadir")
+	supports_rtgroups && check_repair_upgrade rmapbt && FEATURES+=("rmapbt")
 else
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade rmapbt && FEATURES+=("rmapbt")
@@ -196,7 +233,7 @@ for feat in "${FEATURES[@]}"; do
 
 	upgrade_start_message "$feat" | _tee_kernlog $seqres.full > /dev/null
 
-	opts="$(compute_mkfs_options)"
+	opts="$(compute_mkfs_options "$feat")"
 	echo "mkfs.xfs $opts" >> $seqres.full
 
 	# Format filesystem


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/13] xfs/122: update for rtgroups-based realtime rmap btrees
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 13:58   ` [PATCH 04/13] xfs/856: add rtrmapbt upgrade to test matrix Darrick J. Wong
@ 2023-12-27 13:59   ` Darrick J. Wong
  2023-12-27 13:59   ` [PATCH 06/13] xfs: fix various problems with fsmap detecting the data device Darrick J. Wong
                     ` (7 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:59 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Now that we've redesigned realtime rmap to require that the rt section
be sharded into allocation groups of no more than 2^31 blocks, we've
reduced the size of the ondisk structures and therefore need to update
this test.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122.out |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)


diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index 837286a9d3..f5621cac12 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -120,8 +120,8 @@ sizeof(struct xfs_rmap_key) = 20
 sizeof(struct xfs_rmap_rec) = 24
 sizeof(struct xfs_rtbuf_blkinfo) = 48
 sizeof(struct xfs_rtgroup_geometry) = 128
-sizeof(struct xfs_rtrmap_key) = 24
-sizeof(struct xfs_rtrmap_rec) = 32
+sizeof(struct xfs_rtrmap_key) = 20
+sizeof(struct xfs_rtrmap_rec) = 24
 sizeof(struct xfs_rtrmap_root) = 4
 sizeof(struct xfs_rtsb) = 104
 sizeof(struct xfs_rud_log_format) = 16


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/13] xfs: fix various problems with fsmap detecting the data device
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:59   ` [PATCH 05/13] xfs/122: update for rtgroups-based realtime rmap btrees Darrick J. Wong
@ 2023-12-27 13:59   ` Darrick J. Wong
  2023-12-27 13:59   ` [PATCH 07/13] xfs/341: update test for rtgroup-based rmap Darrick J. Wong
                     ` (6 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:59 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Various tests of realtime rmap functionality assumed that the data
device could be picked out from the GETFSMAP output by looking for
static fs metadata.  This is no longer true, since rtgroups have a
static superblock header at the start, so update these tests.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/272 |    2 +-
 tests/xfs/276 |    2 +-
 tests/xfs/277 |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)


diff --git a/tests/xfs/272 b/tests/xfs/272
index c68fa9d614..d5f3a74177 100755
--- a/tests/xfs/272
+++ b/tests/xfs/272
@@ -57,7 +57,7 @@ cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total c
 done
 
 echo "Check device field of FS metadata and regular file"
-data_dev=$(grep 'static fs metadata' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
+data_dev=$(grep 'inode btree' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 rt_dev=$(grep "${ino}[[:space:]]*[0-9]*\.\.[0-9]*" $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 test "${data_dev}" = "${rt_dev}" || echo "data ${data_dev} realtime ${rt_dev}?"
 
diff --git a/tests/xfs/276 b/tests/xfs/276
index 8cc486752a..a05ca1961d 100755
--- a/tests/xfs/276
+++ b/tests/xfs/276
@@ -61,7 +61,7 @@ cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total c
 done
 
 echo "Check device field of FS metadata and realtime file"
-data_dev=$(grep 'static fs metadata' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
+data_dev=$(grep 'inode btree' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 rt_dev=$(grep "${ino}[[:space:]]*[0-9]*\.\.[0-9]*[[:space:]]*[0-9]*$" $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 test "${data_dev}" != "${rt_dev}" || echo "data ${data_dev} realtime ${rt_dev}?"
 
diff --git a/tests/xfs/277 b/tests/xfs/277
index 03208ef233..eff54a2a50 100755
--- a/tests/xfs/277
+++ b/tests/xfs/277
@@ -38,7 +38,7 @@ $XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT >> $seqres.full
 $XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT | tr '[]()' '    ' > $TEST_DIR/fsmap
 
 echo "Check device field of FS metadata and journalling log"
-data_dev=$(grep 'static fs metadata' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
+data_dev=$(grep 'inode btree' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 journal_dev=$(grep 'journalling log' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
 test "${data_dev}" = "${journal_dev}" || echo "data ${data_dev} journal ${journal_dev}?"
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/13] xfs/341: update test for rtgroup-based rmap
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 13:59   ` [PATCH 06/13] xfs: fix various problems with fsmap detecting the data device Darrick J. Wong
@ 2023-12-27 13:59   ` Darrick J. Wong
  2023-12-27 13:59   ` [PATCH 08/13] xfs/3{43,32}: adapt tests for rt extent size greater than 1 Darrick J. Wong
                     ` (5 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:59 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Now that we're sharding the realtime volume into multiple allocation
groups, update this test to reflect the new reality.  The realtime rmap
btree record and key sizes have shrunk, and we can't guarantee that a
quick file write actually hits the same rt group as the one we fuzzed,
so eliminate the file write test since we're really only curious if
xfs_repair will fix the problem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/341     |   10 +++-------
 tests/xfs/341.out |    1 -
 2 files changed, 3 insertions(+), 8 deletions(-)


diff --git a/tests/xfs/341 b/tests/xfs/341
index 8861e751a9..561054f0bd 100755
--- a/tests/xfs/341
+++ b/tests/xfs/341
@@ -32,10 +32,10 @@ blksz="$(_get_block_size $SCRATCH_MNT)"
 rtextsz_blks=$((rtextsz / blksz))
 
 # inode core size is at least 176 bytes; btree header is 56 bytes;
-# rtrmap record is 32 bytes; and rtrmap key/pointer are 56 bytes.
+# rtrmap record is 24 bytes; and rtrmap key/pointer are 48 bytes.
 i_core_size="$(_xfs_get_inode_core_bytes $SCRATCH_MNT)"
-i_ptrs=$(( (isize - i_core_size) / 56 ))
-bt_recs=$(( (blksz - 56) / 32 ))
+i_ptrs=$(( (isize - i_core_size) / 48 ))
+bt_recs=$(( (blksz - 56) / 24 ))
 
 blocks=$((i_ptrs * bt_recs + 1))
 len=$((blocks * rtextsz))
@@ -57,10 +57,6 @@ _scratch_xfs_db -x -c 'path -m /realtime/0.rmap' \
 	-c "write u3.rtrmapbt.ptrs[1] $fsbno" -c 'p' >> $seqres.full
 _scratch_mount
 
-echo "Try to create more files"
-$XFS_IO_PROG -f -R -c "pwrite -S 0x68 0 9999" $SCRATCH_MNT/f5 >> $seqres.full 2>&1
-test -e $SCRATCH_MNT/f5 && echo "should not have been able to write f5"
-
 echo "Repair fs"
 _scratch_unmount 2>&1 | _filter_scratch
 _repair_scratch_fs >> $seqres.full 2>&1
diff --git a/tests/xfs/341.out b/tests/xfs/341.out
index 75a5bc6c61..580d788954 100644
--- a/tests/xfs/341.out
+++ b/tests/xfs/341.out
@@ -2,6 +2,5 @@ QA output created by 341
 Format and mount
 Create some files
 Corrupt fs
-Try to create more files
 Repair fs
 Try to create more files (again)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/13] xfs/3{43,32}: adapt tests for rt extent size greater than 1
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 13:59   ` [PATCH 07/13] xfs/341: update test for rtgroup-based rmap Darrick J. Wong
@ 2023-12-27 13:59   ` Darrick J. Wong
  2023-12-27 14:00   ` [PATCH 09/13] xfs: skip tests if formatting small filesystem fails Darrick J. Wong
                     ` (4 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 13:59 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Both of these tests for the realtime volume can fail when the rt extent
size is larger than a single block.

332 is a read-write functionality test that encodes md5sum in the
output, so we need to skip it if $blksz isn't congruent with the extent
size, because the fcollapse call will fail.

343 is a test of the rmap btree, so the fix here is simpler -- make
$blksz the file allocation unit, and get rid of the md5sum in the
golden output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/332     |    6 +-----
 tests/xfs/332.out |    2 --
 tests/xfs/343     |    2 ++
 3 files changed, 3 insertions(+), 7 deletions(-)


diff --git a/tests/xfs/332 b/tests/xfs/332
index a2d37ee905..c1ac87adcb 100755
--- a/tests/xfs/332
+++ b/tests/xfs/332
@@ -28,7 +28,7 @@ rm -f "$seqres.full"
 echo "Format and mount"
 _scratch_mkfs > "$seqres.full" 2>&1
 _scratch_mount
-blksz=65536
+blksz=$(_get_file_block_size $SCRATCH_MNT) # 65536
 blocks=16
 len=$((blocks * blksz))
 
@@ -45,10 +45,6 @@ $XFS_IO_PROG -c "fpunch $blksz $blksz" \
 	-c "fcollapse $((9 * blksz)) $blksz" \
 	-c "finsert $((10 * blksz)) $blksz" $SCRATCH_MNT/f1 >> $seqres.full
 
-echo "Check file"
-md5sum $SCRATCH_MNT/f1 | _filter_scratch
-od -tx1 -Ad -c $SCRATCH_MNT/f1 >> $seqres.full
-
 echo "Unmount"
 _scratch_unmount
 
diff --git a/tests/xfs/332.out b/tests/xfs/332.out
index 9beff7cc37..3a7ca95b40 100644
--- a/tests/xfs/332.out
+++ b/tests/xfs/332.out
@@ -2,8 +2,6 @@ QA output created by 332
 Format and mount
 Create some files
 Manipulate file
-Check file
-e45c5707fcf6817e914ffb6ce37a0ac7  SCRATCH_MNT/f1
 Unmount
 Try a regular fsmap
 Try a bad fsmap
diff --git a/tests/xfs/343 b/tests/xfs/343
index bffcc7d9ac..fe461847ed 100755
--- a/tests/xfs/343
+++ b/tests/xfs/343
@@ -31,6 +31,8 @@ blksz=65536
 blocks=16
 len=$((blocks * blksz))
 
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
+
 echo "Create some files"
 $XFS_IO_PROG -f -R -c "falloc 0 $len" -c "pwrite -S 0x68 -b 1048576 0 $len" $SCRATCH_MNT/f1 >> $seqres.full
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/13] xfs: skip tests if formatting small filesystem fails
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 13:59   ` [PATCH 08/13] xfs/3{43,32}: adapt tests for rt extent size greater than 1 Darrick J. Wong
@ 2023-12-27 14:00   ` Darrick J. Wong
  2023-12-27 14:00   ` [PATCH 10/13] xfs/443: use file allocation unit, not dbsize Darrick J. Wong
                     ` (3 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

There are a few tests that try to exercise XFS functionality with an
unusually small (< 500MB) filesystem.  Formatting can fail if the test
configuration also specifies a very large realtime device because mkfs
hits ENOSPC when allocating the realtime metadata.  The test proceeds
anyway (which causes an immediate mount failure) so we might as well
skip these.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/104 |    1 +
 tests/xfs/291 |    3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/104 b/tests/xfs/104
index d16f46d8e4..c3d1d18a58 100755
--- a/tests/xfs/104
+++ b/tests/xfs/104
@@ -16,6 +16,7 @@ _create_scratch()
 {
 	echo "*** mkfs"
 	_scratch_mkfs_xfs $@ | tee -a $seqres.full | _filter_mkfs 2>$tmp.mkfs
+	test "${PIPESTATUS[0]}" -eq 0 || _notrun "formatting small scratch fs failed"
 	. $tmp.mkfs
 
 	echo "*** mount"
diff --git a/tests/xfs/291 b/tests/xfs/291
index 600dcb2eba..70e5f51cee 100755
--- a/tests/xfs/291
+++ b/tests/xfs/291
@@ -18,7 +18,8 @@ _require_command "$XFS_MDRESTORE_PROG" "xfs_mdrestore"
 # real QA test starts here
 _require_scratch
 logblks=$(_scratch_find_xfs_min_logblocks -n size=16k -d size=133m)
-_scratch_mkfs_xfs -n size=16k -l size=${logblks}b -d size=133m >> $seqres.full 2>&1
+_scratch_mkfs_xfs -n size=16k -l size=${logblks}b -d size=133m >> $seqres.full 2>&1 || \
+	_notrun "formatting small scratch fs failed"
 _scratch_mount
 
 # First we cause very badly fragmented freespace, then


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/13] xfs/443: use file allocation unit, not dbsize
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-27 14:00   ` [PATCH 09/13] xfs: skip tests if formatting small filesystem fails Darrick J. Wong
@ 2023-12-27 14:00   ` Darrick J. Wong
  2023-12-27 14:00   ` [PATCH 11/13] populate: adjust rtrmap calculations for rtgroups Darrick J. Wong
                     ` (2 subsequent siblings)
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

We can only punch in units of file allocation boundaries, so update this
test to use that instead of the fs blocksize.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/443 |    9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)


diff --git a/tests/xfs/443 b/tests/xfs/443
index 56828decae..ab3cda59f3 100755
--- a/tests/xfs/443
+++ b/tests/xfs/443
@@ -40,14 +40,15 @@ _scratch_mount
 
 file1=$SCRATCH_MNT/file1
 file2=$SCRATCH_MNT/file2
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
 
 # The goal is run an extent swap where one of the associated files has the
 # minimum number of extents to remain in btree format. First, create a couple
 # files with large enough extent counts (200 or so should be plenty) to ensure
 # btree format on the largest possible inode size filesystems.
-$XFS_IO_PROG -fc "falloc 0 $((400 * dbsize))" $file1
+$XFS_IO_PROG -fc "falloc 0 $((400 * file_blksz))" $file1
 $here/src/punch-alternating $file1
-$XFS_IO_PROG -fc "falloc 0 $((400 * dbsize))" $file2
+$XFS_IO_PROG -fc "falloc 0 $((400 * file_blksz))" $file2
 $here/src/punch-alternating $file2
 
 # Now run an extent swap at every possible extent count down to 0. Depending on
@@ -55,12 +56,12 @@ $here/src/punch-alternating $file2
 # btree format.
 for i in $(seq 1 2 399); do
 	# punch one extent from the tmpfile and swap
-	$XFS_IO_PROG -c "fpunch $((i * dbsize)) $dbsize" $file2
+	$XFS_IO_PROG -c "fpunch $((i * file_blksz)) $file_blksz" $file2
 	$XFS_IO_PROG -c "swapext $file2" $file1
 
 	# punch the same extent from the old fork (now in file2) to resync the
 	# extent counts and repeat
-	$XFS_IO_PROG -c "fpunch $((i * dbsize)) $dbsize" $file2
+	$XFS_IO_PROG -c "fpunch $((i * file_blksz)) $file_blksz" $file2
 done
 
 # sanity check that no extents are left over


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/13] populate: adjust rtrmap calculations for rtgroups
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-27 14:00   ` [PATCH 10/13] xfs/443: use file allocation unit, not dbsize Darrick J. Wong
@ 2023-12-27 14:00   ` Darrick J. Wong
  2023-12-27 14:00   ` [PATCH 12/13] populate: check that we created a realtime rmap btree of the given height Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 13/13] fuzzy: create missing fuzz tests for rt rmap btrees Darrick J. Wong
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Now that we've sharded the realtime volume and created per-group rmap
btrees, we need to adjust downward the size of rtrmapbt records since
the block counts are now 32-bit instead of 64-bit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/populate |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


diff --git a/common/populate b/common/populate
index dc89eee70e..d8d19bf782 100644
--- a/common/populate
+++ b/common/populate
@@ -450,7 +450,7 @@ _scratch_xfs_populate() {
 	is_rt="$(_xfs_get_rtextents "$SCRATCH_MNT")"
 	if [ $is_rmapbt -gt 0 ] && [ $is_rt -gt 0 ]; then
 		echo "+ rtrmapbt btree"
-		nr="$((blksz * 2 / 32))"
+		nr="$((blksz * 2 / 24))"
 		$XFS_IO_PROG -R -f -c 'truncate 0' "${SCRATCH_MNT}/RTRMAPBT"
 		__populate_create_file $((blksz * nr)) "${SCRATCH_MNT}/RTRMAPBT"
 	fi


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/13] populate: check that we created a realtime rmap btree of the given height
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-27 14:00   ` [PATCH 11/13] populate: adjust rtrmap calculations for rtgroups Darrick J. Wong
@ 2023-12-27 14:00   ` Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 13/13] fuzzy: create missing fuzz tests for rt rmap btrees Darrick J. Wong
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Make sure that we actually create an rt rmap btree of the desired height
somewhere in the filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/populate |   34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)


diff --git a/common/populate b/common/populate
index d8d19bf782..84ff9304e3 100644
--- a/common/populate
+++ b/common/populate
@@ -740,6 +740,37 @@ __populate_check_xfs_agbtree_height() {
 	return 1
 }
 
+# Check that there's at least one rt btree with multiple levels
+__populate_check_xfs_rgbtree_height() {
+	local bt_type="$1"
+	local rgcount=$(_scratch_xfs_db -c 'sb 0' -c 'p rgcount' | awk '{print $3}')
+	local path
+	local path_format
+	local bt_prefix
+
+	case "${bt_type}" in
+	"rmap")
+		path_format="/realtime/%u.rmap"
+		bt_prefix="u3.rtrmapbt"
+		;;
+	*)
+		_fail "Don't know about rt btree ${bt_type}"
+		;;
+	esac
+
+	for ((rgno = 0; rgno < rgcount; rgno++)); do
+		path="$(printf "${path_format}" "${rgno}")"
+		bt_level=$(_scratch_xfs_db -c "path -m ${path}" -c "p ${bt_prefix}.level" | awk '{print $3}')
+		# "level" is the actual level within the btree
+		if [ "${bt_level}" -gt 0 ]; then
+			return 0
+		fi
+	done
+
+	__populate_fail "Failed to create rt ${bt_type} of sufficient height!"
+	return 1
+}
+
 # Check that populate created all the types of files we wanted
 _scratch_xfs_populate_check() {
 	_scratch_mount
@@ -763,6 +794,7 @@ _scratch_xfs_populate_check() {
 	is_finobt=$(_xfs_has_feature "$SCRATCH_MNT" finobt -v)
 	is_rmapbt=$(_xfs_has_feature "$SCRATCH_MNT" rmapbt -v)
 	is_reflink=$(_xfs_has_feature "$SCRATCH_MNT" reflink -v)
+	is_rt="$(_xfs_get_rtextents "$SCRATCH_MNT")"
 
 	blksz="$(stat -f -c '%s' "${SCRATCH_MNT}")"
 	dblksz="$(_xfs_get_dir_blocksize "$SCRATCH_MNT")"
@@ -793,6 +825,8 @@ _scratch_xfs_populate_check() {
 	test $is_finobt -ne 0 && __populate_check_xfs_agbtree_height "fino"
 	test $is_rmapbt -ne 0 && __populate_check_xfs_agbtree_height "rmap"
 	test $is_reflink -ne 0 && __populate_check_xfs_agbtree_height "refcnt"
+	test $is_rmapbt -ne 0 && test $is_rt -gt 0 && \
+		__populate_check_xfs_rgbtree_height "rmap"
 }
 
 # Check data fork format of ext4 file


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/13] fuzzy: create missing fuzz tests for rt rmap btrees
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-27 14:00   ` [PATCH 12/13] populate: check that we created a realtime rmap btree of the given height Darrick J. Wong
@ 2023-12-27 14:01   ` Darrick J. Wong
  12 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Back when I first created the fuzz tests for the realtime rmap btree, I
forgot a couple of things.  Add tests to fuzz rtrmap btree leaf records,
and node keys.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1528     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1528.out |    4 ++++
 tests/xfs/1529     |   40 ++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1529.out |    4 ++++
 tests/xfs/407      |    2 +-
 5 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100755 tests/xfs/1528
 create mode 100644 tests/xfs/1528.out
 create mode 100755 tests/xfs/1529
 create mode 100644 tests/xfs/1529.out


diff --git a/tests/xfs/1528 b/tests/xfs/1528
new file mode 100755
index 0000000000..24cbb55c23
--- /dev/null
+++ b/tests/xfs/1528
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1528
+#
+# Populate a XFS filesystem and fuzz every rtrmapbt record field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_xfs_scratch_rmapbt
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrmapbt recs"
+_scratch_xfs_fuzz_metadata '' 'both' "path -m $path" "addr u${inode_ver}.rtrmapbt.ptrs[1]" >> $seqres.full
+echo "Done fuzzing rtrmapbt recs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1528.out b/tests/xfs/1528.out
new file mode 100644
index 0000000000..b51b640c40
--- /dev/null
+++ b/tests/xfs/1528.out
@@ -0,0 +1,4 @@
+QA output created by 1528
+Format and populate
+Fuzz rtrmapbt recs
+Done fuzzing rtrmapbt recs
diff --git a/tests/xfs/1529 b/tests/xfs/1529
new file mode 100755
index 0000000000..3930edad47
--- /dev/null
+++ b/tests/xfs/1529
@@ -0,0 +1,40 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1529
+#
+# Populate a XFS filesystem and fuzz every rtrmapbt keyptr field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_xfs_scratch_rmapbt
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
+	_fail "could not find two-level rtrmapbt"
+
+echo "Fuzz rtrmapbt keyptrs"
+_scratch_xfs_fuzz_metadata '(rtrmapbt)' 'offline' "path -m $path" >> $seqres.full
+echo "Done fuzzing rtrmapbt keyptrs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1529.out b/tests/xfs/1529.out
new file mode 100644
index 0000000000..808fcc957f
--- /dev/null
+++ b/tests/xfs/1529.out
@@ -0,0 +1,4 @@
+QA output created by 1529
+Format and populate
+Fuzz rtrmapbt keyptrs
+Done fuzzing rtrmapbt keyptrs
diff --git a/tests/xfs/407 b/tests/xfs/407
index 2460ea336c..bd439105e2 100755
--- a/tests/xfs/407
+++ b/tests/xfs/407
@@ -26,7 +26,7 @@ _require_scratch_xfs_fuzz_fields
 echo "Format and populate"
 _scratch_populate_cached nofill > $seqres.full 2>&1
 
-path="$(_scratch_xfs_find_rgbtree_height 'rmap' 1)" || \
+path="$(_scratch_xfs_find_rgbtree_height 'rmap' 2)" || \
 	_fail "could not find two-level rtrmapbt"
 inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] fuzzy: create known output for rt rmap btree fuzz tests
  2023-12-31 20:01 ` [PATCHSET v2.0 5/9] fstests: establish baseline for realtime rmap fuzz tests Darrick J. Wong
@ 2023-12-27 14:01   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1528.out |   22 ++++++++
 tests/xfs/1529.out |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/406.out  |   22 ++++++++
 tests/xfs/408.out  |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/409.out  |   15 ++++++
 tests/xfs/481.out  |   22 ++++++++
 tests/xfs/482.out  |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 495 insertions(+)


diff --git a/tests/xfs/1528.out b/tests/xfs/1528.out
index b51b640c40..efa31b604e 100644
--- a/tests/xfs/1528.out
+++ b/tests/xfs/1528.out
@@ -1,4 +1,26 @@
 QA output created by 1528
 Format and populate
 Fuzz rtrmapbt recs
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
+recs[2].blockcount = middlebit: offline scrub didn't fail.
+recs[2].blockcount = lastbit: offline scrub didn't fail.
+recs[2].blockcount = add: offline scrub didn't fail.
+recs[3].blockcount = middlebit: offline scrub didn't fail.
+recs[3].blockcount = add: offline scrub didn't fail.
+recs[4].blockcount = middlebit: offline scrub didn't fail.
+recs[4].blockcount = add: offline scrub didn't fail.
+recs[5].blockcount = middlebit: offline scrub didn't fail.
+recs[5].blockcount = add: offline scrub didn't fail.
+recs[6].blockcount = middlebit: offline scrub didn't fail.
+recs[6].blockcount = add: offline scrub didn't fail.
+recs[7].blockcount = middlebit: offline scrub didn't fail.
+recs[7].blockcount = add: offline scrub didn't fail.
+recs[8].blockcount = middlebit: offline scrub didn't fail.
+recs[8].blockcount = add: offline scrub didn't fail.
+recs[9].blockcount = middlebit: offline scrub didn't fail.
+recs[9].blockcount = add: offline scrub didn't fail.
 Done fuzzing rtrmapbt recs
diff --git a/tests/xfs/1529.out b/tests/xfs/1529.out
index 808fcc957f..0810094a43 100644
--- a/tests/xfs/1529.out
+++ b/tests/xfs/1529.out
@@ -1,4 +1,142 @@
 QA output created by 1529
 Format and populate
 Fuzz rtrmapbt keyptrs
+u3.rtrmapbt.keys[1].startblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[1] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[2] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[3] = firstbit: offline scrub didn't fail.
 Done fuzzing rtrmapbt keyptrs
diff --git a/tests/xfs/406.out b/tests/xfs/406.out
index 6615533e6c..e95ac52fed 100644
--- a/tests/xfs/406.out
+++ b/tests/xfs/406.out
@@ -1,4 +1,26 @@
 QA output created by 406
 Format and populate
 Fuzz rtrmapbt recs
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
+recs[2].blockcount = middlebit: offline scrub didn't fail.
+recs[2].blockcount = lastbit: offline scrub didn't fail.
+recs[2].blockcount = add: offline scrub didn't fail.
+recs[3].blockcount = middlebit: offline scrub didn't fail.
+recs[3].blockcount = add: offline scrub didn't fail.
+recs[4].blockcount = middlebit: offline scrub didn't fail.
+recs[4].blockcount = add: offline scrub didn't fail.
+recs[5].blockcount = middlebit: offline scrub didn't fail.
+recs[5].blockcount = add: offline scrub didn't fail.
+recs[6].blockcount = middlebit: offline scrub didn't fail.
+recs[6].blockcount = add: offline scrub didn't fail.
+recs[7].blockcount = middlebit: offline scrub didn't fail.
+recs[7].blockcount = add: offline scrub didn't fail.
+recs[8].blockcount = middlebit: offline scrub didn't fail.
+recs[8].blockcount = add: offline scrub didn't fail.
+recs[9].blockcount = middlebit: offline scrub didn't fail.
+recs[9].blockcount = add: offline scrub didn't fail.
 Done fuzzing rtrmapbt recs
diff --git a/tests/xfs/408.out b/tests/xfs/408.out
index c01b2e26ae..5626c59560 100644
--- a/tests/xfs/408.out
+++ b/tests/xfs/408.out
@@ -1,4 +1,142 @@
 QA output created by 408
 Format and populate
 Fuzz rtrmapbt keyptrs
+u3.rtrmapbt.keys[1].startblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[1] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[2] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[3] = firstbit: offline scrub didn't fail.
 Done fuzzing rtrmapbt keyptrs
diff --git a/tests/xfs/409.out b/tests/xfs/409.out
index 690ac99b10..3d0a67afa8 100644
--- a/tests/xfs/409.out
+++ b/tests/xfs/409.out
@@ -1,4 +1,19 @@
 QA output created by 409
 Format and populate
 Fuzz rtrmapbt keyptrs
+u3.rtrmapbt.level = ones: mount failed (32).
+u3.rtrmapbt.level = firstbit: mount failed (32).
+u3.rtrmapbt.level = middlebit: mount failed (32).
+u3.rtrmapbt.level = add: mount failed (32).
+u3.rtrmapbt.level = sub: mount failed (32).
+u3.rtrmapbt.numrecs = zeroes: online repair failed (1).
+u3.rtrmapbt.numrecs = zeroes: online re-scrub failed (5).
+u3.rtrmapbt.numrecs = zeroes: offline re-scrub failed (1).
+u3.rtrmapbt.numrecs = zeroes: online post-mod scrub failed (4).
+u3.rtrmapbt.numrecs = zeroes: offline post-mod scrub failed (1).
+u3.rtrmapbt.numrecs = ones: mount failed (32).
+u3.rtrmapbt.numrecs = firstbit: mount failed (32).
+u3.rtrmapbt.numrecs = middlebit: mount failed (32).
+u3.rtrmapbt.numrecs = add: mount failed (32).
+u3.rtrmapbt.numrecs = sub: mount failed (32).
 Done fuzzing rtrmapbt keyptrs
diff --git a/tests/xfs/481.out b/tests/xfs/481.out
index d570dc8f21..1e080de148 100644
--- a/tests/xfs/481.out
+++ b/tests/xfs/481.out
@@ -1,4 +1,26 @@
 QA output created by 481
 Format and populate
 Fuzz rtrmapbt recs
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
+recs[2].blockcount = middlebit: offline scrub didn't fail.
+recs[2].blockcount = lastbit: offline scrub didn't fail.
+recs[2].blockcount = add: offline scrub didn't fail.
+recs[3].blockcount = middlebit: offline scrub didn't fail.
+recs[3].blockcount = add: offline scrub didn't fail.
+recs[4].blockcount = middlebit: offline scrub didn't fail.
+recs[4].blockcount = add: offline scrub didn't fail.
+recs[5].blockcount = middlebit: offline scrub didn't fail.
+recs[5].blockcount = add: offline scrub didn't fail.
+recs[6].blockcount = middlebit: offline scrub didn't fail.
+recs[6].blockcount = add: offline scrub didn't fail.
+recs[7].blockcount = middlebit: offline scrub didn't fail.
+recs[7].blockcount = add: offline scrub didn't fail.
+recs[8].blockcount = middlebit: offline scrub didn't fail.
+recs[8].blockcount = add: offline scrub didn't fail.
+recs[9].blockcount = middlebit: offline scrub didn't fail.
+recs[9].blockcount = add: offline scrub didn't fail.
 Done fuzzing rtrmapbt recs
diff --git a/tests/xfs/482.out b/tests/xfs/482.out
index 6bdf7a9fc1..c945b7aaf8 100644
--- a/tests/xfs/482.out
+++ b/tests/xfs/482.out
@@ -1,4 +1,142 @@
 QA output created by 482
 Format and populate
 Fuzz rtrmapbt keyptrs
+u3.rtrmapbt.keys[1].startblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[1].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[2].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].startblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].owner_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = zeroes: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].offset_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].attrfork_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = ones: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = middlebit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = lastbit: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = add: offline scrub didn't fail.
+u3.rtrmapbt.keys[3].bmbtblock_hi = sub: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[1] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[2] = firstbit: offline scrub didn't fail.
+u3.rtrmapbt.ptrs[3] = firstbit: offline scrub didn't fail.
 Done fuzzing rtrmapbt keyptrs


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/9] xfs/122: update fields for realtime reflink
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
@ 2023-12-27 14:01   ` Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 2/9] common/populate: create realtime refcount btree Darrick J. Wong
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Add a few more ondisk structures for realtime reflink.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/122.out |    3 +++
 1 file changed, 3 insertions(+)


diff --git a/tests/xfs/122.out b/tests/xfs/122.out
index f5621cac12..9c2d2fffb4 100644
--- a/tests/xfs/122.out
+++ b/tests/xfs/122.out
@@ -120,6 +120,9 @@ sizeof(struct xfs_rmap_key) = 20
 sizeof(struct xfs_rmap_rec) = 24
 sizeof(struct xfs_rtbuf_blkinfo) = 48
 sizeof(struct xfs_rtgroup_geometry) = 128
+sizeof(struct xfs_rtrefcount_key) = 4
+sizeof(struct xfs_rtrefcount_rec) = 12
+sizeof(struct xfs_rtrefcount_root) = 4
 sizeof(struct xfs_rtrmap_key) = 20
 sizeof(struct xfs_rtrmap_rec) = 24
 sizeof(struct xfs_rtrmap_root) = 4


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/9] common/populate: create realtime refcount btree
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 1/9] xfs/122: update fields for realtime reflink Darrick J. Wong
@ 2023-12-27 14:01   ` Darrick J. Wong
  2023-12-27 14:02   ` [PATCH 3/9] xfs: create fuzz tests for the " Darrick J. Wong
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Populate a realtime refcount btree when we're creating a sample fs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/populate |   26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)


diff --git a/common/populate b/common/populate
index 84ff9304e3..1e51eedddc 100644
--- a/common/populate
+++ b/common/populate
@@ -438,16 +438,30 @@ _scratch_xfs_populate() {
 	local dir="${SCRATCH_MNT}/INOBT"
 	__populate_create_dir "${dir}" "${nr}" true --file-pct 100
 
-	# Reverse-mapping btree
+	is_rt="$(_xfs_get_rtextents "$SCRATCH_MNT")"
 	is_rmapbt="$(_xfs_has_feature "$SCRATCH_MNT" rmapbt -v)"
+	is_reflink="$(_xfs_has_feature "$SCRATCH_MNT" reflink -v)"
+
+	# Reverse-mapping btree
 	if [ $is_rmapbt -gt 0 ]; then
 		echo "+ rmapbt btree"
 		nr="$((blksz * 2 / 24))"
 		__populate_create_file $((blksz * nr)) "${SCRATCH_MNT}/RMAPBT"
 	fi
 
+	# Realtime Reference-count btree comes before the rtrmapbt so that
+	# the refcount entries are created in rtgroup 0.
+	if [ $is_reflink -gt 0 ] && [ $is_rt -gt 0 ]; then
+		echo "+ rtreflink btree"
+		rt_blksz=$(_xfs_get_rtextsize "$SCRATCH_MNT")
+		nr="$((rt_blksz * 2 / 12))"
+		$XFS_IO_PROG -R -f -c 'truncate 0' "${SCRATCH_MNT}/RTREFCOUNTBT"
+		__populate_create_file $((blksz * nr)) "${SCRATCH_MNT}/RTREFCOUNTBT"
+		$XFS_IO_PROG -R -f -c 'truncate 0' "${SCRATCH_MNT}/RTREFCOUNTBT2"
+		cp --reflink=always "${SCRATCH_MNT}/RTREFCOUNTBT" "${SCRATCH_MNT}/RTREFCOUNTBT2"
+	fi
+
 	# Realtime Reverse-mapping btree
-	is_rt="$(_xfs_get_rtextents "$SCRATCH_MNT")"
 	if [ $is_rmapbt -gt 0 ] && [ $is_rt -gt 0 ]; then
 		echo "+ rtrmapbt btree"
 		nr="$((blksz * 2 / 24))"
@@ -456,7 +470,6 @@ _scratch_xfs_populate() {
 	fi
 
 	# Reference-count btree
-	is_reflink="$(_xfs_has_feature "$SCRATCH_MNT" reflink -v)"
 	if [ $is_reflink -gt 0 ]; then
 		echo "+ reflink btree"
 		nr="$((blksz * 2 / 12))"
@@ -512,6 +525,7 @@ _scratch_xfs_populate() {
 	__populate_fragment_file "${SCRATCH_MNT}/RMAPBT"
 	__populate_fragment_file "${SCRATCH_MNT}/RTRMAPBT"
 	__populate_fragment_file "${SCRATCH_MNT}/REFCOUNTBT"
+	__populate_fragment_file "${SCRATCH_MNT}/RTREFCOUNTBT"
 
 	umount "${SCRATCH_MNT}"
 }
@@ -753,6 +767,10 @@ __populate_check_xfs_rgbtree_height() {
 		path_format="/realtime/%u.rmap"
 		bt_prefix="u3.rtrmapbt"
 		;;
+	"refcnt")
+		path_format="/realtime/%u.refcount"
+		bt_prefix="u3.rtrefcbt"
+		;;
 	*)
 		_fail "Don't know about rt btree ${bt_type}"
 		;;
@@ -827,6 +845,8 @@ _scratch_xfs_populate_check() {
 	test $is_reflink -ne 0 && __populate_check_xfs_agbtree_height "refcnt"
 	test $is_rmapbt -ne 0 && test $is_rt -gt 0 && \
 		__populate_check_xfs_rgbtree_height "rmap"
+	test $is_reflink -ne 0 && test $is_rt -gt 0 && \
+		__populate_check_xfs_rgbtree_height "refcnt"
 }
 
 # Check data fork format of ext4 file


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/9] xfs: create fuzz tests for the realtime refcount btree
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 1/9] xfs/122: update fields for realtime reflink Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 2/9] common/populate: create realtime refcount btree Darrick J. Wong
@ 2023-12-27 14:02   ` Darrick J. Wong
  2023-12-27 14:02   ` [PATCH 4/9] xfs/27[24]: adapt for checking files on the realtime volume Darrick J. Wong
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:02 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Create fuzz tests for the realtime refcount btree record and key/ptr
blocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs         |    4 ++++
 tests/xfs/1538     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1538.out |    4 ++++
 tests/xfs/1539     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1539.out |    4 ++++
 tests/xfs/1540     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1540.out |    4 ++++
 tests/xfs/1541     |   42 ++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1541.out |    4 ++++
 tests/xfs/1542     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1542.out |    4 ++++
 tests/xfs/1543     |   40 ++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1543.out |    4 ++++
 tests/xfs/1544     |   40 ++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1544.out |    4 ++++
 tests/xfs/1545     |   41 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1545.out |    4 ++++
 17 files changed, 363 insertions(+)
 create mode 100755 tests/xfs/1538
 create mode 100644 tests/xfs/1538.out
 create mode 100755 tests/xfs/1539
 create mode 100644 tests/xfs/1539.out
 create mode 100755 tests/xfs/1540
 create mode 100644 tests/xfs/1540.out
 create mode 100755 tests/xfs/1541
 create mode 100644 tests/xfs/1541.out
 create mode 100755 tests/xfs/1542
 create mode 100644 tests/xfs/1542.out
 create mode 100755 tests/xfs/1543
 create mode 100644 tests/xfs/1543.out
 create mode 100755 tests/xfs/1544
 create mode 100644 tests/xfs/1544.out
 create mode 100755 tests/xfs/1545
 create mode 100644 tests/xfs/1545.out


diff --git a/common/xfs b/common/xfs
index b8dd8d4a40..aab04bfb18 100644
--- a/common/xfs
+++ b/common/xfs
@@ -1929,6 +1929,10 @@ _scratch_xfs_find_rgbtree_height() {
 		path_format="/realtime/%u.rmap"
 		bt_prefix="u3.rtrmapbt"
 		;;
+	"refcnt")
+		path_format="/realtime/%u.refcount"
+		bt_prefix="u3.rtrefcbt"
+		;;
 	*)
 		_fail "Don't know about rt btree ${bt_type}"
 		;;
diff --git a/tests/xfs/1538 b/tests/xfs/1538
new file mode 100755
index 0000000000..2bd630fdf1
--- /dev/null
+++ b/tests/xfs/1538
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1538
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt record field.
+# Use xfs_scrub to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_online_repair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrefcountbt recs"
+_scratch_xfs_fuzz_metadata '' 'online' "path -m $path" "addr u${inode_ver}.rtrefcbt.ptrs[1]" >> $seqres.full
+echo "Done fuzzing rtrefcountbt recs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1538.out b/tests/xfs/1538.out
new file mode 100644
index 0000000000..968cfd6ef9
--- /dev/null
+++ b/tests/xfs/1538.out
@@ -0,0 +1,4 @@
+QA output created by 1538
+Format and populate
+Fuzz rtrefcountbt recs
+Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1539 b/tests/xfs/1539
new file mode 100755
index 0000000000..8086943211
--- /dev/null
+++ b/tests/xfs/1539
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1539
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt record field.
+# Use xfs_repair to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_repair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrefcountbt recs"
+_scratch_xfs_fuzz_metadata '' 'offline' "path -m $path" "addr u${inode_ver}.rtrefcbt.ptrs[1]" >> $seqres.full
+echo "Done fuzzing rtrefcountbt recs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1539.out b/tests/xfs/1539.out
new file mode 100644
index 0000000000..aa3a963dc2
--- /dev/null
+++ b/tests/xfs/1539.out
@@ -0,0 +1,4 @@
+QA output created by 1539
+Format and populate
+Fuzz rtrefcountbt recs
+Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1540 b/tests/xfs/1540
new file mode 100755
index 0000000000..ab8dee2a02
--- /dev/null
+++ b/tests/xfs/1540
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1540
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt record field.
+# Do not fix the filesystem, to test metadata verifiers.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_norepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrefcountbt recs"
+_scratch_xfs_fuzz_metadata '' 'none' "path -m $path" "addr u${inode_ver}.rtrefcbt.ptrs[1]" >> $seqres.full
+echo "Done fuzzing rtrefcountbt recs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1540.out b/tests/xfs/1540.out
new file mode 100644
index 0000000000..37f3311837
--- /dev/null
+++ b/tests/xfs/1540.out
@@ -0,0 +1,4 @@
+QA output created by 1540
+Format and populate
+Fuzz rtrefcountbt recs
+Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1541 b/tests/xfs/1541
new file mode 100755
index 0000000000..18312456f4
--- /dev/null
+++ b/tests/xfs/1541
@@ -0,0 +1,42 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1541
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt record field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrefcountbt recs"
+_scratch_xfs_fuzz_metadata '' 'both' "path -m $path" "addr u${inode_ver}.rtrefcbt.ptrs[1]" >> $seqres.full
+echo "Done fuzzing rtrefcountbt recs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1541.out b/tests/xfs/1541.out
new file mode 100644
index 0000000000..35a9b73471
--- /dev/null
+++ b/tests/xfs/1541.out
@@ -0,0 +1,4 @@
+QA output created by 1541
+Format and populate
+Fuzz rtrefcountbt recs
+Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1542 b/tests/xfs/1542
new file mode 100755
index 0000000000..9246a31a31
--- /dev/null
+++ b/tests/xfs/1542
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1542
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt key/ptr field.
+# Use xfs_scrub to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_online_repair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+inode_ver=$(_scratch_xfs_get_metadata_field "core.version" "path -m $path")
+
+echo "Fuzz rtrefcountbt keyptrs"
+_scratch_xfs_fuzz_metadata '(rtrefcbt)' 'online' "path -m $path" >> $seqres.full
+echo "Done fuzzing rtrefcountbt keyptrs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1542.out b/tests/xfs/1542.out
new file mode 100644
index 0000000000..55d820b4b1
--- /dev/null
+++ b/tests/xfs/1542.out
@@ -0,0 +1,4 @@
+QA output created by 1542
+Format and populate
+Fuzz rtrefcountbt keyptrs
+Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1543 b/tests/xfs/1543
new file mode 100755
index 0000000000..38afae106a
--- /dev/null
+++ b/tests/xfs/1543
@@ -0,0 +1,40 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1543
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt key/ptr field.
+# Use xfs_repair to fix the corruption.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_scrub dangerous_repair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+
+echo "Fuzz rtrefcountbt keyptrs"
+_scratch_xfs_fuzz_metadata '(rtrefcbt)' 'offline' "path -m $path" >> $seqres.full
+echo "Done fuzzing rtrefcountbt keyptrs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1543.out b/tests/xfs/1543.out
new file mode 100644
index 0000000000..e7afa10744
--- /dev/null
+++ b/tests/xfs/1543.out
@@ -0,0 +1,4 @@
+QA output created by 1543
+Format and populate
+Fuzz rtrefcountbt keyptrs
+Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1544 b/tests/xfs/1544
new file mode 100755
index 0000000000..86b11b1955
--- /dev/null
+++ b/tests/xfs/1544
@@ -0,0 +1,40 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1544
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt key/ptr field.
+# Do not fix the filesystem, to test metadata verifiers.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_norepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+
+echo "Fuzz rtrefcountbt keyptrs"
+_scratch_xfs_fuzz_metadata '(rtrefcbt)' 'none' "path -m $path" >> $seqres.full
+echo "Done fuzzing rtrefcountbt keyptrs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1544.out b/tests/xfs/1544.out
new file mode 100644
index 0000000000..b39532c160
--- /dev/null
+++ b/tests/xfs/1544.out
@@ -0,0 +1,4 @@
+QA output created by 1544
+Format and populate
+Fuzz rtrefcountbt keyptrs
+Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1545 b/tests/xfs/1545
new file mode 100755
index 0000000000..1dbe03506b
--- /dev/null
+++ b/tests/xfs/1545
@@ -0,0 +1,41 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1545
+#
+# Populate a XFS filesystem and fuzz every rtrefcountbt key/ptr field.
+# Try online repair and, if necessary, offline repair,
+# to test the most likely usage pattern.
+
+. ./common/preamble
+_begin_fstest dangerous_fuzzers dangerous_bothrepair realtime
+
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/populate
+. ./common/fuzzy
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch_reflink
+_require_scratch_xfs_fuzz_fields
+_disable_dmesg_check
+
+echo "Format and populate"
+_scratch_populate_cached nofill > $seqres.full 2>&1
+
+path="$(_scratch_xfs_find_rgbtree_height 'refcnt' 2)" || \
+	_fail "could not find two-level rtrefcountbt"
+
+echo "Fuzz rtrefcountbt keyptrs"
+_scratch_xfs_fuzz_metadata '(rtrefcbt)' 'both' "path -m $path" >> $seqres.full
+echo "Done fuzzing rtrefcountbt keyptrs"
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1545.out b/tests/xfs/1545.out
new file mode 100644
index 0000000000..982a0d64df
--- /dev/null
+++ b/tests/xfs/1545.out
@@ -0,0 +1,4 @@
+QA output created by 1545
+Format and populate
+Fuzz rtrefcountbt keyptrs
+Done fuzzing rtrefcountbt keyptrs


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/9] xfs/27[24]: adapt for checking files on the realtime volume
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 14:02   ` [PATCH 3/9] xfs: create fuzz tests for the " Darrick J. Wong
@ 2023-12-27 14:02   ` Darrick J. Wong
  2023-12-27 14:02   ` [PATCH 5/9] xfs: race fsstress with realtime refcount btree scrub and repair Darrick J. Wong
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:02 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Adapt both tests to behave properly if the two files being tested are on
the realtime volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/272 |   40 +++++++++++++++++++++++++------------
 tests/xfs/274 |   62 ++++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 70 insertions(+), 32 deletions(-)


diff --git a/tests/xfs/272 b/tests/xfs/272
index d5f3a74177..edc5b16967 100755
--- a/tests/xfs/272
+++ b/tests/xfs/272
@@ -40,26 +40,40 @@ $here/src/punch-alternating $SCRATCH_MNT/urk >> $seqres.full
 ino=$(stat -c '%i' $SCRATCH_MNT/urk)
 
 echo "Get fsmap" | tee -a $seqres.full
-$XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT >> $seqres.full
 $XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT | tr '[]()' '    ' > $TEST_DIR/fsmap
+cat $TEST_DIR/fsmap >> $seqres.full
 
 echo "Get bmap" | tee -a $seqres.full
-$XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/urk >> $seqres.full
 $XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/urk | grep '^[[:space:]]*[0-9]*:' | grep -v 'hole' | tr '[]()' '    ' > $TEST_DIR/bmap
+cat $TEST_DIR/bmap >> $seqres.full
 
 echo "Check bmap and fsmap" | tee -a $seqres.full
-cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
-	qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total}(| [01]*)$"
-	echo "${qstr}" >> $seqres.full
-	grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
-	found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
-	test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
-done
+if $XFS_IO_PROG -c 'stat -v' $SCRATCH_MNT/urk | grep -q realtime; then
+	# file on rt volume
+	cat $TEST_DIR/bmap | while read ext offrange colon rtblockrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${rtblockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${total}(| [01]*)$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
 
-echo "Check device field of FS metadata and regular file"
-data_dev=$(grep 'inode btree' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
-rt_dev=$(grep "${ino}[[:space:]]*[0-9]*\.\.[0-9]*" $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
-test "${data_dev}" = "${rt_dev}" || echo "data ${data_dev} realtime ${rt_dev}?"
+	echo "Check device field of FS metadata and regular file"
+else
+	# file on data volume
+	cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total}(| [01]*)$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
+
+	echo "Check device field of FS metadata and regular file"
+	data_dev=$(grep 'inode btree' $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
+	rt_dev=$(grep "${ino}[[:space:]]*[0-9]*\.\.[0-9]*" $TEST_DIR/fsmap | head -n 1 | awk '{print $2}')
+	test "${data_dev}" = "${rt_dev}" || echo "data ${data_dev} realtime ${rt_dev}?"
+fi
 
 # success, all done
 status=0
diff --git a/tests/xfs/274 b/tests/xfs/274
index cd483d77bc..f16b49fc1c 100755
--- a/tests/xfs/274
+++ b/tests/xfs/274
@@ -40,34 +40,58 @@ _cp_reflink $SCRATCH_MNT/f1 $SCRATCH_MNT/f2
 ino=$(stat -c '%i' $SCRATCH_MNT/f1)
 
 echo "Get fsmap" | tee -a $seqres.full
-$XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT >> $seqres.full
 $XFS_IO_PROG -c 'fsmap -v' $SCRATCH_MNT | tr '[]()' '    ' > $TEST_DIR/fsmap
+cat $TEST_DIR/fsmap >> $seqres.full
 
 echo "Get f1 bmap" | tee -a $seqres.full
-$XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/f1 >> $seqres.full
 $XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/f1 | grep '^[[:space:]]*[0-9]*:' | grep -v 'hole' | tr '[]()' '    ' > $TEST_DIR/bmap
+cat $TEST_DIR/bmap >> $seqres.full
 
-echo "Check f1 bmap and fsmap" | tee -a $seqres.full
-cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
-	qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total} 010[01]{4}$"
-	echo "${qstr}" >> $seqres.full
-	grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
-	found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
-	test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
-done
+if _xfs_is_realtime_file $SCRATCH_MNT/f1 && ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+	# file on rt volume
+	echo "Check f1 bmap and fsmap" | tee -a $seqres.full
+	cat $TEST_DIR/bmap | while read ext offrange colon rtblockrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${rtblockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${total} 010[01]{4}$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
+else
+	# file on data volume
+	echo "Check f1 bmap and fsmap" | tee -a $seqres.full
+	cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total} 010[01]{4}$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
+fi
 
 echo "Get f2 bmap" | tee -a $seqres.full
-$XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/f2 >> $seqres.full
 $XFS_IO_PROG -c 'bmap -v' $SCRATCH_MNT/f2 | grep '^[[:space:]]*[0-9]*:' | grep -v 'hole' | tr '[]()' '    ' > $TEST_DIR/bmap
+cat $TEST_DIR/bmap >> $seqres.full
 
-echo "Check f2 bmap and fsmap" | tee -a $seqres.full
-cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
-	qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total} 010[01]{4}$"
-	echo "${qstr}" >> $seqres.full
-	grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
-	found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
-	test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
-done
+if _xfs_is_realtime_file $SCRATCH_MNT/f2 && ! _xfs_has_feature $SCRATCH_MNT rtgroups; then
+	echo "Check f2 bmap and fsmap" | tee -a $seqres.full
+	cat $TEST_DIR/bmap | while read ext offrange colon rtblockrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${rtblockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${total} 010[01]{4}$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
+else
+	echo "Check f2 bmap and fsmap" | tee -a $seqres.full
+	cat $TEST_DIR/bmap | while read ext offrange colon blockrange ag agrange total crap; do
+		qstr="^[[:space:]]*[0-9]*:[[:space:]]*[0-9]*:[0-9]*[[:space:]]*${blockrange} :[[:space:]]*${ino}[[:space:]]*${offrange}[[:space:]]*${ag}[[:space:]]*${agrange}[[:space:]]*${total} 010[01]{4}$"
+		echo "${qstr}" >> $seqres.full
+		grep "${qstr}" $TEST_DIR/fsmap >> $seqres.full
+		found=$(grep -E -c "${qstr}" $TEST_DIR/fsmap)
+		test $found -eq 1 || echo "Unexpected output for offset ${offrange}."
+	done
+fi
 
 # success, all done
 status=0


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/9] xfs: race fsstress with realtime refcount btree scrub and repair
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 14:02   ` [PATCH 4/9] xfs/27[24]: adapt for checking files on the realtime volume Darrick J. Wong
@ 2023-12-27 14:02   ` Darrick J. Wong
  2023-12-27 14:02   ` [PATCH 6/9] xfs: remove xfs/131 now that we allow reflink on realtime volumes Darrick J. Wong
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:02 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Race checking and rebuilding realtime refcount btrees with fsstress.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1818     |   43 +++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1818.out |    2 ++
 tests/xfs/1819     |   43 +++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1819.out |    2 ++
 4 files changed, 90 insertions(+)
 create mode 100755 tests/xfs/1818
 create mode 100644 tests/xfs/1818.out
 create mode 100755 tests/xfs/1819
 create mode 100644 tests/xfs/1819.out


diff --git a/tests/xfs/1818 b/tests/xfs/1818
new file mode 100755
index 0000000000..674fff25fa
--- /dev/null
+++ b/tests/xfs/1818
@@ -0,0 +1,43 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1818
+#
+# Race fsstress and rt refcount btree scrub for a while to see if we crash or
+# livelock.
+#
+. ./common/preamble
+_begin_fstest scrub dangerous_fsstress_scrub
+
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
+	cd /
+	rm -r -f $tmp.*
+}
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/fuzzy
+. ./common/inject
+. ./common/xfs
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch
+_require_xfs_stress_scrub
+
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount
+_require_xfs_has_feature "$SCRATCH_MNT" realtime
+_require_xfs_has_feature "$SCRATCH_MNT" reflink
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+_scratch_xfs_stress_scrub -s "scrub rtrefcountbt %rgno%"
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1818.out b/tests/xfs/1818.out
new file mode 100644
index 0000000000..700d301da1
--- /dev/null
+++ b/tests/xfs/1818.out
@@ -0,0 +1,2 @@
+QA output created by 1818
+Silence is golden
diff --git a/tests/xfs/1819 b/tests/xfs/1819
new file mode 100755
index 0000000000..e27a29271f
--- /dev/null
+++ b/tests/xfs/1819
@@ -0,0 +1,43 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1819
+#
+# Race fsstress and rt refcount btree scrub for a while to see if we crash or
+# livelock.
+#
+. ./common/preamble
+_begin_fstest online_repair dangerous_fsstress_repair
+
+_cleanup() {
+	_scratch_xfs_stress_scrub_cleanup &> /dev/null
+	cd /
+	rm -r -f $tmp.*
+}
+_register_cleanup "_cleanup" BUS
+
+# Import common functions.
+. ./common/filter
+. ./common/fuzzy
+. ./common/inject
+. ./common/xfs
+
+# real QA test starts here
+_supported_fs xfs
+_require_realtime
+_require_scratch
+_require_xfs_stress_online_repair
+
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount
+_require_xfs_has_feature "$SCRATCH_MNT" realtime
+_require_xfs_has_feature "$SCRATCH_MNT" reflink
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+_scratch_xfs_stress_online_repair -s "repair rtrefcountbt %rgno%"
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1819.out b/tests/xfs/1819.out
new file mode 100644
index 0000000000..041e17ab61
--- /dev/null
+++ b/tests/xfs/1819.out
@@ -0,0 +1,2 @@
+QA output created by 1819
+Silence is golden


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/9] xfs: remove xfs/131 now that we allow reflink on realtime volumes
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 14:02   ` [PATCH 5/9] xfs: race fsstress with realtime refcount btree scrub and repair Darrick J. Wong
@ 2023-12-27 14:02   ` Darrick J. Wong
  2023-12-27 14:03   ` [PATCH 7/9] xfs/856: add rtreflink upgrade to test matrix Darrick J. Wong
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:02 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Remove this test, since we now support reflink on the rt volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/131     |   48 ------------------------------------------------
 tests/xfs/131.out |    5 -----
 2 files changed, 53 deletions(-)
 delete mode 100755 tests/xfs/131
 delete mode 100644 tests/xfs/131.out


diff --git a/tests/xfs/131 b/tests/xfs/131
deleted file mode 100755
index 879e2dc6e8..0000000000
--- a/tests/xfs/131
+++ /dev/null
@@ -1,48 +0,0 @@
-#! /bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# Copyright (c) 2015, Oracle and/or its affiliates.  All Rights Reserved.
-#
-# FS QA Test No. 131
-#
-# Ensure that we can't reflink realtime files.
-#
-. ./common/preamble
-_begin_fstest auto quick clone realtime
-
-# Override the default cleanup function.
-_cleanup()
-{
-    cd /
-    umount $SCRATCH_MNT > /dev/null 2>&1
-    rm -rf $tmp.* $testdir $metadump_file
-}
-
-# Import common functions.
-. ./common/filter
-. ./common/reflink
-
-# real QA test starts here
-_supported_fs xfs
-_require_realtime
-_require_scratch_reflink
-_require_cp_reflink
-
-echo "Format and mount scratch device"
-_scratch_mkfs >> $seqres.full
-_scratch_mount
-
-testdir=$SCRATCH_MNT/test-$seq
-mkdir $testdir
-
-echo "Create the original file blocks"
-blksz=65536
-$XFS_IO_PROG -R -f -c "truncate $blksz" $testdir/file1
-
-echo "Reflink every block"
-_cp_reflink $testdir/file1 $testdir/file2 2>&1 | _filter_scratch
-
-test -s $testdir/file2 && _fail "Should not be able to reflink a realtime file."
-
-# success, all done
-status=0
-exit
diff --git a/tests/xfs/131.out b/tests/xfs/131.out
deleted file mode 100644
index 3c0186f0c7..0000000000
--- a/tests/xfs/131.out
+++ /dev/null
@@ -1,5 +0,0 @@
-QA output created by 131
-Format and mount scratch device
-Create the original file blocks
-Reflink every block
-cp: failed to clone 'SCRATCH_MNT/test-131/file2' from 'SCRATCH_MNT/test-131/file1': Invalid argument


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/9] xfs/856: add rtreflink upgrade to test matrix
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-27 14:02   ` [PATCH 6/9] xfs: remove xfs/131 now that we allow reflink on realtime volumes Darrick J. Wong
@ 2023-12-27 14:03   ` Darrick J. Wong
  2023-12-27 14:03   ` [PATCH 8/9] generic/331,xfs/240: support files that skip delayed allocation Darrick J. Wong
  2023-12-27 14:03   ` [PATCH 9/9] common/xfs: fix _xfs_get_file_block_size when rtinherit is set and no rt section Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:03 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Add realtime reflink to the features that this test will try to
upgrade.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1856 |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/tests/xfs/1856 b/tests/xfs/1856
index 8453d9bff2..4126c28e5d 100755
--- a/tests/xfs/1856
+++ b/tests/xfs/1856
@@ -205,12 +205,13 @@ post_exercise()
 # upgrade don't spread failure to the rest of the tests.
 FEATURES=()
 if rt_configured; then
-	# rmap wasn't added to rt devices until after metadir and rtgroups
+	# reflink & rmap weren't added to rt devices until after metadir and rtgroups
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade inobtcount && FEATURES+=("inobtcount")
 	check_repair_upgrade bigtime && FEATURES+=("bigtime")
 	check_repair_upgrade metadir && FEATURES+=("metadir")
 	supports_rtgroups && check_repair_upgrade rmapbt && FEATURES+=("rmapbt")
+	supports_rtgroups && check_repair_upgrade reflink && FEATURES+=("reflink")
 else
 	check_repair_upgrade finobt && FEATURES+=("finobt")
 	check_repair_upgrade rmapbt && FEATURES+=("rmapbt")


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/9] generic/331,xfs/240: support files that skip delayed allocation
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-27 14:03   ` [PATCH 7/9] xfs/856: add rtreflink upgrade to test matrix Darrick J. Wong
@ 2023-12-27 14:03   ` Darrick J. Wong
  2023-12-27 14:03   ` [PATCH 9/9] common/xfs: fix _xfs_get_file_block_size when rtinherit is set and no rt section Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:03 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The goal of this test is to ensure that log recovery finishes a copy on
write operation in the event of temporary media errors.  It's important
that the test observe some sort of IO error once we switch the scratch
device to fail all IOs, but regrettably the test encoded the specific
behavior of XFS and btrfs when the test was written -- the aio write
to the page cache doesn't have to touch the disk and succeeds, and the
fdatasync flushes things to disk and hits the IO error.

However, this is not how things work on the XFS realtime device.  There
is no delalloc on realtime, so the aio write allocates an unwritten
extent to stage the write.  The allocation fails due to EIO, so it's the
write call that fails.  Therefore, all we need to do is to detect an IO
error at any point between the write and the fdatasync call to be
satisfied that the test does what we want to do.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/generic/331     |   12 ++++++++++--
 tests/generic/331.out |    2 +-
 tests/xfs/240         |   13 +++++++++++--
 tests/xfs/240.out     |    2 +-
 4 files changed, 23 insertions(+), 6 deletions(-)


diff --git a/tests/generic/331 b/tests/generic/331
index 492abedf76..8c665ce4fc 100755
--- a/tests/generic/331
+++ b/tests/generic/331
@@ -59,9 +59,17 @@ echo "CoW and unmount"
 $XFS_IO_PROG -f -c "pwrite -S 0x63 $bufsize 1" $testdir/file2 >> $seqres.full
 $XFS_IO_PROG -f -c "pwrite -S 0x63 -b $bufsize 0 $filesize" $TEST_DIR/moo >> $seqres.full
 sync
+
+# If the filesystem supports delalloc, then the fdatasync will report an IO
+# error.  If the write goes directly to disk, then aiocp will return nonzero.
+unset write_failed
 _dmerror_load_error_table
-$AIO_TEST -b $bufsize $TEST_DIR/moo $testdir/file2 >> $seqres.full
-$XFS_IO_PROG -c "fdatasync" $testdir/file2
+$AIO_TEST -b $bufsize $TEST_DIR/moo $testdir/file2 &>> $seqres.full || \
+	write_failed=1
+$XFS_IO_PROG -c "fdatasync" $testdir/file2 2>&1 | grep -q 'Input.output error' && \
+	write_failed=1
+test -n $write_failed && echo "write failed"
+
 _dmerror_load_working_table
 _dmerror_unmount
 _dmerror_mount
diff --git a/tests/generic/331.out b/tests/generic/331.out
index adbf841d00..d8ccea704b 100644
--- a/tests/generic/331.out
+++ b/tests/generic/331.out
@@ -5,7 +5,7 @@ Compare files
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-331/file1
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-331/file2
 CoW and unmount
-fdatasync: Input/output error
+write failed
 Compare files
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-331/file1
 d94b0ab13385aba594411c174b1cc13c  SCRATCH_MNT/test-331/file2
diff --git a/tests/xfs/240 b/tests/xfs/240
index a65c270d23..cabe309201 100755
--- a/tests/xfs/240
+++ b/tests/xfs/240
@@ -66,8 +66,17 @@ $XFS_IO_PROG -f -c "pwrite -S 0x63 $bufsize 1" $testdir/file2 >> $seqres.full
 $XFS_IO_PROG -f -c "pwrite -S 0x63 -b $bufsize 0 $filesize" $TEST_DIR/moo >> $seqres.full
 sync
 _dmerror_load_error_table
-$AIO_TEST -b $bufsize $TEST_DIR/moo $testdir/file2 >> $seqres.full
-$XFS_IO_PROG -c "fdatasync" $testdir/file2
+
+# If the filesystem supports delalloc, then the fdatasync will report an IO
+# error.  If the write goes directly to disk, then aiocp will return nonzero.
+unset write_failed
+_dmerror_load_error_table
+$AIO_TEST -b $bufsize $TEST_DIR/moo $testdir/file2 &>> $seqres.full || \
+	write_failed=1
+$XFS_IO_PROG -c "fdatasync" $testdir/file2 2>&1 | grep -q 'Input.output error' && \
+	write_failed=1
+test -n $write_failed && echo "write failed"
+
 _dmerror_load_working_table
 _dmerror_unmount
 _dmerror_mount
diff --git a/tests/xfs/240.out b/tests/xfs/240.out
index 1a22e8a389..00bb116e5c 100644
--- a/tests/xfs/240.out
+++ b/tests/xfs/240.out
@@ -5,7 +5,7 @@ Compare files
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-240/file1
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-240/file2
 CoW and unmount
-fdatasync: Input/output error
+write failed
 Compare files
 1886e67cf8783e89ce6ddc5bb09a3944  SCRATCH_MNT/test-240/file1
 d94b0ab13385aba594411c174b1cc13c  SCRATCH_MNT/test-240/file2


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 9/9] common/xfs: fix _xfs_get_file_block_size when rtinherit is set and no rt section
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-27 14:03   ` [PATCH 8/9] generic/331,xfs/240: support files that skip delayed allocation Darrick J. Wong
@ 2023-12-27 14:03   ` Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:03 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

It's possible for the sysadmin to set rtinherit on the directory tree
even if there isn't a realtime section attached to the filesystem.  When
this is the case, the realtime flag is /not/ passed to new files, and
file data is written to the data device.  The file allocation unit for
the file is the fs blocksize, and it is not correct to use the rt
extent.

fstests can be fooled into doing the incorrect thing if test runner puts
'-d rtinherit=1 -r extsize=28k' into MKFS_OPTIONS without configuring a
realtime device.  This causes many tests to do the wrong thing because
they think they must operate on units of 28k (and not 4k).  Fix this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/xfs |   11 +++++++++++
 1 file changed, 11 insertions(+)


diff --git a/common/xfs b/common/xfs
index aab04bfb18..69a0eb620c 100644
--- a/common/xfs
+++ b/common/xfs
@@ -208,6 +208,8 @@ _xfs_get_file_block_size()
 {
 	local path="$1"
 
+	# If rtinherit or realtime are not set on the path, then all files
+	# will be created on the data device.
 	if ! ($XFS_IO_PROG -c "stat -v" "$path" 2>&1 | grep -E -q '(rt-inherit|realtime)'); then
 		_get_block_size "$path"
 		return
@@ -218,6 +220,15 @@ _xfs_get_file_block_size()
 	while ! $XFS_INFO_PROG "$path" &>/dev/null && [ "$path" != "/" ]; do
 		path="$(dirname "$path")"
 	done
+
+	# If there's no realtime section, the rtinherit and rextsize settings
+	# are irrelevant -- all files are created on the data device.
+	if $XFS_INFO_PROG "$path" | grep -q 'realtime =none'; then
+		_get_block_size "$path"
+		return
+	fi
+
+	# Otherwise, report the rt extent size.
 	_xfs_get_rtextsize "$path"
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] xfs: baseline golden output for rt refcount btree fuzz tests
  2023-12-31 20:01 ` [PATCHSET v2.0 7/9] fstests: establish baseline for realtime reflink fuzz tests Darrick J. Wong
@ 2023-12-27 14:03   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:03 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Record all the currently known failures of the xfs_scrub check and
repair code for the rt refcount btree fuzz tests.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1539.out |    6 ++++++
 tests/xfs/1540.out |    6 ++++++
 tests/xfs/1541.out |    6 ++++++
 tests/xfs/1542.out |   10 ++++++++++
 tests/xfs/1543.out |    2 ++
 tests/xfs/1544.out |   12 ++++++++++++
 tests/xfs/1545.out |   12 ++++++++++++
 7 files changed, 54 insertions(+)


diff --git a/tests/xfs/1539.out b/tests/xfs/1539.out
index aa3a963dc2..9f90912f2c 100644
--- a/tests/xfs/1539.out
+++ b/tests/xfs/1539.out
@@ -1,4 +1,10 @@
 QA output created by 1539
 Format and populate
 Fuzz rtrefcountbt recs
+numrecs = lastbit: offline scrub didn't fail.
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
 Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1540.out b/tests/xfs/1540.out
index 37f3311837..368dbace0f 100644
--- a/tests/xfs/1540.out
+++ b/tests/xfs/1540.out
@@ -1,4 +1,10 @@
 QA output created by 1540
 Format and populate
 Fuzz rtrefcountbt recs
+numrecs = lastbit: offline scrub didn't fail.
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
 Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1541.out b/tests/xfs/1541.out
index 35a9b73471..4af739233a 100644
--- a/tests/xfs/1541.out
+++ b/tests/xfs/1541.out
@@ -1,4 +1,10 @@
 QA output created by 1541
 Format and populate
 Fuzz rtrefcountbt recs
+numrecs = lastbit: offline scrub didn't fail.
+leftsib = add: offline scrub didn't fail.
+rightsib = ones: offline scrub didn't fail.
+rightsib = firstbit: offline scrub didn't fail.
+rightsib = lastbit: offline scrub didn't fail.
+rightsib = add: offline scrub didn't fail.
 Done fuzzing rtrefcountbt recs
diff --git a/tests/xfs/1542.out b/tests/xfs/1542.out
index 55d820b4b1..d6acd5257c 100644
--- a/tests/xfs/1542.out
+++ b/tests/xfs/1542.out
@@ -1,4 +1,14 @@
 QA output created by 1542
 Format and populate
 Fuzz rtrefcountbt keyptrs
+u3.rtrefcbt.level = ones: mount failed (32).
+u3.rtrefcbt.level = firstbit: mount failed (32).
+u3.rtrefcbt.level = middlebit: mount failed (32).
+u3.rtrefcbt.level = add: mount failed (32).
+u3.rtrefcbt.level = sub: mount failed (32).
+u3.rtrefcbt.numrecs = ones: mount failed (32).
+u3.rtrefcbt.numrecs = firstbit: mount failed (32).
+u3.rtrefcbt.numrecs = middlebit: mount failed (32).
+u3.rtrefcbt.numrecs = add: mount failed (32).
+u3.rtrefcbt.numrecs = sub: mount failed (32).
 Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1543.out b/tests/xfs/1543.out
index e7afa10744..59eb5ad149 100644
--- a/tests/xfs/1543.out
+++ b/tests/xfs/1543.out
@@ -1,4 +1,6 @@
 QA output created by 1543
 Format and populate
 Fuzz rtrefcountbt keyptrs
+u3.rtrefcbt.keys[1].startblock = zeroes: offline scrub didn't fail.
+u3.rtrefcbt.keys[1].startblock = lastbit: offline scrub didn't fail.
 Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1544.out b/tests/xfs/1544.out
index b39532c160..717f901bb4 100644
--- a/tests/xfs/1544.out
+++ b/tests/xfs/1544.out
@@ -1,4 +1,16 @@
 QA output created by 1544
 Format and populate
 Fuzz rtrefcountbt keyptrs
+u3.rtrefcbt.level = ones: mount failed (32).
+u3.rtrefcbt.level = firstbit: mount failed (32).
+u3.rtrefcbt.level = middlebit: mount failed (32).
+u3.rtrefcbt.level = add: mount failed (32).
+u3.rtrefcbt.level = sub: mount failed (32).
+u3.rtrefcbt.numrecs = ones: mount failed (32).
+u3.rtrefcbt.numrecs = firstbit: mount failed (32).
+u3.rtrefcbt.numrecs = middlebit: mount failed (32).
+u3.rtrefcbt.numrecs = add: mount failed (32).
+u3.rtrefcbt.numrecs = sub: mount failed (32).
+u3.rtrefcbt.keys[1].startblock = zeroes: offline scrub didn't fail.
+u3.rtrefcbt.keys[1].startblock = lastbit: offline scrub didn't fail.
 Done fuzzing rtrefcountbt keyptrs
diff --git a/tests/xfs/1545.out b/tests/xfs/1545.out
index 982a0d64df..d279dc0db8 100644
--- a/tests/xfs/1545.out
+++ b/tests/xfs/1545.out
@@ -1,4 +1,16 @@
 QA output created by 1545
 Format and populate
 Fuzz rtrefcountbt keyptrs
+u3.rtrefcbt.level = ones: mount failed (32).
+u3.rtrefcbt.level = firstbit: mount failed (32).
+u3.rtrefcbt.level = middlebit: mount failed (32).
+u3.rtrefcbt.level = add: mount failed (32).
+u3.rtrefcbt.level = sub: mount failed (32).
+u3.rtrefcbt.numrecs = ones: mount failed (32).
+u3.rtrefcbt.numrecs = firstbit: mount failed (32).
+u3.rtrefcbt.numrecs = middlebit: mount failed (32).
+u3.rtrefcbt.numrecs = add: mount failed (32).
+u3.rtrefcbt.numrecs = sub: mount failed (32).
+u3.rtrefcbt.keys[1].startblock = zeroes: offline scrub didn't fail.
+u3.rtrefcbt.keys[1].startblock = lastbit: offline scrub didn't fail.
 Done fuzzing rtrefcountbt keyptrs


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/5] xfs: make sure that CoW will write around when rextsize > 1
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
@ 2023-12-27 14:04   ` Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 2/5] xfs: skip cowextsize hint fragmentation tests on realtime volumes Darrick J. Wong
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:04 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Make sure that CoW triggers the intended copy-around behavior when we
write a tiny amount to the middle of a large rt extent.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1919     |  163 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1919.out |   84 +++++++++++++++++++++++++++
 2 files changed, 247 insertions(+)
 create mode 100755 tests/xfs/1919
 create mode 100644 tests/xfs/1919.out


diff --git a/tests/xfs/1919 b/tests/xfs/1919
new file mode 100755
index 0000000000..00b16737e7
--- /dev/null
+++ b/tests/xfs/1919
@@ -0,0 +1,163 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1919
+#
+# Make sure that copy on write actually does the intended write-around when we
+# stage a tiny modification to a large shared realtime extent.  We should never
+# end up with multiple rt extents mapped to the same region.
+#
+. ./common/preamble
+_begin_fstest auto quick clone realtime
+
+# Import common functions.
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs xfs
+_require_xfs_io_command "fpunch"
+_require_xfs_io_command "fzero"
+_require_xfs_io_command "fcollapse"
+_require_xfs_io_command "finsert"
+_require_xfs_io_command "funshare"
+_require_realtime
+_require_scratch_reflink
+
+rtextsz=262144
+filesz=$((rtextsz * 3))
+
+echo "Format filesystem and populate"
+_scratch_mkfs -m reflink=1 -r extsize=$rtextsz > $seqres.full
+_scratch_mount >> $seqres.full
+
+# Force all our files to be on the realtime device
+_xfs_force_bdev realtime $SCRATCH_MNT
+
+check_file() {
+	$XFS_IO_PROG -c fsync -c 'bmap -elpv' $1 >> $seqres.full
+	md5sum $SCRATCH_MNT/a | _filter_scratch
+	md5sum $1 | _filter_scratch
+}
+
+rtextsz_got=$(_xfs_get_rtextsize "$SCRATCH_MNT")
+test $rtextsz_got -eq $rtextsz || \
+	_notrun "got rtextsize $rtextsz_got, wanted $rtextsz"
+
+_pwrite_byte 0x59 0 $filesz $SCRATCH_MNT/a >> $seqres.full
+sync
+md5sum $SCRATCH_MNT/a | _filter_scratch
+$XFS_IO_PROG -c 'bmap -elpv' $SCRATCH_MNT/a >> $seqres.full
+
+echo "pwrite 1 byte in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/b
+_pwrite_byte 0x00 345678 1 $SCRATCH_MNT/b >> $seqres.full
+check_file $SCRATCH_MNT/b
+
+echo "mwrite 1 byte in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/c
+$XFS_IO_PROG -c "mmap -rw 0 $filesz" -c "mwrite -S 0x00 345678 1" -c msync $SCRATCH_MNT/c
+check_file $SCRATCH_MNT/c
+
+echo "fzero 1 byte in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/d
+$XFS_IO_PROG -c "fzero 345678 1" $SCRATCH_MNT/d
+check_file $SCRATCH_MNT/d
+
+echo "fpunch 1 byte in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/e
+$XFS_IO_PROG -c "fpunch 345678 1" $SCRATCH_MNT/e
+check_file $SCRATCH_MNT/e
+
+echo "funshare 1 byte in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/f
+$XFS_IO_PROG -c "funshare 345678 1" $SCRATCH_MNT/f
+check_file $SCRATCH_MNT/f
+
+echo "pwrite 1 block in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/g
+_pwrite_byte 0x00 327680 65536 $SCRATCH_MNT/g >> $seqres.full
+check_file $SCRATCH_MNT/g
+
+echo "mwrite 1 block in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/h
+$XFS_IO_PROG -c "mmap -rw 0 $filesz" -c "mwrite -S 0x00 327680 65536" -c msync $SCRATCH_MNT/h
+check_file $SCRATCH_MNT/h
+
+echo "fzero 1 block in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/i
+$XFS_IO_PROG -c "fzero 327680 65536" $SCRATCH_MNT/i
+check_file $SCRATCH_MNT/i
+
+echo "fpunch 1 block in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/j
+$XFS_IO_PROG -c "fpunch 327680 65536" $SCRATCH_MNT/j
+check_file $SCRATCH_MNT/j
+
+echo "funshare 1 block in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/k
+$XFS_IO_PROG -c "funshare 327680 65536" $SCRATCH_MNT/k
+check_file $SCRATCH_MNT/k
+
+echo "pwrite 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/l
+_pwrite_byte 0x00 262144 262144 $SCRATCH_MNT/l >> $seqres.full
+check_file $SCRATCH_MNT/l
+
+echo "mwrite 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/m
+$XFS_IO_PROG -c "mmap -rw 0 $filesz" -c "mwrite -S 0x00 262144 262144" -c msync $SCRATCH_MNT/m
+check_file $SCRATCH_MNT/m
+
+echo "fzero 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/n
+$XFS_IO_PROG -c "fzero 262144 262144" $SCRATCH_MNT/n
+check_file $SCRATCH_MNT/n
+
+echo "fpunch 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/o
+$XFS_IO_PROG -c "fpunch 262144 262144" $SCRATCH_MNT/o
+check_file $SCRATCH_MNT/o
+
+echo "funshare 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/p
+$XFS_IO_PROG -c "funshare 262144 262144" $SCRATCH_MNT/p
+check_file $SCRATCH_MNT/p
+
+echo "fcollapse 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/q
+$XFS_IO_PROG -c "fcollapse 262144 262144" $SCRATCH_MNT/q
+check_file $SCRATCH_MNT/q
+
+echo "finsert 1 extent in the middle" | tee -a $seqres.full
+_cp_reflink $SCRATCH_MNT/a $SCRATCH_MNT/r
+$XFS_IO_PROG -c "finsert 262144 262144" $SCRATCH_MNT/r
+check_file $SCRATCH_MNT/r
+
+echo "copy unwritten blocks in large rtext" | tee -a $seqres.full
+$XFS_IO_PROG -f -c "falloc 0 $filesz" -c 'pwrite -S 0x59 345678 1' $SCRATCH_MNT/s >> $seqres.full
+$XFS_IO_PROG -c 'bmap -elpv' $SCRATCH_MNT/s >> $seqres.full
+_cp_reflink $SCRATCH_MNT/s $SCRATCH_MNT/t
+$XFS_IO_PROG -c 'bmap -elpv' $SCRATCH_MNT/s >> $seqres.full
+$XFS_IO_PROG -f -c 'pwrite -S 0x59 1048576 1' $SCRATCH_MNT/s >> $seqres.full
+$XFS_IO_PROG -f -c 'pwrite -S 0x59 1048576 1' $SCRATCH_MNT/t >> $seqres.full
+check_file $SCRATCH_MNT/s
+check_file $SCRATCH_MNT/t
+
+echo "test writing to shared unwritten extent" | tee -a $seqres.full
+$XFS_IO_PROG -c 'bmap -elpv' $SCRATCH_MNT/s >> $seqres.full
+_cp_reflink $SCRATCH_MNT/s $SCRATCH_MNT/u
+$XFS_IO_PROG -c 'bmap -elpv' $SCRATCH_MNT/s >> $seqres.full
+$XFS_IO_PROG -f -c 'pwrite -S 0x59 345678 1' $SCRATCH_MNT/u >> $seqres.full
+check_file $SCRATCH_MNT/u
+
+echo "Remount and recheck" | tee -a $seqres.full
+md5sum $SCRATCH_MNT/a | _filter_scratch
+for i in b c d e f g h i j k l m n o p q r s t u; do
+	check_file $SCRATCH_MNT/$i | grep -v SCRATCH_MNT.a
+done
+
+# success, all done
+status=0
+exit
diff --git a/tests/xfs/1919.out b/tests/xfs/1919.out
new file mode 100644
index 0000000000..40cf5092a5
--- /dev/null
+++ b/tests/xfs/1919.out
@@ -0,0 +1,84 @@
+QA output created by 1919
+Format filesystem and populate
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+pwrite 1 byte in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/b
+mwrite 1 byte in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/c
+fzero 1 byte in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/d
+fpunch 1 byte in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/e
+funshare 1 byte in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/f
+pwrite 1 block in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/g
+mwrite 1 block in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/h
+fzero 1 block in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/i
+fpunch 1 block in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/j
+funshare 1 block in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/k
+pwrite 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/l
+mwrite 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/m
+fzero 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/n
+fpunch 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/o
+funshare 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/p
+fcollapse 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+b0581637c15320958874ef3f082111da  SCRATCH_MNT/q
+finsert 1 extent in the middle
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+a7359d0c100367c2cd430be334dffbd3  SCRATCH_MNT/r
+copy unwritten blocks in large rtext
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/s
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/t
+test writing to shared unwritten extent
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/u
+Remount and recheck
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/a
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/b
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/c
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/d
+ea0b05f13c8cce703accaffe56d59bd3  SCRATCH_MNT/e
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/f
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/g
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/h
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/i
+2a508e23efc80e468efa7004fd8a1839  SCRATCH_MNT/j
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/k
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/l
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/m
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/n
+352abe71b0d40f194b9d701750b0d7f3  SCRATCH_MNT/o
+924a97fdaa2ab30e2768081469e728a7  SCRATCH_MNT/p
+b0581637c15320958874ef3f082111da  SCRATCH_MNT/q
+a7359d0c100367c2cd430be334dffbd3  SCRATCH_MNT/r
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/s
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/t
+bff4e0a70430429c92d6139065e6949b  SCRATCH_MNT/u


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/5] xfs: skip cowextsize hint fragmentation tests on realtime volumes
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 1/5] xfs: make sure that CoW will write around when rextsize > 1 Darrick J. Wong
@ 2023-12-27 14:04   ` Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 3/5] misc: add more congruent oplen testing Darrick J. Wong
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:04 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

The XFS CoW extent size hint is ignored on realtime filesystems when the
rt extent size set to a unit larger than a single filesystem block
because it is assumed that the larger allocation unit is the
administrator's sole and mandatory anti-fragmentation strategy.  As
such, we can skip these tests.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/reflink |   27 +++++++++++++++++++++++++++
 tests/xfs/180  |    1 +
 tests/xfs/182  |    1 +
 tests/xfs/184  |    1 +
 tests/xfs/192  |    1 +
 tests/xfs/200  |    1 +
 tests/xfs/204  |    1 +
 tests/xfs/208  |    1 +
 tests/xfs/315  |    1 +
 tests/xfs/326  |    6 ++++++
 10 files changed, 41 insertions(+)


diff --git a/common/reflink b/common/reflink
index 22adc4449b..1082642e4e 100644
--- a/common/reflink
+++ b/common/reflink
@@ -521,3 +521,30 @@ _sweave_reflink_rainbow_delalloc() {
 		_pwrite_byte 0x62 $((blksz * i)) $blksz $dfile.chk
 	done
 }
+
+# Require that the COW extent size hint can actually be used to combat
+# fragmentation on the scratch filesystem.  This is (so far) true for any
+# filesystem except for the ones where the realtime extent size is larger
+# than one fs block, for it is assumed that setting a rt extent size is the
+# preferred fragmentation avoidance strategy.
+_require_scratch_cowextsize_useful() {
+	local testfile=$SCRATCH_MNT/hascowextsize
+	local param="${1:-1m}"
+
+	rm -f $testfile
+	touch $testfile
+	local before="$($XFS_IO_PROG -c 'cowextsize' $testfile)"
+
+	$XFS_IO_PROG -c "cowextsize $param" $testfile
+	local after="$($XFS_IO_PROG -c 'cowextsize' $testfile)"
+	rm -f $testfile
+
+	test "$before" != "$after" || \
+		_notrun "setting cowextsize to $param had no effect"
+
+	local fileblocksize=$(_get_file_block_size $SCRATCH_MNT)
+	local fsblocksize=$(_get_block_size $SCRATCH_MNT)
+
+	test $fsblocksize -eq $fileblocksize || \
+		_notrun "XFS does not support cowextsize when rt extsize ($fileblocksize) > 1FSB ($fsblocksize)"
+}
diff --git a/tests/xfs/180 b/tests/xfs/180
index d2fac03a9e..e8e04062a8 100755
--- a/tests/xfs/180
+++ b/tests/xfs/180
@@ -38,6 +38,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/182 b/tests/xfs/182
index 511aca6f2d..7c0713b248 100755
--- a/tests/xfs/182
+++ b/tests/xfs/182
@@ -39,6 +39,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/184 b/tests/xfs/184
index 97f529f666..b2b4da6024 100755
--- a/tests/xfs/184
+++ b/tests/xfs/184
@@ -40,6 +40,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/192 b/tests/xfs/192
index ef7da55be5..0017597280 100755
--- a/tests/xfs/192
+++ b/tests/xfs/192
@@ -42,6 +42,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/200 b/tests/xfs/200
index a9d6ce1bb6..500564d9a8 100755
--- a/tests/xfs/200
+++ b/tests/xfs/200
@@ -42,6 +42,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/204 b/tests/xfs/204
index 7dacfa2de7..8cbdd6513f 100755
--- a/tests/xfs/204
+++ b/tests/xfs/204
@@ -44,6 +44,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_block_size $testdir)
diff --git a/tests/xfs/208 b/tests/xfs/208
index 1e7734b822..0d2401b908 100755
--- a/tests/xfs/208
+++ b/tests/xfs/208
@@ -41,6 +41,7 @@ nr=128
 filesize=$((blksz * nr))
 bufnr=16
 bufsize=$((blksz * bufnr))
+_require_scratch_cowextsize_useful $bufsize
 
 _require_fs_space $SCRATCH_MNT $((filesize / 1024 * 3 * 5 / 4))
 real_blksz=$(_get_file_block_size $testdir)
diff --git a/tests/xfs/315 b/tests/xfs/315
index 9f6b39c8cc..3a618a3680 100755
--- a/tests/xfs/315
+++ b/tests/xfs/315
@@ -38,6 +38,7 @@ echo "Format filesystem"
 _scratch_mkfs >/dev/null 2>&1
 _scratch_mount >> $seqres.full
 _require_congruent_file_oplen $SCRATCH_MNT $blksz
+_require_scratch_cowextsize_useful $sz
 
 $XFS_IO_PROG -c "cowextsize $sz" $SCRATCH_MNT
 
diff --git a/tests/xfs/326 b/tests/xfs/326
index ac620fc433..a3fed8b6ac 100755
--- a/tests/xfs/326
+++ b/tests/xfs/326
@@ -55,6 +55,12 @@ $XFS_IO_PROG -c "cowextsize $sz" $SCRATCH_MNT
 # staging extent for an unshared extent and trips over the injected error.
 _require_no_xfs_always_cow
 
+# This test uses a very large cowextszhint to manipulate the COW fork to
+# contain a large unwritten extent before injecting the error.  XFS ignores
+# cowextsize when the realtime extent size is greater than 1FSB, so this test
+# cannot set up the preconditions for the test.
+_require_scratch_cowextsize_useful $sz
+
 echo "Create files"
 _pwrite_byte 0x66 0 $sz $SCRATCH_MNT/file1 >> $seqres.full
 _cp_reflink $SCRATCH_MNT/file1 $SCRATCH_MNT/file2


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/5] misc: add more congruent oplen testing
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 1/5] xfs: make sure that CoW will write around when rextsize > 1 Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 2/5] xfs: skip cowextsize hint fragmentation tests on realtime volumes Darrick J. Wong
@ 2023-12-27 14:04   ` Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 4/5] xfs: test COWing entire rt extents Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 5/5] generic/303: avoid test failures on weird rt extent sizes Darrick J. Wong
  4 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:04 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Do more checking for file allocation operation op length congruency.
This prevents tests from failing with EINVAL when the realtime extent
size is something weird like 28k or 1GB.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/generic/145 |    1 +
 tests/generic/147 |    1 +
 tests/generic/261 |    1 +
 tests/generic/262 |    1 +
 tests/generic/331 |    1 +
 tests/generic/353 |    2 +-
 tests/generic/517 |    1 +
 tests/generic/657 |    1 +
 tests/generic/658 |    1 +
 tests/generic/659 |    1 +
 tests/generic/660 |    1 +
 tests/generic/663 |    1 +
 tests/generic/664 |    1 +
 tests/generic/665 |    1 +
 tests/generic/670 |    1 +
 tests/generic/672 |    1 +
 tests/xfs/420     |    3 +++
 tests/xfs/421     |    3 +++
 tests/xfs/792     |    1 +
 19 files changed, 23 insertions(+), 1 deletion(-)


diff --git a/tests/generic/145 b/tests/generic/145
index f213f53be8..81fc5f6c2f 100755
--- a/tests/generic/145
+++ b/tests/generic/145
@@ -36,6 +36,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $TEST_DIR $blksz
 _pwrite_byte 0x61 0 $blksz $testdir/file1 >> $seqres.full
 _pwrite_byte 0x62 $blksz $blksz $testdir/file1 >> $seqres.full
 _pwrite_byte 0x63 $((blksz * 2)) $blksz $testdir/file1 >> $seqres.full
diff --git a/tests/generic/147 b/tests/generic/147
index 113800944b..bb17bb1c0b 100755
--- a/tests/generic/147
+++ b/tests/generic/147
@@ -35,6 +35,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $TEST_DIR $blksz
 _pwrite_byte 0x61 0 $blksz $testdir/file1 >> $seqres.full
 _pwrite_byte 0x62 $blksz $blksz $testdir/file1 >> $seqres.full
 _pwrite_byte 0x63 $((blksz * 2)) $blksz $testdir/file1 >> $seqres.full
diff --git a/tests/generic/261 b/tests/generic/261
index 93c1c349b1..deb360288e 100755
--- a/tests/generic/261
+++ b/tests/generic/261
@@ -29,6 +29,7 @@ testdir=$SCRATCH_MNT/test-$seq
 mkdir $testdir
 
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=5
 filesize=$((blksz * nr))
 
diff --git a/tests/generic/262 b/tests/generic/262
index 46e88f8731..f296e37e02 100755
--- a/tests/generic/262
+++ b/tests/generic/262
@@ -29,6 +29,7 @@ testdir=$SCRATCH_MNT/test-$seq
 mkdir $testdir
 
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=4
 filesize=$((blksz * nr))
 
diff --git a/tests/generic/331 b/tests/generic/331
index 8c665ce4fc..9b6801e16f 100755
--- a/tests/generic/331
+++ b/tests/generic/331
@@ -38,6 +38,7 @@ testdir=$SCRATCH_MNT/test-$seq
 mkdir $testdir
 
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=640
 bufnr=128
 filesize=$((blksz * nr))
diff --git a/tests/generic/353 b/tests/generic/353
index c563972510..6dbb0d4c24 100755
--- a/tests/generic/353
+++ b/tests/generic/353
@@ -30,7 +30,7 @@ _scratch_mkfs > /dev/null 2>&1
 _scratch_mount
 
 blocksize=$(_get_file_block_size $SCRATCH_MNT)
-
+_require_congruent_file_oplen $SCRATCH_MNT $blocksize
 file1="$SCRATCH_MNT/file1"
 file2="$SCRATCH_MNT/file2"
 extmap1="$SCRATCH_MNT/extmap1"
diff --git a/tests/generic/517 b/tests/generic/517
index cf3031ed2d..229358d06b 100755
--- a/tests/generic/517
+++ b/tests/generic/517
@@ -21,6 +21,7 @@ _require_scratch_dedupe
 
 _scratch_mkfs >>$seqres.full 2>&1
 _scratch_mount
+_require_congruent_file_oplen $SCRATCH_MNT 65536
 
 # The first byte with a value of 0xae starts at an offset (512Kb + 100) which is
 # not a multiple of the block size.
diff --git a/tests/generic/657 b/tests/generic/657
index e0fecd544c..9f4673dda3 100755
--- a/tests/generic/657
+++ b/tests/generic/657
@@ -30,6 +30,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _pwrite_byte 0x61 0 $filesize $testdir/file1 >> $seqres.full
diff --git a/tests/generic/658 b/tests/generic/658
index a5cbadaaa5..e9519c25e2 100755
--- a/tests/generic/658
+++ b/tests/generic/658
@@ -31,6 +31,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _weave_reflink_regular $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/659 b/tests/generic/659
index ccc2d7950d..05436edfab 100755
--- a/tests/generic/659
+++ b/tests/generic/659
@@ -31,6 +31,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _weave_reflink_unwritten $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/660 b/tests/generic/660
index bc17dc5e59..52b0d1ea9e 100755
--- a/tests/generic/660
+++ b/tests/generic/660
@@ -31,6 +31,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _weave_reflink_holes $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/663 b/tests/generic/663
index 658a5b7004..692c77b745 100755
--- a/tests/generic/663
+++ b/tests/generic/663
@@ -32,6 +32,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _sweave_reflink_regular $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/664 b/tests/generic/664
index 3009101fdc..40fb8c6d92 100755
--- a/tests/generic/664
+++ b/tests/generic/664
@@ -34,6 +34,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _sweave_reflink_unwritten $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/665 b/tests/generic/665
index 86ba578720..ee511755e6 100755
--- a/tests/generic/665
+++ b/tests/generic/665
@@ -34,6 +34,7 @@ mkdir $testdir
 
 echo "Create the original files"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=64
 filesize=$((blksz * nr))
 _sweave_reflink_holes $blksz $nr $testdir/file1 $testdir/file3 >> $seqres.full
diff --git a/tests/generic/670 b/tests/generic/670
index 67de167405..80f9fe6d4f 100755
--- a/tests/generic/670
+++ b/tests/generic/670
@@ -31,6 +31,7 @@ mkdir $testdir
 loops=512
 nr_loops=$((loops - 1))
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 
 echo "Initialize files"
 echo >> $seqres.full
diff --git a/tests/generic/672 b/tests/generic/672
index 9e3a97ec5e..0710a04294 100755
--- a/tests/generic/672
+++ b/tests/generic/672
@@ -30,6 +30,7 @@ mkdir $testdir
 loops=1024
 nr_loops=$((loops - 1))
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 
 echo "Initialize files"
 echo >> $seqres.full
diff --git a/tests/xfs/420 b/tests/xfs/420
index d38772c9d9..51f87bc304 100755
--- a/tests/xfs/420
+++ b/tests/xfs/420
@@ -69,6 +69,9 @@ exercise_lseek() {
 }
 
 blksz=65536
+# Golden output encodes SEEK_HOLE/DATA output, which depends on COW only
+# happening on $blksz granularity
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=8
 filesize=$((blksz * nr))
 
diff --git a/tests/xfs/421 b/tests/xfs/421
index 027ae47c21..429333e349 100755
--- a/tests/xfs/421
+++ b/tests/xfs/421
@@ -51,6 +51,9 @@ testdir=$SCRATCH_MNT/test-$seq
 mkdir $testdir
 
 blksz=65536
+# Golden output encodes SEEK_HOLE/DATA output, which depends on COW only
+# happening on $blksz granularity
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 nr=8
 filesize=$((blksz * nr))
 
diff --git a/tests/xfs/792 b/tests/xfs/792
index bfbfbce4aa..bfbfd8bfbc 100755
--- a/tests/xfs/792
+++ b/tests/xfs/792
@@ -32,6 +32,7 @@ _require_xfs_io_error_injection "bmap_finish_one"
 
 _scratch_mkfs >> $seqres.full
 _scratch_mount
+_require_congruent_file_oplen $SCRATCH_MNT 65536
 
 # Create original file
 _pwrite_byte 0x58 0 1m $SCRATCH_MNT/a >> $seqres.full


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/5] xfs: test COWing entire rt extents
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-27 14:04   ` [PATCH 3/5] misc: add more congruent oplen testing Darrick J. Wong
@ 2023-12-27 14:05   ` Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 5/5] generic/303: avoid test failures on weird rt extent sizes Darrick J. Wong
  4 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:05 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

For XFS filesystems with a realtime volume, an extent size larger than
1FSB, and reflink enabled, we use a "COW around" strategy that detects
file writes that are not aligned to the rt extent size and enlarges
those writes to dirty enough page cache so that the entire rt extent can
be COWed all at once.  This is a functional test to make sure that all
works properly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/1926     |  177 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1926.out |    2 +
 2 files changed, 179 insertions(+)
 create mode 100755 tests/xfs/1926
 create mode 100644 tests/xfs/1926.out


diff --git a/tests/xfs/1926 b/tests/xfs/1926
new file mode 100755
index 0000000000..91b611f59d
--- /dev/null
+++ b/tests/xfs/1926
@@ -0,0 +1,177 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1926
+#
+# Functional testing for COWing around realtime extents.
+#
+. ./common/preamble
+_begin_fstest auto clone realtime rw
+
+_cleanup()
+{
+	cd /
+	rm -r -f $tmp.* $testdir
+}
+
+. ./common/reflink
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_require_test
+_require_test_reflink
+_require_cp_reflink
+
+alloc_unit=$(_get_file_block_size $TEST_DIR)
+blksz=$(_get_block_size $TEST_DIR)
+
+testdir="$TEST_DIR/test-$seq"
+mkdir "$testdir"
+
+filesize="$(( (alloc_unit * 5 / 2) + 512))"
+origfile="$testdir/a"
+testfile0="$testdir/a0"
+testfile1="$testdir/a1"
+testfile2="$testdir/a2"
+testfile3="$testdir/a3"
+testfile4="$testdir/a4"
+testfile5="$testdir/a5"
+testfile6="$testdir/a6"
+testfile7="$testdir/a7"
+testfile8="$testdir/a8"
+testfile9="$testdir/a9"
+testfileA="$testdir/aA"
+testfileB="$testdir/aB"
+
+goldfile0="$testdir/g0"
+goldfile1="$testdir/g1"
+goldfile2="$testdir/g2"
+goldfile3="$testdir/g3"
+goldfile4="$testdir/g4"
+goldfile5="$testdir/g5"
+
+orig_cmd="pwrite -S 0x58 -b 1m 0 $filesize"
+$XFS_IO_PROG -f -c "$orig_cmd" -c fsync "$origfile" >> $seqres.full
+
+_cp_reflink "$origfile" "$testfile0"
+_cp_reflink "$origfile" "$testfile1"
+_cp_reflink "$origfile" "$testfile2"
+_cp_reflink "$origfile" "$testfile3"
+_cp_reflink "$origfile" "$testfile4"
+_cp_reflink "$origfile" "$testfile5"
+_cp_reflink "$origfile" "$testfile6"
+_cp_reflink "$origfile" "$testfile7"
+_cp_reflink "$origfile" "$testfile8"
+_cp_reflink "$origfile" "$testfile9"
+_cp_reflink "$origfile" "$testfileA"
+_cp_reflink "$origfile" "$testfileB"
+
+# First we try a partial ovewrite below EOF
+cmd="pwrite -S 0x59 -b 1m -W 512 512"
+_kernlog "********** buffered write below eof goldfile0"
+$XFS_IO_PROG -c "$cmd" "$testfile0" >> $seqres.full
+_kernlog "********** direct write below eof goldfile0"
+$XFS_IO_PROG -d -c "$cmd" "$testfile1" >> $seqres.full
+_kernlog "********** goldfile0"
+$XFS_IO_PROG -f -c "$orig_cmd" -c "$cmd" -c fsync "$goldfile0" >> $seqres.full
+
+# Next we try an appending write at EOF
+_kernlog "********** appending buffered write goldfile1"
+cmd="pwrite -S 0x5a -b 1m -W $filesize 512"
+$XFS_IO_PROG -c "$cmd" "$testfile2" >> $seqres.full
+_kernlog "********** appending direct write goldfile1"
+$XFS_IO_PROG -d -c "$cmd" "$testfile3" >> $seqres.full
+_kernlog "********** goldfile1"
+$XFS_IO_PROG -f -c "$orig_cmd" -c "$cmd" -c fsync "$goldfile1" >> $seqres.full
+
+# Third we try a pure overwrite of the second block
+_kernlog "********** buffered overwrite second block goldfile2"
+cmd="pwrite -S 0x5b -b 1m -W $alloc_unit $alloc_unit"
+$XFS_IO_PROG -c "$cmd" "$testfile4" >> $seqres.full
+_kernlog "********** direct overwrite second block goldfile2"
+$XFS_IO_PROG -d -c "$cmd" "$testfile5" >> $seqres.full
+_kernlog "********** goldfile2"
+$XFS_IO_PROG -f -c "$orig_cmd" -c "$cmd" -c fsync "$goldfile2" >> $seqres.full
+
+# Fourth we try a small write well beyond EOF
+_kernlog "********** buffered write beyond eof block goldfile3"
+cmd="pwrite -S 0x5c -b 1m -W $(( (filesize * 2) + 512)) 512"
+$XFS_IO_PROG -c "$cmd" "$testfile6" >> $seqres.full
+_kernlog "********** direct write beyond eof block goldfile3"
+$XFS_IO_PROG -d -c "$cmd" "$testfile7" >> $seqres.full
+_kernlog "********** goldfile3"
+$XFS_IO_PROG -f -c "$orig_cmd"  -c "$cmd" -c fsync "$goldfile3" >> $seqres.full
+
+# Fifth we try a large write well beyond EOF
+_kernlog "********** buffered write beyond eof block goldfile4"
+cmd="pwrite -S 0x5d -b 1m -W $(( ((filesize * 2) + 512) & ~(alloc_unit - 1) )) $alloc_unit"
+$XFS_IO_PROG -c "$cmd" "$testfile8" >> $seqres.full
+_kernlog "********** direct write beyond eof block goldfile4"
+$XFS_IO_PROG -d -c "$cmd" "$testfile9" >> $seqres.full
+_kernlog "********** goldfile3"
+$XFS_IO_PROG -f -c "$orig_cmd" -c "$cmd" -c fsync "$goldfile4" >> $seqres.full
+
+# Sixth we try a pure overwrite of the second fsblock
+_kernlog "********** buffered overwrite second fsblock goldfile5"
+cmd="pwrite -S 0x5b -b 1m -W $blksz $alloc_unit"
+$XFS_IO_PROG -c "$cmd" "$testfileA" >> $seqres.full
+_kernlog "********** direct overwrite second fsblock goldfile5"
+$XFS_IO_PROG -d -c "$cmd" "$testfileB" >> $seqres.full
+_kernlog "********** goldfile5"
+$XFS_IO_PROG -f -c "$orig_cmd" -c "$cmd" -c fsync "$goldfile5" >> $seqres.full
+
+_kernlog "********** done"
+
+corruption_noted=
+
+note_corruption() {
+	local fname="$1"
+
+	echo "$fname is bad"
+
+	if [ -z "$corruption_noted" ]; then
+		corruption_noted=1
+		echo "origfile" >> $seqres.full
+		od -tx1 -Ad -c "$origfile" >> $seqres.full
+	fi
+
+	echo "$fname" >> $seqres.full
+	od -tx1 -Ad -c "$testfile1" >> $seqres.full
+}
+
+echo "before remount" >> $seqres.full
+cmp -s "$goldfile0" "$testfile0" || note_corruption "$testfile0"
+cmp -s "$goldfile0" "$testfile1" || note_corruption "$testfile1"
+cmp -s "$goldfile1" "$testfile2" || note_corruption "$testfile2"
+cmp -s "$goldfile1" "$testfile3" || note_corruption "$testfile3"
+cmp -s "$goldfile2" "$testfile4" || note_corruption "$testfile4"
+cmp -s "$goldfile2" "$testfile5" || note_corruption "$testfile5"
+cmp -s "$goldfile3" "$testfile6" || note_corruption "$testfile6"
+cmp -s "$goldfile3" "$testfile7" || note_corruption "$testfile7"
+cmp -s "$goldfile4" "$testfile8" || note_corruption "$testfile8"
+cmp -s "$goldfile4" "$testfile9" || note_corruption "$testfile9"
+cmp -s "$goldfile5" "$testfileA" || note_corruption "$testfileA"
+cmp -s "$goldfile5" "$testfileB" || note_corruption "$testfileB"
+
+_test_cycle_mount
+
+echo "after remount" >> $seqres.full
+cmp -s "$goldfile0" "$testfile0" || note_corruption "$testfile0"
+cmp -s "$goldfile0" "$testfile1" || note_corruption "$testfile1"
+cmp -s "$goldfile1" "$testfile2" || note_corruption "$testfile2"
+cmp -s "$goldfile1" "$testfile3" || note_corruption "$testfile3"
+cmp -s "$goldfile2" "$testfile4" || note_corruption "$testfile4"
+cmp -s "$goldfile2" "$testfile5" || note_corruption "$testfile5"
+cmp -s "$goldfile3" "$testfile6" || note_corruption "$testfile6"
+cmp -s "$goldfile3" "$testfile7" || note_corruption "$testfile7"
+cmp -s "$goldfile4" "$testfile8" || note_corruption "$testfile8"
+cmp -s "$goldfile4" "$testfile9" || note_corruption "$testfile9"
+cmp -s "$goldfile5" "$testfileA" || note_corruption "$testfileA"
+cmp -s "$goldfile5" "$testfileB" || note_corruption "$testfileB"
+
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1926.out b/tests/xfs/1926.out
new file mode 100644
index 0000000000..a56601b969
--- /dev/null
+++ b/tests/xfs/1926.out
@@ -0,0 +1,2 @@
+QA output created by 1926
+Silence is golden


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/5] generic/303: avoid test failures on weird rt extent sizes
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-27 14:05   ` [PATCH 4/5] xfs: test COWing entire rt extents Darrick J. Wong
@ 2023-12-27 14:05   ` Darrick J. Wong
  4 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:05 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Fix this test to skip the high offset reflink test if (on XFS) the rt
extent size isn't congruent with the chosen target offset.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/rc         |   23 +++++++++++++++++++++++
 tests/generic/303 |    8 +++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)


diff --git a/common/rc b/common/rc
index f760eedc26..2d67f7dff1 100644
--- a/common/rc
+++ b/common/rc
@@ -4690,6 +4690,29 @@ _get_file_block_size()
 	esac
 }
 
+_test_congruent_file_oplen()
+{
+	local file="$1"
+	local alloc_unit=$(_get_file_block_size "$file")
+	local oplen="$2"
+
+	case $FSTYP in
+	nfs*|cifs|9p|virtiofs|ceph|glusterfs|overlay|pvfs2)
+		# Network filesystems don't know about (or tell the client
+		# about) the underlying file allocation unit and they generally
+		# pass the file IO request to the underlying filesystem, so we
+		# don't have anything to check here.
+		return
+		;;
+	esac
+
+	if [ $alloc_unit -gt $oplen ]; then
+		return 1
+	fi
+	test $((oplen % alloc_unit)) -eq 0 || return 1
+	return 0
+}
+
 # Given a file path and a byte length of a file operation under test, ensure
 # that the length is an integer multiple of the file's allocation unit size.
 # In other words, skip the test unless (oplen ≡ alloc_unit mod 0).  This is
diff --git a/tests/generic/303 b/tests/generic/303
index 95679569e4..ef88d2357b 100755
--- a/tests/generic/303
+++ b/tests/generic/303
@@ -48,7 +48,13 @@ echo "Reflink past maximum file size in dest file (should fail)"
 _reflink_range $testdir/file1 0 $testdir/file5 4611686018427322368 $len >> $seqres.full
 
 echo "Reflink high offset to low offset"
-_reflink_range $testdir/file1 $bigoff_64k $testdir/file6 1048576 65535 >> $seqres.full
+oplen=1048576
+if _test_congruent_file_oplen $testdir $oplen; then
+	_reflink_range $testdir/file1 $bigoff_64k $testdir/file6 $oplen 65535 >> $seqres.full
+else
+	# If we can't do the ficlonerange test, fake it in the output file
+	$XFS_IO_PROG -f -c 'pwrite -S 0x61 1114110 1' $testdir/file6 >> $seqres.full
+fi
 
 echo "Reflink past source file EOF (should fail)"
 _reflink_range $testdir/file2 524288 $testdir/file7 0 1048576 >> $seqres.full


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/3] common: enable testing of realtime quota when supported
  2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
@ 2023-12-27 14:05   ` Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 2/3] xfs: fix quota tests to adapt to realtime quota Darrick J. Wong
  2023-12-27 14:06   ` [PATCH 3/3] xfs: regression testing of quota on the realtime device Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:05 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

If the kernel advertises realtime quota support, test it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/populate |    2 +-
 common/quota    |   12 ++++++------
 common/xfs      |   12 ++++++++++++
 3 files changed, 19 insertions(+), 7 deletions(-)


diff --git a/common/populate b/common/populate
index 1e51eedddc..538cbc86fc 100644
--- a/common/populate
+++ b/common/populate
@@ -240,7 +240,7 @@ _populate_xfs_qmount_option()
 	if [ ! -f /proc/fs/xfs/xqmstat ]; then
 		# No quota support
 		return
-	elif [ "${USE_EXTERNAL}" = "yes" ] && [ ! -z "${SCRATCH_RTDEV}" ]; then
+	elif [ "${USE_EXTERNAL}" = "yes" ] && [ ! -z "${SCRATCH_RTDEV}" ] && ! _xfs_supports_rtquota; then
 		# Quotas not supported on rt filesystems
 		return
 	elif [ -z "${XFS_QUOTA_PROG}" ]; then
diff --git a/common/quota b/common/quota
index 6b529bf4b4..565057d932 100644
--- a/common/quota
+++ b/common/quota
@@ -23,10 +23,10 @@ _require_quota()
 	if [ ! -f /proc/fs/xfs/xqmstat ]; then
 	    _notrun "Installed kernel does not support XFS quotas"
         fi
-	if [ "$USE_EXTERNAL" = yes -a ! -z "$TEST_RTDEV" ]; then
+	if [ "$USE_EXTERNAL" = yes ] && [ -n "$TEST_RTDEV" ] && ! _xfs_supports_rtquota; then
 	    _notrun "Quotas not supported on realtime test device"
 	fi
-	if [ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ]; then
+	if [ "$USE_EXTERNAL" = yes ] && [ -n "$SCRATCH_RTDEV" ] && ! _xfs_supports_rtquota; then
 	    _notrun "Quotas not supported on realtime scratch device"
 	fi
 	;;
@@ -44,10 +44,10 @@ _require_xfs_quota()
 {
     $here/src/feature -q $TEST_DEV
     [ $? -ne 0 ] && _notrun "Installed kernel does not support XFS quota"
-    if [ "$USE_EXTERNAL" = yes -a ! -z "$TEST_RTDEV" ]; then
+    if [ "$USE_EXTERNAL" = yes ] && [ -n "$TEST_RTDEV" ] && ! _xfs_supports_rtquota; then
 	_notrun "Quotas not supported on realtime test device"
     fi
-    if [ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_RTDEV" ]; then
+    if [ "$USE_EXTERNAL" = yes ] && [ -n "$SCRATCH_RTDEV" ] && ! _xfs_supports_rtquota; then
 	_notrun "Quotas not supported on realtime scratch device"
     fi
     [ -n "$XFS_QUOTA_PROG" ] || _notrun "XFS quota user tools not installed"
@@ -153,8 +153,8 @@ _require_prjquota()
     fi
     $here/src/feature -P $_dev
     [ $? -ne 0 ] && _notrun "Installed kernel does not support project quotas"
-    if [ "$USE_EXTERNAL" = yes ]; then
-	if [ -n "$TEST_RTDEV" -o -n "$SCRATCH_RTDEV" ]; then
+    if [ "$FSTYP" = "xfs" ] && [ "$USE_EXTERNAL" = yes ]; then
+	if [ -n "$TEST_RTDEV" -o -n "$SCRATCH_RTDEV" ] && ! _xfs_supports_rtquota; then
 	    _notrun "Project quotas not supported on realtime filesystem"
 	fi
     fi
diff --git a/common/xfs b/common/xfs
index 69a0eb620c..56c4b20889 100644
--- a/common/xfs
+++ b/common/xfs
@@ -2194,3 +2194,15 @@ _scratch_find_rt_metadir_entry() {
 
 	return 1
 }
+
+_xfs_supports_rtquota() {
+	test "$FSTYP" = "xfs" || return 1
+
+	local xqmfile="/proc/fs/xfs/xqm"
+
+	test -e "$xqmfile" || modprobe xfs
+	test -e "$xqmfile" || return 1
+
+	local rtquota="$(cat "$xqmfile" | awk '{print $5}')"
+	test "$rtquota" = "1"
+}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/3] xfs: fix quota tests to adapt to realtime quota
  2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 1/3] common: enable testing of realtime quota when supported Darrick J. Wong
@ 2023-12-27 14:05   ` Darrick J. Wong
  2023-12-27 14:06   ` [PATCH 3/3] xfs: regression testing of quota on the realtime device Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:05 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Fix or limit the scope of tests so that we can turn on testing for
realtime quotas.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/quota      |   22 ++++++++++++++++++++++
 tests/generic/219 |    1 +
 tests/generic/230 |    1 +
 tests/generic/305 |    1 +
 tests/generic/326 |    1 +
 tests/generic/327 |    1 +
 tests/generic/328 |    1 +
 tests/generic/566 |    4 +++-
 tests/generic/587 |    1 +
 tests/generic/603 |    1 +
 tests/generic/691 |    2 ++
 tests/generic/710 |    4 +++-
 tests/xfs/050     |    2 ++
 tests/xfs/106     |    1 +
 tests/xfs/108     |    2 ++
 tests/xfs/152     |    1 +
 tests/xfs/153     |    2 ++
 tests/xfs/161     |    2 ++
 tests/xfs/213     |    1 +
 tests/xfs/214     |    1 +
 tests/xfs/220     |    2 ++
 tests/xfs/299     |    2 ++
 tests/xfs/330     |    1 +
 tests/xfs/434     |    1 +
 tests/xfs/435     |    1 +
 tests/xfs/441     |    1 +
 tests/xfs/442     |    1 +
 tests/xfs/508     |    2 ++
 tests/xfs/511     |   10 +++++++++-
 tests/xfs/720     |    5 +++++
 30 files changed, 75 insertions(+), 3 deletions(-)


diff --git a/common/quota b/common/quota
index 565057d932..06179eb8c8 100644
--- a/common/quota
+++ b/common/quota
@@ -467,5 +467,27 @@ _restore_project_quota()
 	fi
 }
 
+# Reconfigure the mounted fs as needed so that we can test the VFS quota
+# utilities.  They do not support realtime block limits or reporting, so
+# we forcibly inhibit rtinherit on XFS filesystems.
+_force_vfs_quota_testing()
+{
+	local mount="${1:-$TEST_DIR}"
+
+	test "$FSTYP" = "xfs" && _xfs_force_bdev data "$mount"
+}
+
+# Does the scratch filesystem have a realtime volume wherein quota works?
+_scratch_supports_rtquota()
+{
+	if [ "$FSTYP" = "xfs" ]; then
+		test "$USE_EXTERNAL" = yes && \
+		test -n "$SCRATCH_RTDEV" && \
+		_xfs_supports_rtquota
+	else
+		return 1
+	fi
+}
+
 # make sure this script returns success
 /bin/true
diff --git a/tests/generic/219 b/tests/generic/219
index 71da25e352..a38dc7d84a 100755
--- a/tests/generic/219
+++ b/tests/generic/219
@@ -83,6 +83,7 @@ test_accounting()
 _scratch_unmount 2>/dev/null
 _scratch_mkfs >> $seqres.full 2>&1
 _scratch_mount "-o usrquota,grpquota"
+_force_vfs_quota_testing $SCRATCH_MNT
 quotacheck -u -g $SCRATCH_MNT 2>/dev/null
 quotaon $SCRATCH_MNT 2>/dev/null
 _scratch_unmount
diff --git a/tests/generic/230 b/tests/generic/230
index e49e0da25c..8a0aac1e7b 100755
--- a/tests/generic/230
+++ b/tests/generic/230
@@ -103,6 +103,7 @@ _qmount_option 'defaults'
 
 _scratch_mkfs >> $seqres.full 2>&1
 _scratch_mount "-o usrquota,grpquota"
+_force_vfs_quota_testing $SCRATCH_MNT
 BLOCK_SIZE=$(_get_file_block_size $SCRATCH_MNT)
 quotacheck -u -g $SCRATCH_MNT 2>/dev/null
 quotaon $SCRATCH_MNT 2>/dev/null
diff --git a/tests/generic/305 b/tests/generic/305
index b46d512742..7c8f7692c1 100755
--- a/tests/generic/305
+++ b/tests/generic/305
@@ -27,6 +27,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/generic/326 b/tests/generic/326
index f5c557b3a0..b5e4100f4a 100755
--- a/tests/generic/326
+++ b/tests/generic/326
@@ -28,6 +28,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/generic/327 b/tests/generic/327
index 92540b19dd..dcdf2b3130 100755
--- a/tests/generic/327
+++ b/tests/generic/327
@@ -26,6 +26,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/generic/328 b/tests/generic/328
index db7fd3db41..26c89fff62 100755
--- a/tests/generic/328
+++ b/tests/generic/328
@@ -27,6 +27,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/generic/566 b/tests/generic/566
index 52b01f6d9e..052fabeb0e 100755
--- a/tests/generic/566
+++ b/tests/generic/566
@@ -37,7 +37,9 @@ _qmount
 dir="$SCRATCH_MNT/dummy"
 mkdir -p $dir
 chown $qa_user $dir
-$XFS_QUOTA_PROG -x -c "limit -g bsoft=100k bhard=100k $qa_user" $SCRATCH_MNT
+_scratch_supports_rtquota && \
+	extra_limits="rtbsoft=100k rtbhard=100k"
+$XFS_QUOTA_PROG -x -c "limit -g bsoft=100k bhard=100k $extra_limits $qa_user" $SCRATCH_MNT
 
 $XFS_IO_PROG -f -c 'pwrite -S 0x58 0 1m' $dir/foo >> $seqres.full
 chown $qa_user "${dir}/foo"
diff --git a/tests/generic/587 b/tests/generic/587
index ebfeea1d17..183ec1cad7 100755
--- a/tests/generic/587
+++ b/tests/generic/587
@@ -58,6 +58,7 @@ _scratch_mkfs > $seqres.full
 # This test must have user quota enabled
 _qmount_option usrquota
 _qmount >> $seqres.full
+_force_vfs_quota_testing $SCRATCH_MNT
 
 testfile=$SCRATCH_MNT/test-$seq
 touch $testfile
diff --git a/tests/generic/603 b/tests/generic/603
index 08ddcbf2ec..15dc2293a1 100755
--- a/tests/generic/603
+++ b/tests/generic/603
@@ -120,6 +120,7 @@ _scratch_mkfs >$seqres.full 2>&1
 _scratch_enable_pquota
 _qmount_option "usrquota,grpquota,prjquota"
 _qmount
+_force_vfs_quota_testing $SCRATCH_MNT
 _require_prjquota $SCRATCH_DEV
 BLOCK_SIZE=$(_get_file_block_size $SCRATCH_MNT)
 rm -rf $SCRATCH_MNT/t
diff --git a/tests/generic/691 b/tests/generic/691
index 6432834f9f..3d43ea5b04 100755
--- a/tests/generic/691
+++ b/tests/generic/691
@@ -39,6 +39,7 @@ _scratch_mkfs >$seqres.full 2>&1
 _scratch_enable_pquota
 _qmount_option "prjquota"
 _qmount
+_force_vfs_quota_testing $SCRATCH_MNT
 _require_prjquota $SCRATCH_DEV
 
 filter_quota()
@@ -66,6 +67,7 @@ exercise()
 		_scratch_enable_pquota
 	fi
 	_qmount
+	_force_vfs_quota_testing $SCRATCH_MNT
 	if [ "$type" = "P" ];then
 		_create_project_quota $SCRATCH_MNT/t 100 $qa_user
 		file=$SCRATCH_MNT/t/testfile
diff --git a/tests/generic/710 b/tests/generic/710
index c7fca05d4c..2f793e03fb 100755
--- a/tests/generic/710
+++ b/tests/generic/710
@@ -33,7 +33,9 @@ $XFS_IO_PROG -f -c 'pwrite -S 0x59 0 64k -b 64k' -c 'truncate 256k' $SCRATCH_MNT
 chown nobody $SCRATCH_MNT/b
 
 # Set up a quota limit
-$XFS_QUOTA_PROG -x -c "limit -u bhard=70k nobody" $SCRATCH_MNT
+_scratch_supports_rtquota && \
+	extra_limits="rtbhard=70k"
+$XFS_QUOTA_PROG -x -c "limit -u bhard=70k $extra_limits nobody" $SCRATCH_MNT
 
 echo before swapext >> $seqres.full
 $XFS_QUOTA_PROG -x -c 'report -a' $SCRATCH_MNT >> $seqres.full
diff --git a/tests/xfs/050 b/tests/xfs/050
index 10294e3f6d..03cc534bc7 100755
--- a/tests/xfs/050
+++ b/tests/xfs/050
@@ -35,6 +35,7 @@ _scratch_mkfs >/dev/null 2>&1
 orig_mntopts="$MOUNT_OPTIONS"
 _qmount_option "uquota"
 _scratch_mount
+_force_vfs_quota_testing $SCRATCH_MNT   # golden output encodes block usage
 bsize=$(_get_file_block_size $SCRATCH_MNT)
 # needs quota enabled to compute the number of metadata dir files
 HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
@@ -77,6 +78,7 @@ _exercise()
 	. $tmp.mkfs
 
 	_qmount
+	_force_vfs_quota_testing $SCRATCH_MNT   # golden output encodes block usage
 
 	# Figure out whether we're doing large allocations
 	# (bail out if they're so large they stuff the test up)
diff --git a/tests/xfs/106 b/tests/xfs/106
index 388873bdee..9f00b3adb5 100755
--- a/tests/xfs/106
+++ b/tests/xfs/106
@@ -197,6 +197,7 @@ test_xfs_quota()
 {
 	_qmount_option $1
 	_qmount
+	_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 	if [ $type == "p" ]; then
 		_require_prjquota $SCRATCH_DEV
diff --git a/tests/xfs/108 b/tests/xfs/108
index 8593edbdd2..7f2578aa28 100755
--- a/tests/xfs/108
+++ b/tests/xfs/108
@@ -65,6 +65,7 @@ test_accounting()
 export MOUNT_OPTIONS="-opquota"
 _scratch_mkfs_xfs >> $seqres.full
 _qmount
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 _require_prjquota $SCRATCH_DEV
 
 # real QA test starts here
@@ -73,6 +74,7 @@ _scratch_unmount 2>/dev/null
 _scratch_mkfs_xfs | _filter_mkfs 2>$tmp.mkfs
 cat $tmp.mkfs >>$seqres.full
 _scratch_mount
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 uid=1
 gid=2
diff --git a/tests/xfs/152 b/tests/xfs/152
index 325a05c141..f9a0bee12f 100755
--- a/tests/xfs/152
+++ b/tests/xfs/152
@@ -242,6 +242,7 @@ qmount_idmapped()
 {
 	wipe_mounts
 	_try_scratch_mount || _fail "qmount failed"
+	_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 	mkdir -p "${SCRATCH_MNT}/unmapped"
 	mkdir -p "${SCRATCH_MNT}/idmapped"
diff --git a/tests/xfs/153 b/tests/xfs/153
index 9def579bba..304e0f11cc 100755
--- a/tests/xfs/153
+++ b/tests/xfs/153
@@ -40,6 +40,7 @@ _scratch_mkfs >/dev/null 2>&1
 orig_mntopts="$MOUNT_OPTIONS"
 _qmount_option "uquota"
 _scratch_mount
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 bsize=$(_get_file_block_size $SCRATCH_MNT)
 # needs quota enabled to compute the number of metadata dir files
 HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
@@ -82,6 +83,7 @@ run_tests()
 	. $tmp.mkfs
 
 	_qmount
+	_force_vfs_quota_testing $SCRATCH_MNT   # golden output encodes block usage
 
 	# Figure out whether we're doing large allocations
 	# (bail out if they're so large they stuff the test up)
diff --git a/tests/xfs/161 b/tests/xfs/161
index 486fa6ca0e..57cfe5b003 100755
--- a/tests/xfs/161
+++ b/tests/xfs/161
@@ -38,6 +38,8 @@ _qmount_option "usrquota"
 _scratch_xfs_db -c 'version' -c 'sb 0' -c 'p' >> $seqres.full
 _scratch_mount >> $seqres.full
 
+_xfs_force_bdev data $SCRATCH_MNT
+
 # Force the block counters for uid 1 and 2 above zero
 _pwrite_byte 0x61 0 64k $SCRATCH_MNT/a >> $seqres.full
 _pwrite_byte 0x61 0 64k $SCRATCH_MNT/b >> $seqres.full
diff --git a/tests/xfs/213 b/tests/xfs/213
index e184962452..7f3b1b4f4a 100755
--- a/tests/xfs/213
+++ b/tests/xfs/213
@@ -30,6 +30,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT	# repquota
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/xfs/214 b/tests/xfs/214
index 84ba838f3a..858ac7ea4b 100755
--- a/tests/xfs/214
+++ b/tests/xfs/214
@@ -31,6 +31,7 @@ echo "Format and mount"
 _scratch_mkfs > $seqres.full 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT	# repquota
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
 
diff --git a/tests/xfs/220 b/tests/xfs/220
index 88eedf5161..77008bf80b 100755
--- a/tests/xfs/220
+++ b/tests/xfs/220
@@ -39,6 +39,7 @@ _scratch_mkfs_xfs >/dev/null 2>&1
 
 # mount  with quotas enabled
 _scratch_mount -o uquota
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 # turn off quota
 $XFS_QUOTA_PROG -x -c off $SCRATCH_DEV
@@ -51,6 +52,7 @@ _scratch_mkfs_xfs >/dev/null 2>&1
 
 # mount  with quotas enabled
 _scratch_mount -o uquota
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 # turn off quota accounting...
 $XFS_QUOTA_PROG -x -c off $SCRATCH_DEV
diff --git a/tests/xfs/299 b/tests/xfs/299
index 49a6527255..84fb76120d 100755
--- a/tests/xfs/299
+++ b/tests/xfs/299
@@ -157,6 +157,7 @@ projid_file="$tmp.projid"
 echo "*** user, group, and project"
 _qmount_option "uquota,gquota,pquota"
 _qmount
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 
 bsize=$(_get_file_block_size $SCRATCH_MNT)
 HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
@@ -184,6 +185,7 @@ cat $tmp.mkfs >>$seqres.full
 echo "*** uqnoenforce, gqnoenforce, and pqnoenforce"
 _qmount_option "uqnoenforce,gqnoenforce,pqnoenforce"
 _qmount
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 _exercise uno
 _exercise gno
 _exercise pno
diff --git a/tests/xfs/330 b/tests/xfs/330
index 7ebbffff2a..7c18514923 100755
--- a/tests/xfs/330
+++ b/tests/xfs/330
@@ -38,6 +38,7 @@ echo "Format and mount"
 _scratch_mkfs > "$seqres.full" 2>&1
 export MOUNT_OPTIONS="-o usrquota,grpquota $MOUNT_OPTIONS"
 _scratch_mount >> "$seqres.full" 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT	 # golden output encodes block usage
 HIDDEN_QUOTA_FILES=$(_xfs_calc_hidden_quota_files $SCRATCH_MNT)
 quotacheck -u -g $SCRATCH_MNT 2> /dev/null
 quotaon $SCRATCH_MNT 2> /dev/null
diff --git a/tests/xfs/434 b/tests/xfs/434
index 12d1a0c9da..ef7652f7f3 100755
--- a/tests/xfs/434
+++ b/tests/xfs/434
@@ -47,6 +47,7 @@ _scratch_mount -o noquota >> "$seqres.full" 2>&1
 
 testdir="$SCRATCH_MNT/test-$seq"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 blks=3
 mkdir "$testdir"
 
diff --git a/tests/xfs/435 b/tests/xfs/435
index 44135c7653..c68d7a126a 100755
--- a/tests/xfs/435
+++ b/tests/xfs/435
@@ -38,6 +38,7 @@ _scratch_mount -o quota >> "$seqres.full" 2>&1
 
 testdir="$SCRATCH_MNT/test-$seq"
 blksz=65536
+_require_congruent_file_oplen $SCRATCH_MNT $blksz
 blks=3
 mkdir "$testdir"
 
diff --git a/tests/xfs/441 b/tests/xfs/441
index 82654bf332..04682796b5 100755
--- a/tests/xfs/441
+++ b/tests/xfs/441
@@ -32,6 +32,7 @@ check_quota() {
 echo "Format and mount (noquota)"
 _scratch_mkfs > "$seqres.full" 2>&1
 _scratch_mount "-o noquota" >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT	 # _check_quota_usage uses repquota
 
 echo "Create files"
 _pwrite_byte 0x58 0 1m $SCRATCH_MNT/a >> $seqres.full
diff --git a/tests/xfs/442 b/tests/xfs/442
index b04b1c8349..4a0608a095 100755
--- a/tests/xfs/442
+++ b/tests/xfs/442
@@ -70,6 +70,7 @@ _qmount_option "usrquota,grpquota,prjquota"
 # tests now have separate faster-running regression tests.
 _scratch_mkfs_sized $((1600 * 1048576)) > $seqres.full 2>&1
 _scratch_mount >> $seqres.full 2>&1
+_force_vfs_quota_testing $SCRATCH_MNT	 # _check_quota_usage uses repquota
 
 nr_cpus=$((LOAD_FACTOR * 4))
 nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
diff --git a/tests/xfs/508 b/tests/xfs/508
index 47c04f89de..28d5be484c 100755
--- a/tests/xfs/508
+++ b/tests/xfs/508
@@ -59,6 +59,8 @@ _require_prjquota $SCRATCH_DEV
 mkdir $SCRATCH_MNT/dir
 $QUOTA_CMD -x -c 'project -s test' $SCRATCH_MNT >>$seqres.full 2>&1
 $QUOTA_CMD -x -c 'limit -p bsoft=1m bhard=2m test' $SCRATCH_MNT
+_scratch_supports_rtquota && \
+	$QUOTA_CMD -x -c 'limit -p rtbsoft=1m rtbhard=2m test' $SCRATCH_MNT
 
 # test the Project inheritance bit is a directory only flag, and it's set on
 # directory by default. Expect no complain about "project inheritance flag is
diff --git a/tests/xfs/511 b/tests/xfs/511
index d2550404b0..ce7d40300c 100755
--- a/tests/xfs/511
+++ b/tests/xfs/511
@@ -40,11 +40,19 @@ $XFS_IO_PROG -f -c "pwrite 0 65536" -c sync $SCRATCH_MNT/t/file >>$seqres.full
 quota_cmd="$XFS_QUOTA_PROG -x"
 $quota_cmd -c "project -s -p $SCRATCH_MNT/t 42" $SCRATCH_MNT >/dev/null 2>&1
 $quota_cmd -c 'limit -p isoft=53 bsoft=100m 42' $SCRATCH_MNT
+_scratch_supports_rtquota && \
+	$quota_cmd -c 'limit -p rtbsoft=100m 42' $SCRATCH_MNT
+
+# The golden output for this test was written with the assumption that the file
+# allocation unit divides 64k evenly, so the file block usage would be exactly
+# 64k.  On realtime filesystems this isn't always true (e.g. -rextsize=28k) so
+# we'll accept the space usage being the same as what du reports for the file.
+file_nblocks=$(du -B 1024 $SCRATCH_MNT/t/file | awk '{print $1}')
 
 # The itotal and size should be 53 and 102400(k), as above project quota limit.
 # The isued and used should be 2 and 64(k), as this case takes.
 df -k --output=file,itotal,iused,size,used $SCRATCH_MNT/t | \
-	_filter_scratch | _filter_spaces
+	_filter_scratch | _filter_spaces | sed -e "s|$file_nblocks$|64|"
 
 # success, all done
 status=0
diff --git a/tests/xfs/720 b/tests/xfs/720
index 3242a19b02..e36dc23948 100755
--- a/tests/xfs/720
+++ b/tests/xfs/720
@@ -32,6 +32,11 @@ _scratch_mkfs > "$seqres.full" 2>&1
 _qmount_option usrquota
 _qmount
 
+# This test tries to exceed quota limits by creating an N>2 block bmbt, setting
+# the block limit to 2N, and rebuilding the bmbt.  Hence we must force the
+# files to be created on the data device.
+_xfs_force_bdev data $SCRATCH_MNT
+
 blocksize=$(_get_block_size $SCRATCH_MNT)
 alloc_unit=$(_get_file_block_size $SCRATCH_MNT)
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/3] xfs: regression testing of quota on the realtime device
  2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 1/3] common: enable testing of realtime quota when supported Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 2/3] xfs: fix quota tests to adapt to realtime quota Darrick J. Wong
@ 2023-12-27 14:06   ` Darrick J. Wong
  2 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:06 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

From: Darrick J. Wong <djwong@kernel.org>

Make sure that quota accounting and enforcement work correctly for
realtime volumes on XFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/quota       |   30 +++++++++
 tests/xfs/1858     |  168 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1858.out |   41 +++++++++++++
 3 files changed, 239 insertions(+)
 create mode 100755 tests/xfs/1858
 create mode 100644 tests/xfs/1858.out


diff --git a/common/quota b/common/quota
index 06179eb8c8..da016896ac 100644
--- a/common/quota
+++ b/common/quota
@@ -117,6 +117,36 @@ _require_xfs_quota_acct_enabled()
 	_notrun "$qtype: accounting not enabled on $fsname filesystem."
 }
 
+# Decide if the mounted filesystem supports realtime quotas.
+_require_rtquota()
+{
+	local dev="$1"
+	test -z "$dev" && dev="$TEST_DEV"
+	local rtdev="$2"
+	test -z "$rtdev" && rtdev="$TEST_RTDEV"
+
+	test "$FSTYP" = "xfs" || \
+		_notrun "Realtime quota only supported on xfs"
+
+	[ -n "$XFS_QUOTA_PROG" ] || \
+		_notrun "xfs_quota user tool not installed"
+
+	$here/src/feature -q $dev || \
+		_notrun "Installed kernel does not support XFS quotas"
+
+	test -b "$rtdev" || \
+		_notrun "No realtime device supplied?"
+
+	test "$USE_EXTERNAL" = "yes" || \
+		_notrun "Realtime requires USE_EXTERNAL='yes'"
+
+	$here/src/feature -U $dev || \
+	$here/src/feature -G $dev || \
+	$here/src/feature -P $dev || \
+		_notrun "Mounted rt filesystem does not have quotas enabled"
+
+}
+
 #
 # checks that xfs_quota can operate on foreign (non-xfs) filesystems
 # Skips check on xfs filesystems, old xfs_quota is fine there.
diff --git a/tests/xfs/1858 b/tests/xfs/1858
new file mode 100755
index 0000000000..12c9cb392a
--- /dev/null
+++ b/tests/xfs/1858
@@ -0,0 +1,168 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+#
+# FS QA Test No. 1858
+#
+# Functional testing for realtime quotas.
+
+. ./common/preamble
+_begin_fstest auto quick quota realtime
+
+# Import common functions.
+. ./common/quota
+. ./common/filter
+
+# real QA test starts here
+_supported_fs xfs
+_require_test_program "punch-alternating"
+_require_scratch
+_require_user
+
+echo "Format filesystem" | tee -a $seqres.full
+_scratch_mkfs > $seqres.full
+_qmount_option 'usrquota'
+_qmount
+_require_rtquota $SCRATCH_DEV $SCRATCH_RTDEV
+
+# Make sure all our files are on the rt device
+_xfs_force_bdev realtime $SCRATCH_MNT
+chmod a+rwx $SCRATCH_MNT
+
+# Record rt geometry
+bmbt_blksz=$(_get_block_size $SCRATCH_MNT)
+file_blksz=$(_get_file_block_size $SCRATCH_MNT)
+rextsize=$((file_blksz / bmbt_blksz))
+echo "bmbt_blksz $bmbt_blksz" >> $seqres.full
+echo "file_blksz $file_blksz" >> $seqres.full
+echo "rextsize $rextsize" >> $seqres.full
+
+note() {
+	echo -e "\n$*" | tee -a $seqres.full
+}
+
+# Report on the user's block and rt block usage, soft limit, hard limit, and
+# warning count for rt volumes
+report_rtusage() {
+	local user="$1"
+	local timeout_arg="$2"
+	local print_timeout=0
+
+	test -z "$user" && user=$qa_user
+	test -n "$timeout_arg" && print_timeout=1
+
+	$XFS_QUOTA_PROG -c "quota -u -r -n -N $user" $SCRATCH_MNT | \
+		sed -e 's/ days/_days/g' >> $seqres.full
+
+	$XFS_QUOTA_PROG -c "quota -u -r -n -N $user" $SCRATCH_MNT | \
+		sed -e 's/ days/_days/g' | \
+		awk -v user=$user -v print_timeout=$print_timeout -v file_blksz=$file_blksz \
+			'{printf("%s[real] %d %d %d %d %s\n", user, $2 * 1024 / file_blksz, $3 * 1024 / file_blksz, $4 * 1024 / file_blksz, $5, print_timeout ? $6 : "---");}'
+}
+
+note "Write 128rx to root"
+$XFS_IO_PROG -f -c "pwrite 0 $((128 * file_blksz))" $SCRATCH_MNT/file1 > /dev/null
+chmod a+r $SCRATCH_MNT/file1
+sync
+report_rtusage 0
+
+note "Write 64rx to root, 4444, and 5555."
+$XFS_IO_PROG -f -c "pwrite 0 $((64 * file_blksz))" $SCRATCH_MNT/file3.5555 > /dev/null
+chown 5555 $SCRATCH_MNT/file3.5555
+$XFS_IO_PROG -f -c "pwrite 0 $((64 * file_blksz))" $SCRATCH_MNT/file3.4444 > /dev/null
+chown 4444 $SCRATCH_MNT/file3.4444
+$XFS_IO_PROG -f -c "pwrite 0 $((64 * file_blksz))" $SCRATCH_MNT/file3 > /dev/null
+sync
+report_rtusage 0
+report_rtusage 4444
+report_rtusage 5555
+
+note "Move 64rx from root to 5555"
+chown 5555 $SCRATCH_MNT/file3
+report_rtusage 0
+report_rtusage 4444
+report_rtusage 5555
+
+note "Move 64rx from 5555 to 4444"
+chown 4444 $SCRATCH_MNT/file3
+report_rtusage 0
+report_rtusage 4444
+report_rtusage 5555
+
+note "Set hard limit of 1024rx and check enforcement"
+$XFS_QUOTA_PROG -x -c "limit -u rtbhard=$((1024 * file_blksz)) $qa_user" $SCRATCH_MNT
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite 0 $((2048 * file_blksz))' $SCRATCH_MNT/file2"
+report_rtusage
+
+note "Set soft limit of 512rx and check timelimit enforcement"
+rm -f $SCRATCH_MNT/file2 $SCRATCH_MNT/file2.1
+period=6	# seconds
+$XFS_QUOTA_PROG -x -c "limit -u rtbsoft=$((512 * file_blksz)) rtbhard=0 $qa_user" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c "timer -u -r -d $period" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c 'state -u' $SCRATCH_MNT >> $seqres.full
+
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite 0 $((512 * file_blksz))' $SCRATCH_MNT/file2" > /dev/null
+report_rtusage
+
+overflow=$(date +%s)
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite -b $file_blksz 0 $file_blksz' $SCRATCH_MNT/file2.1" > /dev/null
+report_rtusage
+sleep $((period / 2))
+echo "Try again after $((period / 2))s"
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite -b $file_blksz $file_blksz $file_blksz' $SCRATCH_MNT/file2.1" > /dev/null
+report_rtusage
+sleep $period
+echo "Try again after another ${period}s"
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite -b $file_blksz $((2 * file_blksz)) $file_blksz' $SCRATCH_MNT/file2.1" > /dev/null
+report_rtusage
+
+note "Extend time limits and warnings"
+rm -f $SCRATCH_MNT/file2 $SCRATCH_MNT/file2.1
+$XFS_QUOTA_PROG -x -c "limit -u rtbsoft=$((512 * file_blksz)) rtbhard=0 $qa_user" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c "timer -u -r -d 49h" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c 'state -u' $SCRATCH_MNT >> $seqres.full
+
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite 0 $((512 * file_blksz))' $SCRATCH_MNT/file2" > /dev/null
+report_rtusage $qa_user want_timeout
+
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite -b $file_blksz 0 $file_blksz' $SCRATCH_MNT/file2.1" > /dev/null
+report_rtusage $qa_user want_timeout
+
+$XFS_QUOTA_PROG -x -c "timer -u -r 73h $qa_user" $SCRATCH_MNT
+
+su $qa_user -c "$XFS_IO_PROG -f -c 'pwrite -b $file_blksz $file_blksz $file_blksz' $SCRATCH_MNT/file2.1" > /dev/null
+report_rtusage $qa_user want_timeout
+
+note "Test quota applied to bmbt"
+
+# Testing quota enforcement for bmbt shape changes is tricky.  The block
+# reservation will be for enough blocks to handle the maximal btree split.
+# This is (approximately) 9 blocks no matter the size of the existing extent
+# map structure, so we set the hard limit to one more than this quantity.
+#
+# However, that means that we need to make a file of at least twice that size
+# to ensure that we create enough extent records even in the rextsize==1 case
+# where punching doesn't just create unwritten records.
+#
+# Unfortunately, it's very difficult to predict when exactly the EDQUOT will
+# come down, so we just look for the error message.
+extent_records=$(( (25 * bmbt_blksz) / 16))
+echo "extent_records $extent_records" >> $seqres.full
+
+rm -f $SCRATCH_MNT/file2
+$XFS_QUOTA_PROG -x -c "limit -u rtbsoft=0 rtbhard=0 $qa_user" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c "limit -u bhard=$((bmbt_blksz * 10)) bsoft=0 $qa_user" $SCRATCH_MNT
+$XFS_QUOTA_PROG -x -c 'state -u' $SCRATCH_MNT >> $seqres.full
+$XFS_IO_PROG -f -c "pwrite -S 0x58 -b 64m 0 $((extent_records * file_blksz))" $SCRATCH_MNT/file2 > /dev/null
+sync
+chown $qa_user $SCRATCH_MNT/file2
+$here/src/punch-alternating $SCRATCH_MNT/file2 2>&1 | _filter_scratch
+
+$XFS_QUOTA_PROG -c "quota -u -r -n -N $qa_user" -c "quota -u -b -n -N $qa_user" $SCRATCH_MNT >> $seqres.full
+$XFS_IO_PROG -c "bmap -e -l -p -v" $SCRATCH_MNT/file2 >> $seqres.full
+
+# success, all done
+$XFS_QUOTA_PROG -x -c 'report -a -u' -c 'report -a -u -r' $SCRATCH_MNT >> $seqres.full
+ls -latr $SCRATCH_MNT >> $seqres.full
+status=0
+exit
diff --git a/tests/xfs/1858.out b/tests/xfs/1858.out
new file mode 100644
index 0000000000..1edee3f422
--- /dev/null
+++ b/tests/xfs/1858.out
@@ -0,0 +1,41 @@
+QA output created by 1858
+Format filesystem
+
+Write 128rx to root
+0[real] 128 0 0 0 ---
+
+Write 64rx to root, 4444, and 5555.
+0[real] 192 0 0 0 ---
+4444[real] 64 0 0 0 ---
+5555[real] 64 0 0 0 ---
+
+Move 64rx from root to 5555
+0[real] 128 0 0 0 ---
+4444[real] 64 0 0 0 ---
+5555[real] 128 0 0 0 ---
+
+Move 64rx from 5555 to 4444
+0[real] 128 0 0 0 ---
+4444[real] 128 0 0 0 ---
+5555[real] 64 0 0 0 ---
+
+Set hard limit of 1024rx and check enforcement
+pwrite: Disk quota exceeded
+fsgqa[real] 1024 0 1024 0 ---
+
+Set soft limit of 512rx and check timelimit enforcement
+fsgqa[real] 512 512 0 0 ---
+fsgqa[real] 513 512 0 0 ---
+Try again after 3s
+fsgqa[real] 514 512 0 0 ---
+Try again after another 6s
+pwrite: Disk quota exceeded
+fsgqa[real] 514 512 0 0 ---
+
+Extend time limits and warnings
+fsgqa[real] 512 512 0 0 [--------]
+fsgqa[real] 513 512 0 0 [2_days]
+fsgqa[real] 514 512 0 0 [3_days]
+
+Test quota applied to bmbt
+SCRATCH_MNT/file2: Disk quota exceeded


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] design: document the changes required to handle metadata directories
  2023-12-31 20:03 ` [PATCHSET v2.0 1/4] xfs-documentation: document metadata directories Darrick J. Wong
@ 2023-12-27 14:08   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:08 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Document the ondisk format changes for metadata directories.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 .../allocation_groups.asciidoc                     |   23 +++
 .../internal_inodes.asciidoc                       |  142 ++++++++++++++++++++
 .../XFS_Filesystem_Structure/ondisk_inode.asciidoc |    5 +
 3 files changed, 165 insertions(+), 5 deletions(-)


diff --git a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
index 7b128838..c91a06bf 100644
--- a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
+++ b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
@@ -142,10 +142,13 @@ start of the first possible inode chunk in AG 0.  This is 128 when using a 4KB
 block size.
 
 *sb_rbmino*::
-Bitmap inode for real-time extents.
+Bitmap inode for real-time extents if the +XFS_SB_FEAT_INCOMPAT_METADIR+
+feature is not enabled.  If the metadir feature is enabled, this field points
+to the inode for the root of the metadata directory tree.
 
 *sb_rsumino*::
-Summary inode for real-time bitmap.
+Summary inode for real-time bitmap if the +XFS_SB_FEAT_INCOMPAT_METADIR+
+feature is not enabled.
 
 *sb_rextsize*::
 Realtime extent size in blocks.
@@ -262,12 +265,16 @@ maintained in the first superblock.
 *sb_uquotino*::
 Inode for user quotas. This and the following two quota fields only apply if
 +XFS_SB_VERSION_QUOTABIT+ flag is set in +sb_versionnum+. Refer to
-xref:Quota_Inodes[quota inodes] for more information.
+xref:Quota_Inodes[quota inodes] for more information.  If the
++XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, the user quota file is found
+through the metadata directory tree and this field must be zero.
 
 *sb_gquotino*::
 Inode for group or project quotas. Group and project quotas cannot be used at
 the same time on v4 filesystems.  On a v5 filesystem, this inode always stores
-group quota information.
+group quota information.  If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is
+enabled, the group quota file is found through the metadata directory tree and
+this field must be zero.
 
 *sb_qflags*::
 Quota flags. It can be a combination of the following flags:
@@ -458,6 +465,10 @@ xfs_repair before it can be mounted.
 Large file fork extent counts.  This greatly expands the maximum number of
 space mappings allowed in data and extended attribute file forks.
 
+| +XFS_SB_FEAT_INCOMPAT_METADIR+ |
+Metadata directory tree.  See the section about the xref:Metadata_Directories[
+metadata directory tree] for more information.
+
 |=====
 
 *sb_features_log_incompat*::
@@ -488,7 +499,9 @@ Sparse inode alignment, in fsblocks.  Each chunk of inodes referenced by a
 sparse inode B+tree record must be aligned to this block granularity.
 
 *sb_pquotino*::
-Project quota inode.
+Project quota inode.  If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled,
+the project quota file is found through the metadata directory tree and this
+field must be zero.
 
 *sb_lsn*::
 Log sequence number of the last superblock update.
diff --git a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
index 84e4cb96..42020a5f 100644
--- a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
+++ b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
@@ -5,6 +5,148 @@ XFS allocates several inodes when a filesystem is created. These are internal
 and not accessible from the standard directory structure. These inodes are only
 accessible from the superblock.
 
+[[Metadata_Directories]]
+== Metadata Directory Tree
+
+If the +XFS_SB_FEAT_INCOMPAT_METADIR+ feature is enabled, the +sb_rbmino+ field
+in the superblock points to the root of a directory tree containing metadata
+files.  This directory tree is completely internal to the filesystem and should
+not be exposed to user programs.
+
+Certain types of filesystem metadata can be stored in regular (but hidden)
+files.  Prior to the metadata directory feature, the superblock contained
+pointers to these files.
+
+When this feature is enabled, metadata files should be found by walking the
+metadata directory tree.  The superblock fields that formerly pointed to (some)
+of those inodes have been deallocated and may be reused by future features.
+
+.Metadata Directory Paths
+[options="header"]
+|=====
+| Metadata File                                  | Location
+| User xref:Quota_Inodes[Quotas]                 | /quota/user
+| Group Quotas                                   | /quota/group
+| Project Quotas                                 | /quota/project
+| xref:Real-Time_Bitmap_Inode[Realtime Bitmap]   | /realtime/bitmap
+| xref:Real-Time_Summary_Inode[Realtime Summary] | /realtime/summary
+|=====
+
+Metadata files are flagged by the +XFS_DIFLAG2_METADATA+ flag in the
++di_flags2+ field.  Metadata files must have their user, group, and project IDs
+set to zero, and must be chmod 0000.
+
+=== Metadata Directory Example
+
+This example shows a metadta directory from a freshly formatted root
+filesystem:
+
+----
+xfs_db> sb 0
+xfs_db> p
+magicnum = 0x58465342
+blocksize = 4096
+dblocks = 2579968
+rblocks = 0
+rextents = 0
+uuid = 9b535856-7d4c-4c11-a059-42199f602b03
+logstart = 2097159
+rootino = 128
+metadirino = 129
+rextsize = 1
+agblocks = 644992
+agcount = 4
+rbmblocks = 0
+logblocks = 3693
+...
+----
+
+Notice how the listing no longer includes the realtime bitmap or summary
+inodes, but does include the root of the metadata directory tree
+(+metadirino+).
+
+----
+xfs_db> path -m /
+xfs_db> ls
+129                directory      0x0000002e   1 .
+129                directory      0x0000172e   2 ..
+130                directory      0xce7fe1eb   8 realtime
+133                directory      0x1ebbfa66   5 quota
+----
+
+Here we use the +path+ and +ls+ commands to display the root directory of
+the metadata directory.  We can navigate the directory the old way, too:
+
+----
+xfs_db> p
+core.magic = 0x494e
+core.mode = 040000
+core.version = 3
+core.format = 1 (local)
+core.nlinkv2 = 4
+core.onlink = 0
+core.projid_lo = 0
+core.projid_hi = 0
+core.uid = 0
+core.gid = 0
+...
+v3.inumber = 129
+v3.uuid = 9b535856-7d4c-4c11-a059-42199f602b03
+v3.reflink = 0
+v3.cowextsz = 0
+v3.dax = 0
+v3.bigtime = 1
+v3.metadata = 1
+u3.sfdir3.hdr.count = 2
+u3.sfdir3.hdr.i8count = 0
+u3.sfdir3.hdr.parent.i4 = 129
+u3.sfdir3.list[0].namelen = 8
+u3.sfdir3.list[0].offset = 0x60
+u3.sfdir3.list[0].name = "realtime"
+u3.sfdir3.list[0].inumber.i4 = 130
+u3.sfdir3.list[0].filetype = 2
+u3.sfdir3.list[1].namelen = 5
+u3.sfdir3.list[1].offset = 0x78
+u3.sfdir3.list[1].name = "quota"
+u3.sfdir3.list[1].inumber.i4 = 133
+u3.sfdir3.list[1].filetype = 2
+----
+
+The root of the metadata directory is a short format directory, and looks just
+like any other directory.  The only difference is that the metadata flag is
+set, and the directory can only be viewed in the XFS debugger.
+
+----
+xfs_db> path -m /quota/user
+xfs_db> dblock 0
+xfs_db> p
+diskdq.magic = 0x4451
+diskdq.version = 0x1
+diskdq.type = 0x1
+diskdq.id = 0
+diskdq.blk_hardlimit = 0
+diskdq.blk_softlimit = 0
+diskdq.ino_hardlimit = 0
+diskdq.ino_softlimit = 0
+diskdq.bcount = 0
+diskdq.icount = 6
+diskdq.itimer = 0
+diskdq.btimer = 0
+diskdq.iwarns = 0
+diskdq.bwarns = 0
+diskdq.rtb_hardlimit = 0
+diskdq.rtb_softlimit = 0
+diskdq.rtbcount = 0
+diskdq.rtbtimer = 0
+diskdq.rtbwarns = 0
+crc = 0x7ada5c8b (correct)
+lsn = 0
+uuid = 9b535856-7d4c-4c11-a059-42199f602b03
+----
+
+Observe that we can use the xfs_db +path+ command to navigate the metadata
+directory tree to the user quota file and display its contents.
+
 [[Quota_Inodes]]
 == Quota Inodes
 
diff --git a/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc b/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
index 34c06487..964a9611 100644
--- a/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
+++ b/design/XFS_Filesystem_Structure/ondisk_inode.asciidoc
@@ -383,6 +383,11 @@ will be copied to all newly created files and directories.
 Files with this flag set may have up to (2^48^ - 1) extents mapped to the data
 fork and up to (2^32^ - 1) extents mapped to the attribute fork.  This flag
 requires the +XFS_SB_FEAT_INCOMPAT_NREXT64+ feature to be enabled.
+| +XFS_DIFLAG2_METADATA+	|
+This file contains filesystem metadata.  This feature requires the
++XFS_SB_FEAT_INCOMPAT_METADIR+ feature to be enabled.  Metadata files must have
+their user, group, and project IDs set to zero, and the permissions bits of the
+mode must be zero.  Only directories and regular files can have this flag set.
 |=====
 
 *di_cowextsize*::


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] design: move discussion of realtime volumes to a separate section
  2023-12-31 20:03 ` [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section Darrick J. Wong
@ 2023-12-27 14:08   ` Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 2/2] design: document realtime groups Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:08 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for documenting the realtime modernization project, move
the discussions of the realtime-realted ondisk metadata to a separate
file.  Since realtime reverse mapping btrees haven't been added to the
filesystem yet, stop including them in the final output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 .../allocation_groups.asciidoc                     |   20 --------
 .../internal_inodes.asciidoc                       |   36 +-------------
 design/XFS_Filesystem_Structure/realtime.asciidoc  |   50 ++++++++++++++++++++
 .../xfs_filesystem_structure.asciidoc              |    2 +
 4 files changed, 54 insertions(+), 54 deletions(-)
 create mode 100644 design/XFS_Filesystem_Structure/realtime.asciidoc


diff --git a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
index bd825207..08151c7c 100644
--- a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
+++ b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
@@ -1330,23 +1330,3 @@ core.magic = 0x494e
 
 The chunk record also indicates that this chunk has 32 inodes, and that the
 missing inodes are also ``free''.
-
-[[Real-time_Devices]]
-== Real-time Devices
-
-The performance of the standard XFS allocator varies depending on the internal
-state of the various metadata indices enabled on the filesystem.  For
-applications which need to minimize the jitter of allocation latency, XFS
-supports the notion of a ``real-time device''.  This is a special device
-separate from the regular filesystem where extent allocations are tracked with
-a bitmap and free space is indexed with a two-dimensional array.  If an inode
-is flagged with +XFS_DIFLAG_REALTIME+, its data will live on the real time
-device.  The metadata for real time devices is discussed in the section about
-xref:Real-time_Inodes[real time inodes].
-
-By placing the real time device (and the journal) on separate high-performance
-storage devices, it is possible to reduce most of the unpredictability in I/O
-response times that come from metadata operations.
-
-None of the XFS per-AG B+trees are involved with real time files.  It is not
-possible for real time files to share data blocks.
diff --git a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
index 42020a5f..e07b5f19 100644
--- a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
+++ b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
@@ -316,41 +316,9 @@ Log sequence number of the last DQ block write.
 *dd_crc*::
 Checksum of the DQ block.
 
-
 [[Real-time_Inodes]]
 == Real-time Inodes
 
 There are two inodes allocated to managing the real-time device's space, the
-Bitmap Inode and the Summary Inode.
-
-[[Real-Time_Bitmap_Inode]]
-=== Real-Time Bitmap Inode
-
-The real time bitmap inode, +sb_rbmino+, tracks the used/free space in the
-real-time device using an old-style bitmap. One bit is allocated per real-time
-extent. The size of an extent is specified by the superblock's +sb_rextsize+
-value.
-
-The number of blocks used by the bitmap inode is equal to the number of
-real-time extents (+sb_rextents+) divided by the block size (+sb_blocksize+)
-and bits per byte. This value is stored in +sb_rbmblocks+. The nblocks and
-extent array for the inode should match this.  Each real time block gets its
-own bit in the bitmap.
-
-[[Real-Time_Summary_Inode]]
-=== Real-Time Summary Inode
-
-The real time summary inode, +sb_rsumino+, tracks the used and free space
-accounting information for the real-time device.  This file indexes the
-approximate location of each free extent on the real-time device first by
-log2(extent size) and then by the real-time bitmap block number.  The size of
-the summary inode file is equal to +sb_rbmblocks+ × log2(realtime device size)
-× sizeof(+xfs_suminfo_t+).  The entry for a given log2(extent size) and
-rtbitmap block number is 0 if there is no free extents of that size at that
-rtbitmap location, and positive if there are any.
-
-This data structure is not particularly space efficient, however it is a very
-fast way to provide the same data as the two free space B+trees for regular
-files since the space is preallocated and metadata maintenance is minimal.
-
-include::rtrmapbt.asciidoc[]
+xref:Real-Time_Bitmap_Inode[Bitmap Inode] and the
+xref:Real-Time_Summary_Inode[Summary Inode].
diff --git a/design/XFS_Filesystem_Structure/realtime.asciidoc b/design/XFS_Filesystem_Structure/realtime.asciidoc
new file mode 100644
index 00000000..11426e8f
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/realtime.asciidoc
@@ -0,0 +1,50 @@
+[[Real-time_Devices]]
+= Real-time Devices
+
+The performance of the standard XFS allocator varies depending on the internal
+state of the various metadata indices enabled on the filesystem.  For
+applications which need to minimize the jitter of allocation latency, XFS
+supports the notion of a ``real-time device''.  This is a special device
+separate from the regular filesystem where extent allocations are tracked with
+a bitmap and free space is indexed with a two-dimensional array.  If an inode
+is flagged with +XFS_DIFLAG_REALTIME+, its data will live on the real time
+device.
+
+By placing the real time device (and the journal) on separate high-performance
+storage devices, it is possible to reduce most of the unpredictability in I/O
+response times that come from metadata operations.
+
+None of the XFS per-AG B+trees are involved with real time files.  It is not
+possible for real time files to share data blocks.
+
+[[Real-Time_Bitmap_Inode]]
+== Free Space Bitmap Inode
+
+The real time bitmap inode, +sb_rbmino+, tracks the used/free space in the
+real-time device using an old-style bitmap. One bit is allocated per real-time
+extent. The size of an extent is specified by the superblock's +sb_rextsize+
+value.
+
+The number of blocks used by the bitmap inode is equal to the number of
+real-time extents (+sb_rextents+) divided by the block size (+sb_blocksize+)
+and bits per byte. This value is stored in +sb_rbmblocks+. The nblocks and
+extent array for the inode should match this.  Each real time block gets its
+own bit in the bitmap.
+
+[[Real-Time_Summary_Inode]]
+== Free Space Summary Inode
+
+The real time summary inode, +sb_rsumino+, tracks the used and free space
+accounting information for the real-time device.  This file indexes the
+approximate location of each free extent on the real-time device first by
+log2(extent size) and then by the real-time bitmap block number.  The size of
+the summary inode file is equal to +sb_rbmblocks+ × log2(realtime device size)
+× sizeof(+xfs_suminfo_t+).  The entry for a given log2(extent size) and
+rtbitmap block number is 0 if there is no free extents of that size at that
+rtbitmap location, and positive if there are any.
+
+This data structure is not particularly space efficient, however it is a very
+fast way to provide the same data as the two free space B+trees for regular
+files since the space is preallocated and metadata maintenance is minimal.
+
+include::rtrmapbt.asciidoc[]
diff --git a/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc b/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
index a95a5806..04cc8e13 100644
--- a/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
+++ b/design/XFS_Filesystem_Structure/xfs_filesystem_structure.asciidoc
@@ -84,6 +84,8 @@ include::journaling_log.asciidoc[]
 
 include::internal_inodes.asciidoc[]
 
+include::realtime.asciidoc[]
+
 :leveloffset: 0
 
 Dynamically Allocated Structures


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] design: document realtime groups
  2023-12-31 20:03 ` [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 1/2] design: move discussion of realtime volumes to a separate section Darrick J. Wong
@ 2023-12-27 14:08   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:08 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Document the ondisk changes for realtime allocation groups.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 .../allocation_groups.asciidoc                     |   19 ++
 design/XFS_Filesystem_Structure/magic.asciidoc     |    1 
 design/XFS_Filesystem_Structure/realtime.asciidoc  |  182 ++++++++++++++++++++
 3 files changed, 202 insertions(+)


diff --git a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
index 08151c7c..ab8868d0 100644
--- a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
+++ b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
@@ -55,6 +55,13 @@ struct xfs_sb
 	xfs_fsblock_t		sb_logstart;
 	xfs_ino_t		sb_rootino;
 	xfs_ino_t		sb_rbmino;
+	union {
+		xfs_ino_t	sb_rsumino;
+		struct {
+			xfs_rgnumber_t	sb_rgcount;
+			xfs_rgblock_t	sb_rgblocks;
+		};
+	};
 	xfs_ino_t		sb_rsumino;
 	xfs_agblock_t		sb_rextsize;
 	xfs_agblock_t		sb_agblocks;
@@ -150,6 +157,14 @@ to the inode for the root of the metadata directory tree.
 Summary inode for real-time bitmap if the +XFS_SB_FEAT_INCOMPAT_METADIR+
 feature is not enabled.
 
+*sb_rgcount*::
+Count of realtime groups in the filesystem, if the
++XFS_SB_FEAT_INCOMPAT_RTGROUPS+ feature is enabled.
+
+*sb_rgblocks*::
+Maximum number of filesystem blocks that can be contained within a realtime
+group, if the +XFS_SB_FEAT_INCOMPAT_RTGROUPS+ feature is enabled.
+
 *sb_rextsize*::
 Realtime extent size in blocks.
 
@@ -473,6 +488,10 @@ metadata directory tree] for more information.
 Directory parent pointers.  See the section about xref:Parent_Pointers[parent
 pointers] for more information.
 
+| +XFS_SB_FEAT_INCOMPAT_RTGROUPS+ |
+Realtime allocation groups.  See the section about the xref:Realtime_Groups[
+realtime groups] for more information.
+
 |=====
 
 *sb_features_log_incompat*::
diff --git a/design/XFS_Filesystem_Structure/magic.asciidoc b/design/XFS_Filesystem_Structure/magic.asciidoc
index 613e50c0..c83f59a2 100644
--- a/design/XFS_Filesystem_Structure/magic.asciidoc
+++ b/design/XFS_Filesystem_Structure/magic.asciidoc
@@ -48,6 +48,7 @@ relevant chapters.  Magic numbers tend to have consistent locations:
 | +XFS_RTRMAP_CRC_MAGIC+	| 0x4d415052	| MAPR	| xref:Real_time_Reverse_Mapping_Btree[Real-Time Reverse Mapping B+tree], v5 only
 | +XFS_REFC_CRC_MAGIC+		| 0x52334643	| R3FC	| xref:Reference_Count_Btree[Reference Count B+tree], v5 only
 | +XFS_MD_MAGIC+		| 0x5846534d	| XFSM	| xref:Metadata_Dumps[Metadata Dumps]
+| +XFS_RTSB_MAGIC+		| 0x58524750	| XRGP	| xref:Realtime_Groups[Realtime Groups]
 |=====
 
 The magic numbers for log items are at offset zero in each log item, but items
diff --git a/design/XFS_Filesystem_Structure/realtime.asciidoc b/design/XFS_Filesystem_Structure/realtime.asciidoc
index 11426e8f..f5fdb4e5 100644
--- a/design/XFS_Filesystem_Structure/realtime.asciidoc
+++ b/design/XFS_Filesystem_Structure/realtime.asciidoc
@@ -47,4 +47,186 @@ This data structure is not particularly space efficient, however it is a very
 fast way to provide the same data as the two free space B+trees for regular
 files since the space is preallocated and metadata maintenance is minimal.
 
+[[Realtime_Groups]]
+== Realtime Groups
+
+To reduce metadata contention for space allocation and remapping activities
+being applied to realtime files, the realtime volume can be split into
+allocation groups, just like the data volume.  The free space information is
+still contained in a single file that applies to the entire volume.
+
+Each realtime allocation group can contain up to (2^31^ - 1) filesystem blocks,
+regardless of the underlying realtime extent size.
+
+Each realtime group has the following characteristics:
+
+         * A super block describing overall filesystem info
+         * Free space bitmap
+         * Summary of free space
+
+Each of these structures are expanded upon in the following sections.
+
+[[Realtime_Group_Superblocks]]
+=== Superblocks
+
+The first block of each realtime group contains a superblock.  These fields
+must match their counterparts in the filesystem superblock on the data device.
+
+[source, c]
+----
+struct xfs_rtsb {
+	__be32		rsb_magicnum;
+	__be32		rsb_blocksize;
+	__be64		rsb_rblocks;
+
+	__be64		rsb_rextents;
+	__be64		rsb_lsn;
+
+	__be32		rsb_rgcount;
+	char		rsb_fname[XFSLABEL_MAX];
+
+	uuid_t		rsb_uuid;
+
+	__be32		rsb_rextsize;
+	__be32		rsb_rbmblocks;
+
+	__be32		rsb_rgblocks;
+	__u8		rsb_blocklog;
+	__u8		rsb_sectlog;
+	__u8		rsb_rextslog;
+	__u8		rsb_pad;
+
+	__le32		rsb_crc;
+	__le32		rsb_pad2;
+
+	uuid_t		rsb_meta_uuid;
+
+	/* must be padded to 64 bit alignment */
+};
+----
+
+*rsb_magicnum*::
+Identifies the filesystem. Its value is +XFS_RTSB_MAGIC+ ``XRGP'' (0x58524750).
+
+*rsb_blocksize*::
+The size of a basic unit of space allocation in bytes. Typically, this is 4096
+(4KB) but can range from 512 to 65536 bytes.
+
+*rsb_rblocks*::
+Number blocks in the real-time disk device.
+
+*rsb_rextents*::
+Number of extents on the real-time device.
+
+*rsb_lsn*::
+Log sequence number of the last superblock update.
+
+*rsb_rgcount*::
+Count of realtime groups in the filesystem, if the
++XFS_SB_FEAT_INCOMPAT_RTGROUPS+ feature is enabled.
+
+*rsb_fname[12]*::
+Name for the filesystem. This value can be used in the mount command.
+
+*rsb_uuid*::
+UUID (Universally Unique ID) for the filesystem. Filesystems can be mounted by
+the UUID instead of device name.
+
+*rsb_rextsize*::
+Realtime extent size in blocks.
+
+*rsb_rbmblocks*::
+Number of real-time bitmap blocks.
+
+*rsb_rgblocks*::
+Maximum number of filesystem blocks that can be contained within a realtime
+group, if the +XFS_SB_FEAT_INCOMPAT_RTGROUPS+ feature is enabled.
+
+*rsb_blocklog*::
+log~2~ value of +sb_blocksize+. In other terms, +sb_blocksize = 2^sb_blocklog^+.
+
+*rsb_sectlog*::
+log~2~ value of +sb_sectsize+.
+
+*rsb_rextslog*::
+log~2~ value of +sb_rextents+.
+
+*rsb_pad*::
+Must be zero.
+
+*rsb_crc*::
+Superblock checksum.
+
+*rsb_pad2*::
+Must be zero.
+
+*rsb_meta_uuid*::
+If the +XFS_SB_FEAT_INCOMPAT_META_UUID+ feature is set, then the UUID field in
+all metadata blocks must match this UUID.  If not, the block header UUID field
+must match +sb_uuid+.
+
+==== xfs_db rtgroup Superblock Example
+
+A filesystem is made on a multidisk filesystem with the following command:
+
+----
+# mkfs.xfs -r rtgroups=1,rgcount=4,rtdev=/dev/sdb /dev/sda -f
+meta-data=/dev/sda               isize=512    agcount=4, agsize=1298176 blks
+         =                       sectsz=512   attr=2, projid32bit=1
+         =                       crc=1        finobt=1, sparse=1, rmapbt=1
+         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=1
+         =                       metadir=1
+data     =                       bsize=4096   blocks=5192704, imaxpct=25
+         =                       sunit=0      swidth=0 blks
+naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
+log      =internal log           bsize=4096   blocks=16384, version=2
+         =                       sectsz=512   sunit=0 blks, lazy-count=1
+realtime =/dev/sdb               extsz=4096   blocks=5192704, rtextents=5192704
+         =                       rgcount=4    rgsize=1298176 blks
+----
+
+And in xfs_db, inspecting the realtime group superblock and then the regular
+superblock:
+
+----
+# xfs_db -R /dev/sdb /dev/sda
+xfs_db> rtsb 0
+xfs_db> print
+magicnum = 0x58524750
+blocksize = 4096
+rblocks = 5192704
+rextents = 5192704
+uuid = c52adb8a-48a6-4325-b251-d4dcb30889ea
+rextsize = 1
+rgblocks = 1048576
+rgcount = 5
+rbmblocks = 161
+fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
+blocklog = 12
+sectlog = 9
+rextslog = 22
+crc = 0x36872867 (correct)
+lsn = 0
+meta_uuid = c52adb8a-48a6-4325-b251-d4dcb30889ea
+xfs_db> sb 0
+xfs_db> print magicnum blocksize rblocks rextents uuid rextsize rgblocks \
+rgcount rbmblocks fname blocklog sectlog rextslog crc lsn meta_uuid
+magicnum = 0x58465342
+blocksize = 4096
+rblocks = 5192704
+rextents = 5192704
+uuid = c52adb8a-48a6-4325-b251-d4dcb30889ea
+rextsize = 1
+rgblocks = 1048576
+rgcount = 5
+rbmblocks = 161
+fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
+blocklog = 12
+sectlog = 9
+rextslog = 22
+crc = 0xfbd2b2d2 (correct)
+lsn = 0
+meta_uuid = c52adb8a-48a6-4325-b251-d4dcb30889ea
+----
+
 include::rtrmapbt.asciidoc[]


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] design: document the revisions to the realtime rmap formats
  2023-12-31 20:04 ` [PATCHSET v2.0 3/4] xfs-documentation: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-27 14:08   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:08 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the realtime rmap ondisk format documentation, since it's changed
a bit since the introduction of the metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 .../allocation_groups.asciidoc                     |    8 -
 .../internal_inodes.asciidoc                       |    4 
 .../journaling_log.asciidoc                        |    9 +
 design/XFS_Filesystem_Structure/realtime.asciidoc  |    5 
 design/XFS_Filesystem_Structure/rtrmapbt.asciidoc  |  216 ++++++++------------
 5 files changed, 105 insertions(+), 137 deletions(-)


diff --git a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
index ab8868d0..51ed106f 100644
--- a/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
+++ b/design/XFS_Filesystem_Structure/allocation_groups.asciidoc
@@ -112,7 +112,6 @@ struct xfs_sb
 	xfs_ino_t		sb_pquotino;
 	xfs_lsn_t		sb_lsn;
 	uuid_t			sb_meta_uuid;
-	xfs_ino_t		sb_rrmapino;
 };
 ----
 *sb_magicnum*::
@@ -534,13 +533,6 @@ If the +XFS_SB_FEAT_INCOMPAT_META_UUID+ feature is set, then the UUID field in
 all metadata blocks must match this UUID.  If not, the block header UUID field
 must match +sb_uuid+.
 
-*sb_rrmapino*::
-If the +XFS_SB_FEAT_RO_COMPAT_RMAPBT+ feature is set and a real-time
-device is present (+sb_rblocks+ > 0), this field points to an inode
-that contains the root to the
-xref:Real_time_Reverse_Mapping_Btree[Real-Time Reverse Mapping B+tree].
-This field is zero otherwise.
-
 === xfs_db Superblock Example
 
 A filesystem is made on a single disk with the following command:
diff --git a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
index e07b5f19..7da0cdf6 100644
--- a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
+++ b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
@@ -30,6 +30,7 @@ of those inodes have been deallocated and may be reused by future features.
 | Project Quotas                                 | /quota/project
 | xref:Real-Time_Bitmap_Inode[Realtime Bitmap]   | /realtime/bitmap
 | xref:Real-Time_Summary_Inode[Realtime Summary] | /realtime/summary
+| xref:Real_time_Reverse_Mapping_Btree[Realtime Reverse Mapping B+tree] | /realtime/*.rmap
 |=====
 
 Metadata files are flagged by the +XFS_DIFLAG2_METADATA+ flag in the
@@ -322,3 +323,6 @@ Checksum of the DQ block.
 There are two inodes allocated to managing the real-time device's space, the
 xref:Real-Time_Bitmap_Inode[Bitmap Inode] and the
 xref:Real-Time_Summary_Inode[Summary Inode].
+
+Each realtime group can allocate one inode to managing a
+xref:Real_time_Reverse_Mapping_Btree[reverse-index of space] usage.
diff --git a/design/XFS_Filesystem_Structure/journaling_log.asciidoc b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
index c91fbb6a..52513b18 100644
--- a/design/XFS_Filesystem_Structure/journaling_log.asciidoc
+++ b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
@@ -332,7 +332,10 @@ typedef struct xfs_extent_64 {
 Start block of this extent.
 
 *ext_len*::
-Length of this extent.
+Length of this extent.  If the highest bit of this field is set
+(+XFS_EFI_REALTIME_EXT+), this EFI applies to blocks on the realtime volume.
+The realtime block range must be converted to a range of realtime extents
+before further processing occurs.
 
 The ``extent freeing intent'' operation comes first; it tells the log that XFS
 wants to free some extents.  This record is crucial for correct log recovery
@@ -464,6 +467,8 @@ reverse mapping operation we want.  The upper three bytes are flag bits.
 | +XFS_RMAP_EXTENT_ATTR_FORK+	| Extent is for the attribute fork.
 | +XFS_RMAP_EXTENT_BMBT_BLOCK+	| Extent is for a block mapping btree block.
 | +XFS_RMAP_EXTENT_UNWRITTEN+	| Extent is unwritten.
+| +XFS_RMAP_EXTENT_REALTIME+	| Extent describes a range of blocks on the
+realtime volume.
 |=====
 
 The ``rmap update intent'' operation comes first; it tells the log that XFS
@@ -478,7 +483,7 @@ struct xfs_rui_log_format {
      __uint16_t                rui_type;
      __uint16_t                rui_size;
      __uint32_t                rui_nextents;
-     __uint64_t                rui_id;	
+     __uint64_t                rui_id;
      struct xfs_map_extent     rui_extents[1];
 };
 ----
diff --git a/design/XFS_Filesystem_Structure/realtime.asciidoc b/design/XFS_Filesystem_Structure/realtime.asciidoc
index f5fdb4e5..77d947ba 100644
--- a/design/XFS_Filesystem_Structure/realtime.asciidoc
+++ b/design/XFS_Filesystem_Structure/realtime.asciidoc
@@ -53,7 +53,9 @@ files since the space is preallocated and metadata maintenance is minimal.
 To reduce metadata contention for space allocation and remapping activities
 being applied to realtime files, the realtime volume can be split into
 allocation groups, just like the data volume.  The free space information is
-still contained in a single file that applies to the entire volume.
+still contained in a single file that applies to the entire volume.  This
+sharding enables code reuse between the data and realtime reverse mapping
+indexes and supports parallelism of reverse mapping and online fsck activities.
 
 Each realtime allocation group can contain up to (2^31^ - 1) filesystem blocks,
 regardless of the underlying realtime extent size.
@@ -63,6 +65,7 @@ Each realtime group has the following characteristics:
          * A super block describing overall filesystem info
          * Free space bitmap
          * Summary of free space
+         * Reverse space mapping
 
 Each of these structures are expanded upon in the following sections.
 
diff --git a/design/XFS_Filesystem_Structure/rtrmapbt.asciidoc b/design/XFS_Filesystem_Structure/rtrmapbt.asciidoc
index 3a109b20..d6c21127 100644
--- a/design/XFS_Filesystem_Structure/rtrmapbt.asciidoc
+++ b/design/XFS_Filesystem_Structure/rtrmapbt.asciidoc
@@ -1,11 +1,8 @@
 [[Real_time_Reverse_Mapping_Btree]]
-=== Real-Time Reverse-Mapping B+tree
-
-[NOTE]
-This data structure is under construction!  Details may change.
+=== Reverse-Mapping B+tree
 
 If the reverse-mapping B+tree and real-time storage device features
-are enabled, the real-time device has its own reverse block-mapping
+are enabled, each real-time group has its own reverse block-mapping
 B+tree.
 
 As mentioned in the chapter about xref:Reconstruction[reconstruction],
@@ -19,10 +16,10 @@ This B+tree is only present if the +XFS_SB_FEAT_RO_COMPAT_RMAPBT+
 feature is enabled and a real time device is present.  The feature
 requires a version 5 filesystem.
 
-The real-time reverse mapping B+tree is rooted in an inode's data
-fork; the inode number is given by the +sb_rrmapino+ field in the
-superblock.  The B+tree blocks themselves are stored in the regular
-filesystem.  The structures used for an inode's B+tree root are:
+The rtgroup reverse mapping B+tree is rooted in an inode's data fork; the inode
+number can be found by resolving the path +/realtime/$rgno.rmap+ in the
+metadata directory tree.  The B+tree blocks themselves are stored on the data
+volume.  The structures used for an inode's B+tree root are:
 
 [source, c]
 ----
@@ -32,52 +29,53 @@ struct xfs_rtrmap_root {
 };
 ----
 
-* On disk, the B+tree node starts with the +xfs_rtrmap_root+ header
-followed by an array of +xfs_rtrmap_key+ values and then an array of
-+xfs_rtrmap_ptr_t+ values. The size of both arrays is specified by the
-header's +bb_numrecs+ value.
+* If the B+tree contains only a single level, the ondisk data fork area begins
+with a +xfs_rtrmap_root+ header followed by an array of +xfs_rmap_rec+ leaf
+records.
 
-* The root node in the inode can only contain up to 10 key/pointer
-pairs for a standard 512 byte inode before a new level of nodes is
-added between the root and the leaves.  +di_forkoff+ should always
-be zero, because there are no extended attributes.
+* Otherwise, the ondisk data fork area begins with the +xfs_rtrmap_root+
+header and is followed first by an array of doubled up +xfs_refcount_key+
+values and then an array of +xfs_rtrmap_ptr_t+ values.  The size of both arrays
+is specified by the header's +bb_numrecs+ value.
 
-Each record in the real-time reverse-mapping B+tree has the following
-structure:
+* The root node in the inode can only contain up to 14 leaf records or 7
+key/pointer pairs for a standard 512 byte inode before a new level of nodes is
+added between the root and the leaves.  +di_forkoff+ should always be zero,
+because there are no extended attributes.
+
+Each record in an rtgroup reverse-mapping B+tree has the same structure as an
+AG reverse mapping btree:
 
 [source, c]
 ----
-struct xfs_rtrmap_rec {
-     __be64                     rm_startblock;
-     __be64                     rm_blockcount;
+struct xfs_rmap_rec {
+     __be32                     rm_startblock;
+     __be32                     rm_blockcount;
      __be64                     rm_owner;
      __be64                     rm_fork:1;
      __be64                     rm_bmbt:1;
      __be64                     rm_unwritten:1;
-     __be64                     rm_unused:7;
-     __be64                     rm_offset:54;
+     __be64                     rm_offset:61;
 };
 ----
 
 *rm_startblock*::
-Real-time device block number of this record.
+rtgroup block number of this record.
 
 *rm_blockcount*::
-The length of this extent, in real-time blocks.
+The length of this extent, in filesystem blocks.
 
 *rm_owner*::
-A 64-bit number describing the owner of this extent.  This must be an
-inode number, because the real-time device is for file data only.
+A 64-bit number describing the owner of this extent.  This must be
++XFS_RMAP_OWN_FS+ for the first extent in the realtime group.  For all other
+records, it must be an inode number, because the real-time volume does not
+store any other metadata.
 
 *rm_fork*::
-If +rm_owner+ describes an inode, this can be 1 if this record is for
-an attribute fork.  This value will always be zero for real-time
-extents.
+This value will always be zero.
 
 *rm_bmbt*::
-If +rm_owner+ describes an inode, this can be 1 to signify that this
-record is for a block map B+tree block.  In this case, +rm_offset+ has
-no meaning.  This value will always be zero for real-time extents.
+This value will always be zero.
 
 *rm_unwritten*::
 A flag indicating that the extent is unwritten.  This corresponds to
@@ -85,7 +83,7 @@ the flag in the xref:Data_Extents[extent record] format which means
 +XFS_EXT_UNWRITTEN+.
 
 *rm_offset*::
-The 54-bit logical file block offset, if +rm_owner+ describes an
+The 61-bit logical file block offset, if +rm_owner+ describes an
 inode.
 
 [NOTE]
@@ -96,30 +94,30 @@ The key has the following structure:
 
 [source, c]
 ----
-struct xfs_rtrmap_key {
-     __be64                     rm_startblock;
+struct xfs_rmap_key {
+     __be32                     rm_startblock;
      __be64                     rm_owner;
      __be64                     rm_fork:1;
      __be64                     rm_bmbt:1;
      __be64                     rm_reserved:1;
-     __be64                     rm_unused:7;
-     __be64                     rm_offset:54;
+     __be64                     rm_offset:61;
 };
 ----
 
-* All block numbers are 64-bit real-time device block numbers.
+* All block numbers in records and keys are 32-bit real-time group block
+numbers.
 
 * The +bb_magic+ value is ``MAPR'' (0x4d415052).
 
-* The +xfs_btree_lblock_t+ header is used for intermediate B+tree node as well
-as the leaves.
+* The +struct xfs_btree_lblock+ header is used for intermediate B+tree node as
+well as the leaves.
 
 * Each pointer is associated with two keys.  The first of these is the
 "low key", which is the key of the smallest record accessible through
 the pointer.  This low key has the same meaning as the key in all
 other btrees.  The second key is the high key, which is the maximum of
 the largest key that can be used to access a given record underneath
-the pointer.  Recall that each record in the real-time reverse mapping
+the pointer.  Recall that each record in the rtgroup reverse mapping
 b+tree describes an interval of physical blocks mapped to an interval
 of logical file block offsets; therefore, it makes sense that a range
 of keys can be used to find to a record.
@@ -130,105 +128,71 @@ This example shows a real-time reverse-mapping B+tree from a freshly
 populated root filesystem:
 
 ----
-xfs_db> sb 0
-xfs_db> addr rrmapino
+xfs_db> path -m /realtime/0.rmap
 xfs_db> p
 core.magic = 0x494e
 core.mode = 0100000
 core.version = 3
-core.format = 5 (rtrmapbt)
+core.format = 5 (rmap)
 ...
-u3.rtrmapbt.level = 3
-u3.rtrmapbt.numrecs = 1
-u3.rtrmapbt.keys[1] = [startblock,owner,offset,attrfork,bmbtblock,startblock_hi,
-		       owner_hi,offset_hi,attrfork_hi,bmbtblock_hi]
-	1:[1,132,1,0,0,1705337,133,54431,0,0]
-u3.rtrmapbt.ptrs[1] = 1:671
+u3.rtrmapbt.level = 1
+u3.rtrmapbt.numrecs = 3
+u3.rtrmapbt.keys[1-3] = [startblock,owner,offset,attrfork,bmbtblock,
+			 startblock_hi,owner_hi,offset_hi,attrfork_hi,
+			 bmbtblock_hi] 
+1:[0,-3,0,0,0,682,10015,681,0,0] 
+2:[228,10014,227,0,0,454,10014,453,0,0] 
+3:[456,10014,455,0,0,682,10014,681,0,0]
+u3.rtrmapbt.ptrs[1-3] = 1:10 2:11 3:12
+---
+
+This is a two-level tree, so we should follow it towards the leaves.
+
+---
 xfs_db> addr u3.rtrmapbt.ptrs[1]
 xfs_db> p
 magic = 0x4d415052
-level = 2
-numrecs = 8
-leftsib = null
-rightsib = null
-bno = 5368
-lsn = 0x400000000
-uuid = 98bbde42-67e7-46a5-a73e-d64a76b1b5ce
-owner = 131
-crc = 0x2560d199 (correct)
-keys[1-8] = [startblock,owner,offset,attrfork,bmbtblock,startblock_hi,owner_hi,
-	     offset_hi,attrfork_hi,bmbtblock_hi]
-	1:[1,132,1,0,0,17749,132,17749,0,0]
-	2:[17751,132,17751,0,0,35499,132,35499,0,0]
-	3:[35501,132,35501,0,0,53249,132,53249,0,0]
-	4:[53251,132,53251,0,0,1658473,133,7567,0,0]
-	5:[1658475,133,7569,0,0,1667473,133,16567,0,0]
-	6:[1667475,133,16569,0,0,1685223,133,34317,0,0]
-	7:[1685225,133,34319,0,0,1694223,133,43317,0,0]
-	8:[1694225,133,43319,0,0,1705337,133,54431,0,0]
-ptrs[1-8] = 1:134 2:238 3:345 4:453 5:795 6:563 7:670 8:780
-----
-
-We arbitrarily pick pointer 7 (twice) to traverse downwards:
-
-----
-xfs_db> addr ptrs[7]
-xfs_db> p
-magic = 0x4d415052
-level = 1
-numrecs = 36
-leftsib = 563
-rightsib = 780
-bno = 5360
-lsn = 0
-uuid = 98bbde42-67e7-46a5-a73e-d64a76b1b5ce
-owner = 131
-crc = 0x6807761d (correct)
-keys[1-36] = [startblock,owner,offset,attrfork,bmbtblock,startblock_hi,owner_hi,
-	      offset_hi,attrfork_hi,bmbtblock_hi]
-	1:[1685225,133,34319,0,0,1685473,133,34567,0,0]
-	2:[1685475,133,34569,0,0,1685723,133,34817,0,0]
-	3:[1685725,133,34819,0,0,1685973,133,35067,0,0]
-	...
-	34:[1693475,133,42569,0,0,1693723,133,42817,0,0]
-	35:[1693725,133,42819,0,0,1693973,133,43067,0,0]
-	36:[1693975,133,43069,0,0,1694223,133,43317,0,0]
-ptrs[1-36] = 1:669 2:672 3:674...34:722 35:723 36:725
-xfs_db> addr ptrs[7]
-xfs_db> p
-magic = 0x4d415052
 level = 0
-numrecs = 125
-leftsib = 678
-rightsib = 681
-bno = 5440
+numrecs = 115
+leftsib = null
+rightsib = 11
+bno = 80
 lsn = 0
-uuid = 98bbde42-67e7-46a5-a73e-d64a76b1b5ce
-owner = 131
-crc = 0xefce34d4 (correct)
-recs[1-125] = [startblock,blockcount,owner,offset,extentflag,attrfork,bmbtblock]
-	1:[1686725,1,133,35819,0,0,0]
-	2:[1686727,1,133,35821,0,0,0]
-	3:[1686729,1,133,35823,0,0,0]
-	...
-	123:[1686969,1,133,36063,0,0,0]
-	124:[1686971,1,133,36065,0,0,0]
-	125:[1686973,1,133,36067,0,0,0]
+uuid = 23d157a4-8ca7-4fca-8782-637dc6746105
+owner = 133
+crc = 0x4c046e7d (correct)
+recs[1-115] = [startblock,blockcount,owner,offset,extentflag,attrfork,bmbtblock] 
+1:[0,1,-3,0,0,0,0] 
+2:[1,682,10015,0,0,0,0] 
+3:[2,1,10014,1,0,0,0] 
+4:[4,1,10014,3,0,0,0] 
+5:[6,1,10014,5,0,0,0] 
+6:[8,1,10014,7,0,0,0] 
+7:[10,1,10014,9,0,0,0] 
+8:[12,1,10014,11,0,0,0] 
+9:[14,1,10014,13,0,0,0] 
+...
+112:[220,1,10014,219,0,0,0] 
+113:[222,1,10014,221,0,0,0] 
+114:[224,1,10014,223,0,0,0] 
+115:[226,1,10014,225,0,0,0]
 ----
 
-Several interesting things pop out here.  The first record shows that
-inode 133 has mapped real-time block 1,686,725 at offset 35,819.  We
-confirm this by looking at the block map for that inode:
+Several interesting things pop out here.  The first record shows that inode
+10,014 has mapped real-time block 225 at offset 225.  We confirm this by
+looking at the block map for that inode:
 
 ----
-xfs_db> inode 133
+xfs_db> inode 10014
 xfs_db> p core.realtime
 core.realtime = 1
-xfs_db> bmap
-data offset 35817 startblock 1686723 (1/638147) count 1 flag 0
-data offset 35819 startblock 1686725 (1/638149) count 1 flag 0
-data offset 35821 startblock 1686727 (1/638151) count 1 flag 0
+xfs_db> bmap 220 10
+data offset 221 startblock 222 (0/222) count 1 flag 0
+data offset 223 startblock 224 (0/224) count 1 flag 0
+data offset 225 startblock 226 (0/226) count 1 flag 0
+data offset 227 startblock 228 (0/228) count 1 flag 0
+data offset 229 startblock 230 (0/230) count 1 flag 0
 ----
 
-Notice that inode 133 has the real-time flag set, which means that its
+Notice that inode 10,014 has the real-time flag set, which means that its
 data blocks are all allocated from the real-time device.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] design: document changes for the realtime refcount btree
  2023-12-31 20:04 ` [PATCHSET v2.0 4/4] xfs-documentation: reflink on the realtime device Darrick J. Wong
@ 2023-12-27 14:09   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-27 14:09 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the ondisk format documentation to reflect the realtime refcount
btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 .../internal_inodes.asciidoc                       |    5 -
 .../journaling_log.asciidoc                        |    9 +
 design/XFS_Filesystem_Structure/magic.asciidoc     |    1 
 design/XFS_Filesystem_Structure/realtime.asciidoc  |    5 -
 .../XFS_Filesystem_Structure/rtrefcountbt.asciidoc |  173 ++++++++++++++++++++
 5 files changed, 190 insertions(+), 3 deletions(-)
 create mode 100644 design/XFS_Filesystem_Structure/rtrefcountbt.asciidoc


diff --git a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
index 7da0cdf6..0fc758c6 100644
--- a/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
+++ b/design/XFS_Filesystem_Structure/internal_inodes.asciidoc
@@ -31,6 +31,7 @@ of those inodes have been deallocated and may be reused by future features.
 | xref:Real-Time_Bitmap_Inode[Realtime Bitmap]   | /realtime/bitmap
 | xref:Real-Time_Summary_Inode[Realtime Summary] | /realtime/summary
 | xref:Real_time_Reverse_Mapping_Btree[Realtime Reverse Mapping B+tree] | /realtime/*.rmap
+| xref:Real_time_Refcount_Btree[Realtime Reference Count+tree] | /realtime/*.refcount
 |=====
 
 Metadata files are flagged by the +XFS_DIFLAG2_METADATA+ flag in the
@@ -325,4 +326,6 @@ xref:Real-Time_Bitmap_Inode[Bitmap Inode] and the
 xref:Real-Time_Summary_Inode[Summary Inode].
 
 Each realtime group can allocate one inode to managing a
-xref:Real_time_Reverse_Mapping_Btree[reverse-index of space] usage.
+xref:Real_time_Reverse_Mapping_Btree[reverse-index of space] usage, and
+a second one to manage xref:Real_time_Refcount_Btree[reference counts] of space
+usage.
diff --git a/design/XFS_Filesystem_Structure/journaling_log.asciidoc b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
index 52513b18..2c7b7383 100644
--- a/design/XFS_Filesystem_Structure/journaling_log.asciidoc
+++ b/design/XFS_Filesystem_Structure/journaling_log.asciidoc
@@ -569,6 +569,15 @@ reverse mapping operation we want.  The upper three bytes are flag bits.
 | +XFS_REFCOUNT_EXTENT_FREE_COW+  | Unreserve an extent for staging copy on write.
 |=====
 
+.Reference count update log intent flags
+[options="header"]
+|=====
+| Value				  | Description
+| +XFS_REFCOUNT_EXTENT_REALTIME+  | Extent describes a range of blocks on the
+realtime volume.  The range must be aligned to the realtime extent size,
+because extents cannot be partially shared.
+|=====
+
 The ``reference count update intent'' operation comes first; it tells the log
 that XFS wants to update some reference counts.  This record is crucial for
 correct log recovery because it enables us to spread a complex metadata update
diff --git a/design/XFS_Filesystem_Structure/magic.asciidoc b/design/XFS_Filesystem_Structure/magic.asciidoc
index c83f59a2..19ee52b8 100644
--- a/design/XFS_Filesystem_Structure/magic.asciidoc
+++ b/design/XFS_Filesystem_Structure/magic.asciidoc
@@ -49,6 +49,7 @@ relevant chapters.  Magic numbers tend to have consistent locations:
 | +XFS_REFC_CRC_MAGIC+		| 0x52334643	| R3FC	| xref:Reference_Count_Btree[Reference Count B+tree], v5 only
 | +XFS_MD_MAGIC+		| 0x5846534d	| XFSM	| xref:Metadata_Dumps[Metadata Dumps]
 | +XFS_RTSB_MAGIC+		| 0x58524750	| XRGP	| xref:Realtime_Groups[Realtime Groups]
+| +XFS_RTREFC_CRC_MAGIC+	| 0x52434e54	| RCNT	| xref:Real_time_Refcount_Btree[Real-Time Reference Count B+tree], v5 only
 |=====
 
 The magic numbers for log items are at offset zero in each log item, but items
diff --git a/design/XFS_Filesystem_Structure/realtime.asciidoc b/design/XFS_Filesystem_Structure/realtime.asciidoc
index 77d947ba..5d0b47a2 100644
--- a/design/XFS_Filesystem_Structure/realtime.asciidoc
+++ b/design/XFS_Filesystem_Structure/realtime.asciidoc
@@ -14,8 +14,7 @@ By placing the real time device (and the journal) on separate high-performance
 storage devices, it is possible to reduce most of the unpredictability in I/O
 response times that come from metadata operations.
 
-None of the XFS per-AG B+trees are involved with real time files.  It is not
-possible for real time files to share data blocks.
+None of the XFS per-AG B+trees are involved with real time files.
 
 [[Real-Time_Bitmap_Inode]]
 == Free Space Bitmap Inode
@@ -233,3 +232,5 @@ meta_uuid = c52adb8a-48a6-4325-b251-d4dcb30889ea
 ----
 
 include::rtrmapbt.asciidoc[]
+
+include::rtrefcountbt.asciidoc[]
diff --git a/design/XFS_Filesystem_Structure/rtrefcountbt.asciidoc b/design/XFS_Filesystem_Structure/rtrefcountbt.asciidoc
new file mode 100644
index 00000000..617badbf
--- /dev/null
+++ b/design/XFS_Filesystem_Structure/rtrefcountbt.asciidoc
@@ -0,0 +1,173 @@
+[[Real_time_Refcount_Btree]]
+=== Reference Count B+tree
+
+If the reflink and real-time storage device features are enabled, each
+real-time group has its own reference count B+tree.
+
+As mentioned in the chapter about xref:Reflink_Deduplication[sharing data
+blocks], this data structure is necessary to track how many times each extent
+in the realtime volume has been mapped.  This is how the copy-on-write code
+determines what to do when a realtime file is written.
+
+This B+tree is only present if the +XFS_SB_FEAT_RO_COMPAT_REFLINK+ feature is
+enabled and a real time device is present.  The feature requires a version 5
+filesystem.
+
+The rtgroup reference count B+tree is rooted in an inode's data fork; the inode
+number can be found by resolving the path +/realtime/$rgno.refcount+ in the
+metadata directory tree.  superblock.  The B+tree blocks themselves are stored
+in the regular filesystem.  The structures used for an inode's B+tree root are:
+
+[source, c]
+----
+struct xfs_rtrefcount_root {
+     __be16                     bb_level;
+     __be16                     bb_numrecs;
+};
+----
+
+* If the B+tree contains only a single level, the ondisk data fork area begins
+with a +xfs_rtrefcount_root+ header followed by an array of +xfs_refcount_rec+
+leaf records.
+
+* Otherwise, the ondisk data fork area begins with the +xfs_rtrefcount_root+
+header and is followed first by an array of +xfs_refcount_key+ values and then
+an array of +xfs_rtrefcount_ptr_t+ values.  The size of both arrays is
+specified by the header's +bb_numrecs+ value.
+
+* The root node in the inode can only contain up to 28 leaf records or
+key/pointer pairs for a standard 512 byte inode before a new level of nodes is
+added between the root and the leaves.  +di_forkoff+ should always be zero,
+because there are no extended attributes.
+
+Each record in an rtgroup reference count B+tree has the same structure as an
+AG reference count btree:
+
+[source, c]
+----
+struct xfs_refcount_rec {
+     __be32                     rc_startblock;
+     __be32                     rc_blockcount;
+     __be32                     rc_refcount;
+};
+----
+
+*rc_startblock*::
+rtgroup block number of this record.  Note that reference count records are
+tracked in units of realtime blocks, not realtime extents.
+However, records must be aligned to the realtime extent size in accordance with
+the existing realtime extent handling strategy.  The high bit
+(+XFS_REFC_COW_FLAG+) is set for all records referring to an extent that is
+being used to stage a copy on write operation.  This reduces recovery time
+during mount operations.  The reference count of these staging events must only
+be 1.
+
+*rc_blockcount*::
+The length of this extent, in filesystem blocks.
+
+*rc_refcount*::
+Number of times this extent has been shared.
+
+The key has the following structure:
+
+[source, c]
+----
+struct xfs_refcount_key {
+     __be32                     rc_startblock;
+};
+----
+
+* All block numbers are 32-bit rtgroup device block numbers, though the
+key should be aligned to the realtime extent size.
+
+* The +bb_magic+ value is ``RCNT'' (0x52434354).
+
+* The +struct xfs_btree_lblock+ header is used for intermediate B+tree node as
+well as the leaves.
+
+==== xfs_db rtrefcountbt Example
+
+This example shows a real-time reference count B+tree from a freshly
+populated filesystem.  One directory tree has been reflinked:
+
+----
+xfs_db> path -m /realtime/0.refcount
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0100000
+core.version = 3
+core.format = 6 (refcount)
+...
+v3.inumber = 134
+v3.uuid = 23d157a4-8ca7-4fca-8782-637dc6746105
+v3.reflink = 0
+v3.cowextsz = 0
+v3.dax = 0
+v3.bigtime = 1
+v3.nrext64 = 1
+v3.metadata = 1
+u3.rtrefcbt.level = 1
+u3.rtrefcbt.numrecs = 2
+u3.rtrefcbt.keys[1-2] = [startblock,cowflag] 
+1:[4,0] 
+2:[344,0]
+u3.rtrefcbt.ptrs[1-2] = 1:8 2:9
+----
+
+Notice that this is a two-level refcount btree; we must continue towards the
+leaf level.
+
+----
+xfs_db> addr u3.rtrefcbt.ptrs[2]
+xfs_db> p
+magic = 0x52434e54
+level = 0
+numrecs = 170
+leftsib = 8
+rightsib = null
+bno = 72
+lsn = 0
+uuid = 23d157a4-8ca7-4fca-8782-637dc6746105
+owner = 134
+crc = 0x21e04c3 (correct)
+recs[1-170] = [startblock,blockcount,refcount,cowflag] 
+1:[344,1,2,0] 
+2:[346,1,2,0] 
+3:[348,1,2,0] 
+4:[350,1,2,0] 
+5:[352,1,2,0] 
+6:[354,1,2,0] 
+...
+----
+
+This indicates that realtime block 354 is shared.  Let's use the realtime
+reverse mapping information to find which files are sharing these blocks:
+
+----
+xfs_db> fsmap -r 354 354
+0: 0/1 len 682 owner 10015 offset 0 bmbt 0 attrfork 0 extflag 0
+1: 0/354 len 1 owner 10014 offset 353 bmbt 0 attrfork 0 extflag 0
+----
+
+It looks as though inodes 10,014 and 10,015 share this block.  Let us confirm
+this by navigating to those inodes and dumping the data fork mappings:
+
+----
+xfs_db> inode 10015
+xfs_db> p core.realtime
+core.realtime = 1
+xfs_db> bmap
+data offset 0 startblock 1 (0/1) count 682 flag 0
+xfs_db> inode 10014
+xfs_db> p core.realtime
+core.realtime = 1
+xfs_db> bmap 350 10
+data offset 351 startblock 352 (0/352) count 1 flag 0
+data offset 353 startblock 354 (0/354) count 1 flag 0
+data offset 355 startblock 356 (0/356) count 1 flag 0
+data offset 357 startblock 358 (0/358) count 1 flag 0
+data offset 359 startblock 360 (0/360) count 1 flag 0
+----
+
+Notice that both inodes have their realtime flags set, and both of them map
+a data fork extent to the same realtime block 354.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume
@ 2023-12-31 18:23 Darrick J. Wong
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                   ` (44 more replies)
  0 siblings, 45 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 18:23 UTC (permalink / raw)
  To: xfs, fstests; +Cc: Christoph Hellwig

Hi everyone,

This third patchriver contains for the realtime modernization project.
There are five main parts to this effort -- adding a metadata directory
tree; sharding the realtime volume into allocation groups to reduce
metadata lock contention; adding reverse mapping; adding reflink; and
adding the one piece needed to make quotas work on realtime.  This
brings the robustness of the realtime volume up to par with the data
volume.

Christoph Hellwig has recently taken an interest in getting this feature
merged to better support zoned storage and garbage collection.  This
river is much smaller than last year's, as we've recently collaborated
to get a bunch of the typedef abuses and casting problems corrected.

In theory this can also be used to support things like pmem and cxl.mem
via multi-volume XFSes where the fs metadata lives on cheap(er) flash
storage so that the pmem can be the exclusive backing of file data on
the rt volume.

--D

^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
@ 2023-12-31 19:34 ` Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 01/21] xfs: move inode copy-on-write predicates to xfs_inode.[ch] Darrick J. Wong
                     ` (20 more replies)
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                   ` (43 subsequent siblings)
  44 siblings, 21 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:34 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

This series hoists inode creation, renaming, and deletion operations to
libxfs in anticipation of the metadata inode directory feature, which
maintains a directory tree of metadata inodes.  This will be necessary
for further enhancements to the realtime feature, subvolume support.

There aren't supposed to be any functional changes in this intense
refactoring -- we just split the functions into pieces that are generic
and pieces that are specific to libxfs clients.  As a bonus, we can
remove various open-coded pieces of mkfs.xfs and xfs_repair when this
series gets to xfsprogs.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=inode-refactor

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=inode-refactor
---
 fs/xfs/Makefile                 |    1 
 fs/xfs/libxfs/xfs_bmap.c        |   42 +
 fs/xfs/libxfs/xfs_bmap.h        |    3 
 fs/xfs/libxfs/xfs_dir2.c        |  636 +++++++++++++++++
 fs/xfs/libxfs/xfs_dir2.h        |   48 +
 fs/xfs/libxfs/xfs_ialloc.c      |   15 
 fs/xfs/libxfs/xfs_inode_util.c  |  723 +++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h  |   71 ++
 fs/xfs/libxfs/xfs_shared.h      |    7 
 fs/xfs/libxfs/xfs_trans_inode.c |    2 
 fs/xfs/scrub/common.c           |    1 
 fs/xfs/scrub/tempfile.c         |   20 -
 fs/xfs/xfs_inode.c              | 1486 +++++----------------------------------
 fs/xfs/xfs_inode.h              |   75 +-
 fs/xfs/xfs_ioctl.c              |   60 --
 fs/xfs/xfs_iops.c               |   47 +
 fs/xfs/xfs_linux.h              |    2 
 fs/xfs/xfs_qm.c                 |    9 
 fs/xfs/xfs_reflink.h            |   10 
 fs/xfs/xfs_symlink.c            |   63 +-
 fs/xfs/xfs_trans.h              |    1 
 21 files changed, 1830 insertions(+), 1492 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_inode_util.c
 create mode 100644 fs/xfs/libxfs/xfs_inode_util.h


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 02/15] xfs: metadata inode directories
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
@ 2023-12-31 19:35 ` Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 01/32] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
                     ` (31 more replies)
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
                   ` (42 subsequent siblings)
  44 siblings, 32 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

This series delivers a new feature -- metadata inode directories.  This
is a separate directory tree (rooted in the superblock) that contains
only inodes that contain filesystem metadata.  Different metadata
objects can be looked up with regular paths.

We start by creating xfs_imeta_* functions to mediate access to metadata
inode pointers.  This enables the imeta code to abstract inode pointers,
whether they're the classic five in the superblock, or the much more
complex directory tree.  All current users of metadata inodes (rt+quota)
are converted to use the boilerplate code.

Next, we define the metadir on-disk format, which consists of marking
inodes with a new iflag that says they're metadata.  This we use to
prevent bulkstat and friends from ever getting their hands on fs
metadata.

Finally, we implement metadir operations so that clients can create,
delete, zap, and look up metadata inodes by path.  Beware that much of
this code is only lightly used, because the five current users of
metadata inodes don't tend to change them very often.  This is likely to
change if and when the subvolume and multiple-rt-volume features get
written/merged/etc.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=metadir

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=metadir

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=metadir

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=metadir
---
 fs/xfs/Makefile                |    3 
 fs/xfs/libxfs/xfs_attr.c       |    5 
 fs/xfs/libxfs/xfs_bmap.c       |    5 
 fs/xfs/libxfs/xfs_format.h     |   72 ++-
 fs/xfs/libxfs/xfs_fs.h         |   29 +
 fs/xfs/libxfs/xfs_health.h     |    6 
 fs/xfs/libxfs/xfs_ialloc.c     |   56 +-
 fs/xfs/libxfs/xfs_ialloc.h     |    2 
 fs/xfs/libxfs/xfs_imeta.c      | 1077 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h      |  110 ++++
 fs/xfs/libxfs/xfs_inode_buf.c  |   73 +++
 fs/xfs/libxfs/xfs_inode_buf.h  |    3 
 fs/xfs/libxfs/xfs_inode_util.c |    5 
 fs/xfs/libxfs/xfs_ondisk.h     |    1 
 fs/xfs/libxfs/xfs_sb.c         |   35 +
 fs/xfs/libxfs/xfs_trans_resv.c |  106 ++++
 fs/xfs/libxfs/xfs_trans_resv.h |    3 
 fs/xfs/libxfs/xfs_types.c      |    7 
 fs/xfs/scrub/agheader.c        |   29 +
 fs/xfs/scrub/common.c          |   23 +
 fs/xfs/scrub/common.h          |    1 
 fs/xfs/scrub/dir.c             |    8 
 fs/xfs/scrub/dir_repair.c      |    8 
 fs/xfs/scrub/dirtree.c         |   22 +
 fs/xfs/scrub/dirtree.h         |    2 
 fs/xfs/scrub/findparent.c      |   37 +
 fs/xfs/scrub/health.c          |    1 
 fs/xfs/scrub/inode.c           |   25 +
 fs/xfs/scrub/inode_repair.c    |   18 +
 fs/xfs/scrub/metapath.c        |  617 +++++++++++++++++++++++
 fs/xfs/scrub/nlinks.c          |   12 
 fs/xfs/scrub/nlinks_repair.c   |    2 
 fs/xfs/scrub/orphanage.c       |    2 
 fs/xfs/scrub/parent.c          |   30 +
 fs/xfs/scrub/parent_repair.c   |    2 
 fs/xfs/scrub/quotacheck.c      |    7 
 fs/xfs/scrub/repair.c          |    8 
 fs/xfs/scrub/repair.h          |    3 
 fs/xfs/scrub/scrub.c           |    9 
 fs/xfs/scrub/scrub.h           |    2 
 fs/xfs/scrub/stats.c           |    1 
 fs/xfs/scrub/tempfile.c        |  107 ++++
 fs/xfs/scrub/tempfile.h        |    3 
 fs/xfs/scrub/trace.c           |    1 
 fs/xfs/scrub/trace.h           |   51 ++
 fs/xfs/xfs_dquot.c             |    1 
 fs/xfs/xfs_health.c            |    2 
 fs/xfs/xfs_icache.c            |   43 ++
 fs/xfs/xfs_imeta_utils.c       |  341 +++++++++++++
 fs/xfs/xfs_imeta_utils.h       |   27 +
 fs/xfs/xfs_inode.c             |   28 +
 fs/xfs/xfs_inode.h             |   16 +
 fs/xfs/xfs_ioctl.c             |    7 
 fs/xfs/xfs_iops.c              |   34 +
 fs/xfs/xfs_itable.c            |   34 +
 fs/xfs/xfs_itable.h            |    3 
 fs/xfs/xfs_mount.c             |   48 ++
 fs/xfs/xfs_mount.h             |    3 
 fs/xfs/xfs_qm.c                |  245 ++++++---
 fs/xfs/xfs_qm_syscalls.c       |    9 
 fs/xfs/xfs_quota.h             |    5 
 fs/xfs/xfs_rtalloc.c           |   46 +-
 fs/xfs/xfs_super.c             |    4 
 fs/xfs/xfs_symlink.c           |    2 
 fs/xfs/xfs_trace.c             |    1 
 fs/xfs/xfs_trace.h             |  139 +++++
 fs/xfs/xfs_trans_dquot.c       |    6 
 67 files changed, 3466 insertions(+), 207 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_imeta.c
 create mode 100644 fs/xfs/libxfs/xfs_imeta.h
 create mode 100644 fs/xfs/scrub/metapath.c
 create mode 100644 fs/xfs/xfs_imeta_utils.c
 create mode 100644 fs/xfs/xfs_imeta_utils.h


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
@ 2023-12-31 19:35 ` Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 1/4] xfs: refactor realtime scrubbing context management Darrick J. Wong
                     ` (3 more replies)
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                   ` (41 subsequent siblings)
  44 siblings, 4 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

Replace all the open-coded locking of realtime metadata inodes with a
single rtlock function that can lock all the pieces that the caller
wants in a single call.  This will be important for maintaining correct
locking order later when we start adding more realtime metadata inodes.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=refactor-rt-locking

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=refactor-rt-locking
---
 fs/xfs/libxfs/xfs_bmap.c     |    7 +---
 fs/xfs/libxfs/xfs_rtbitmap.c |   57 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtbitmap.h |   17 ++++++++++
 fs/xfs/scrub/bmap.c          |    2 +
 fs/xfs/scrub/common.c        |   70 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/common.h        |   18 +++++++++++
 fs/xfs/scrub/fscounters.c    |    4 +-
 fs/xfs/scrub/rtbitmap.c      |    7 +---
 fs/xfs/scrub/rtsummary.c     |   12 ++-----
 fs/xfs/scrub/scrub.c         |    1 +
 fs/xfs/scrub/scrub.h         |    9 +++++
 fs/xfs/xfs_bmap_util.c       |    5 +--
 fs/xfs/xfs_fsmap.c           |    4 +-
 fs/xfs/xfs_inode.c           |    3 +-
 fs/xfs/xfs_inode.h           |   13 ++------
 fs/xfs/xfs_rtalloc.c         |   62 ++++++++++++++++++++++++++-----------
 16 files changed, 235 insertions(+), 56 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (2 preceding siblings ...)
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
@ 2023-12-31 19:35 ` Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 01/24] xfs: create incore realtime group structures Darrick J. Wong
                     ` (23 more replies)
  2023-12-31 19:35 ` [PATCHSET v2.0 05/15] xfs: enable FITRIM for the realtime section Darrick J. Wong
                   ` (40 subsequent siblings)
  44 siblings, 24 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

Right now, the realtime section uses a single pair of metadata inodes to
store the free space information.  This presents a scalability problem
since every thread trying to allocate or free rt extents have to lock
these files.  It would be very useful if we could begin to tackle these
problems by sharding the realtime section, so create the notion of
realtime groups, which are similar to allocation groups on the data
section.

While we're at it, define a superblock to be stamped into the start of
each rt section.  This enables utilities such as blkid to identify block
devices containing realtime sections, and helpfully avoids the situation
where a file extent can cross an rtgroup boundary.

The best advantage for rtgroups will become evident later when we get to
adding rmap and reflink to the realtime volume, since the geometry
constraints are the same for rt groups and AGs.  Hence we can reuse all
that code directly.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-groups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-groups

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-groups

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-groups
---
 fs/xfs/Makefile                 |    3 
 fs/xfs/libxfs/xfs_bmap.h        |    5 
 fs/xfs/libxfs/xfs_format.h      |   96 ++++++
 fs/xfs/libxfs/xfs_fs.h          |    9 -
 fs/xfs/libxfs/xfs_fs_staging.h  |   17 +
 fs/xfs/libxfs/xfs_health.h      |   30 ++
 fs/xfs/libxfs/xfs_ondisk.h      |    2 
 fs/xfs/libxfs/xfs_rtbitmap.c    |  143 +++++++++-
 fs/xfs/libxfs/xfs_rtbitmap.h    |   68 ++++-
 fs/xfs/libxfs/xfs_rtgroup.c     |  585 +++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h     |  252 +++++++++++++++++
 fs/xfs/libxfs/xfs_sb.c          |  126 ++++++++
 fs/xfs/libxfs/xfs_shared.h      |    4 
 fs/xfs/libxfs/xfs_types.c       |   46 +++
 fs/xfs/libxfs/xfs_types.h       |    4 
 fs/xfs/scrub/common.c           |   88 ++++++
 fs/xfs/scrub/common.h           |   52 ++-
 fs/xfs/scrub/health.c           |   25 ++
 fs/xfs/scrub/repair.h           |    3 
 fs/xfs/scrub/rgsuper.c          |   77 +++++
 fs/xfs/scrub/rgsuper_repair.c   |   48 +++
 fs/xfs/scrub/rtbitmap.c         |  127 ++++++++
 fs/xfs/scrub/rtbitmap.h         |    6 
 fs/xfs/scrub/rtsummary.c        |    7 
 fs/xfs/scrub/rtsummary_repair.c |   15 +
 fs/xfs/scrub/scrub.c            |   27 ++
 fs/xfs/scrub/scrub.h            |   42 +--
 fs/xfs/scrub/stats.c            |    2 
 fs/xfs/scrub/trace.h            |    6 
 fs/xfs/xfs_bmap_item.c          |   18 +
 fs/xfs/xfs_buf_item_recover.c   |   43 +++
 fs/xfs/xfs_fsops.c              |    4 
 fs/xfs/xfs_health.c             |  114 ++++++++
 fs/xfs/xfs_ioctl.c              |   35 ++
 fs/xfs/xfs_log_recover.c        |    6 
 fs/xfs/xfs_mount.c              |   12 +
 fs/xfs/xfs_mount.h              |   11 +
 fs/xfs/xfs_rtalloc.c            |  273 +++++++++++++++++-
 fs/xfs/xfs_rtalloc.h            |    5 
 fs/xfs/xfs_super.c              |   18 +
 fs/xfs/xfs_trace.c              |    1 
 fs/xfs/xfs_trace.h              |   63 ++++
 fs/xfs/xfs_trans.c              |   38 ++-
 fs/xfs/xfs_trans.h              |    2 
 fs/xfs/xfs_trans_buf.c          |   25 +-
 45 files changed, 2460 insertions(+), 123 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h
 create mode 100644 fs/xfs/scrub/rgsuper.c
 create mode 100644 fs/xfs/scrub/rgsuper_repair.c


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 05/15] xfs: enable FITRIM for the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (3 preceding siblings ...)
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-31 19:35 ` Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 1/1] xfs: enable FITRIM on the realtime device Darrick J. Wong
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                   ` (39 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

One thing that's been missing for a long time is the ability to tell
underlying storage that it can unmap the unused space on the realtime
device.  This short series exposes this functionality through FITRIM.
Callers that want ranged FITRIM should be aware that the realtime space
exists in the offset range after the data device.  However, it is
anticipated that most callers pass in offset=0 len=-1ULL and will not
notice or care.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-discard

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-discard
---
 fs/xfs/xfs_discard.c |  225 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trace.h   |   29 ++++++
 2 files changed, 248 insertions(+), 6 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (4 preceding siblings ...)
  2023-12-31 19:35 ` [PATCHSET v2.0 05/15] xfs: enable FITRIM for the realtime section Darrick J. Wong
@ 2023-12-31 19:36 ` Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
                     ` (13 more replies)
  2023-12-31 19:36 ` [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata Darrick J. Wong
                   ` (38 subsequent siblings)
  44 siblings, 14 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:36 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

This series prepares the btree code to support realtime reverse mapping
btrees by refactoring xfs_ifork_realloc to be fed a per-btree ops
structure so that it can handle multiple types of inode-rooted btrees.
It moves on to refactoring the btree code to use the new realloc
routines and to support storing btree rcords in the inode root, because
the current bmbt code does not support this.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=btree-ifork-records

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=btree-ifork-records
---
 fs/xfs/libxfs/xfs_alloc_btree.c    |    6 -
 fs/xfs/libxfs/xfs_alloc_btree.h    |    3 
 fs/xfs/libxfs/xfs_attr_leaf.c      |    8 -
 fs/xfs/libxfs/xfs_bmap.c           |   59 ++----
 fs/xfs/libxfs/xfs_bmap_btree.c     |   93 ++++++++-
 fs/xfs/libxfs/xfs_bmap_btree.h     |  219 +++++++++++++++------
 fs/xfs/libxfs/xfs_btree.c          |  382 ++++++++++++++++++++++++++++--------
 fs/xfs/libxfs/xfs_btree.h          |    4 
 fs/xfs/libxfs/xfs_btree_staging.c  |    4 
 fs/xfs/libxfs/xfs_ialloc.c         |    4 
 fs/xfs/libxfs/xfs_ialloc_btree.c   |    6 -
 fs/xfs/libxfs/xfs_ialloc_btree.h   |    3 
 fs/xfs/libxfs/xfs_inode_fork.c     |  163 +++++++--------
 fs/xfs/libxfs/xfs_inode_fork.h     |   27 ++-
 fs/xfs/libxfs/xfs_refcount_btree.c |    5 
 fs/xfs/libxfs/xfs_refcount_btree.h |    3 
 fs/xfs/libxfs/xfs_rmap_btree.c     |    9 -
 fs/xfs/libxfs/xfs_rmap_btree.h     |    3 
 fs/xfs/libxfs/xfs_sb.c             |   16 +-
 fs/xfs/libxfs/xfs_trans_resv.c     |    2 
 fs/xfs/scrub/bmap_repair.c         |    2 
 fs/xfs/scrub/inode_repair.c        |   12 +
 fs/xfs/xfs_xchgrange.c             |    4 
 23 files changed, 707 insertions(+), 330 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (5 preceding siblings ...)
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
@ 2023-12-31 19:36 ` Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                   ` (37 subsequent siblings)
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:36 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

In preparation for adding reverse mapping and refcounting to the
realtime device, enhance the metadir code to reserve free space for
btree shape changes as delayed allocation blocks.

This effectively allows us to pre-allocate space for the rmap and
refcount btrees in the same manner as we do for the data device
counterparts, which is how we avoid ENOSPC failures when space is low
but we've already committed to a COW operation.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=reserve-rt-metadata-space

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=reserve-rt-metadata-space
---
 fs/xfs/libxfs/xfs_ag.c       |    4 -
 fs/xfs/libxfs/xfs_ag_resv.c  |   25 ++---
 fs/xfs/libxfs/xfs_ag_resv.h  |    2 
 fs/xfs/libxfs/xfs_errortag.h |    4 +
 fs/xfs/libxfs/xfs_imeta.c    |  191 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h    |   11 ++
 fs/xfs/libxfs/xfs_types.h    |    7 ++
 fs/xfs/scrub/repair.c        |    5 -
 fs/xfs/xfs_error.c           |    3 +
 fs/xfs/xfs_fsops.c           |   39 +++++----
 fs/xfs/xfs_fsops.h           |    2 
 fs/xfs/xfs_inode.h           |    3 +
 fs/xfs/xfs_mount.c           |   10 ++
 fs/xfs/xfs_mount.h           |    1 
 fs/xfs/xfs_rtalloc.c         |   23 +++++
 fs/xfs/xfs_rtalloc.h         |    5 +
 fs/xfs/xfs_super.c           |    6 -
 fs/xfs/xfs_trace.h           |   46 ++++++++++
 18 files changed, 337 insertions(+), 50 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (6 preceding siblings ...)
  2023-12-31 19:36 ` [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata Darrick J. Wong
@ 2023-12-31 19:36 ` Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 1/9] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
                     ` (8 more replies)
  2023-12-31 19:36 ` [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt Darrick J. Wong
                   ` (36 subsequent siblings)
  44 siblings, 9 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:36 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

This series cleans up some warts in the extent freeing log intent code.
We start by acknowledging that this mechanism does not have anything to
do with the bmap code by moving it to xfs_alloc.c and giving the
function a more descriptive name.  Then we clean up the tracepoints and
the _finish_one call paths to pass the intent structure around.  This
reduces the overhead when the tracepoints are disabled and will make
things much cleaner when we start adding realtime support in the next
patch.  I also incorporated a bunch of cleanups from Christoph Hellwig.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=extfree-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=extfree-intent-cleanups
---
 fs/xfs/libxfs/xfs_ag.c             |    2 -
 fs/xfs/libxfs/xfs_alloc.c          |   92 ++++++++---------------------
 fs/xfs/libxfs/xfs_alloc.h          |   12 ++--
 fs/xfs/libxfs/xfs_bmap.c           |   12 +++-
 fs/xfs/libxfs/xfs_bmap_btree.c     |    2 -
 fs/xfs/libxfs/xfs_ialloc.c         |    5 +-
 fs/xfs/libxfs/xfs_ialloc_btree.c   |    2 -
 fs/xfs/libxfs/xfs_refcount.c       |    6 +-
 fs/xfs/libxfs/xfs_refcount_btree.c |    2 -
 fs/xfs/scrub/newbt.c               |    5 +-
 fs/xfs/scrub/reap.c                |    7 +-
 fs/xfs/xfs_bmap_item.c             |    6 --
 fs/xfs/xfs_drain.c                 |    8 +--
 fs/xfs/xfs_drain.h                 |    5 +-
 fs/xfs/xfs_extfree_item.c          |  115 +++++++++++++++++-------------------
 fs/xfs/xfs_extfree_item.h          |    6 ++
 fs/xfs/xfs_refcount_item.c         |    5 --
 fs/xfs/xfs_reflink.c               |    2 -
 fs/xfs/xfs_rmap_item.c             |    5 --
 fs/xfs/xfs_trace.h                 |   33 +++++-----
 20 files changed, 141 insertions(+), 191 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (7 preceding siblings ...)
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
@ 2023-12-31 19:36 ` Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 1/2] xfs: support logging EFIs for realtime extents Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 2/2] xfs: support error injection when freeing rt extents Darrick J. Wong
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                   ` (35 subsequent siblings)
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:36 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

Realtime reverse mapping (and beyond that, realtime reflink) needs to be
able to defer file mapping and extent freeing work in much the same
manner as is required on the data volume.  Make the extent freeing log
items operate on rt extents in preparation for realtime rmap.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-extfree-intents

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-extfree-intents
---
 fs/xfs/libxfs/xfs_alloc.c       |   16 ++
 fs/xfs/libxfs/xfs_alloc.h       |   17 ++
 fs/xfs/libxfs/xfs_defer.c       |    6 +
 fs/xfs/libxfs/xfs_defer.h       |    1 
 fs/xfs/libxfs/xfs_log_format.h  |    6 +
 fs/xfs/libxfs/xfs_log_recover.h |    2 
 fs/xfs/libxfs/xfs_rtbitmap.c    |    4 +
 fs/xfs/xfs_extfree_item.c       |  282 ++++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_log_recover.c        |    2 
 9 files changed, 310 insertions(+), 26 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (8 preceding siblings ...)
  2023-12-31 19:36 ` [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt Darrick J. Wong
@ 2023-12-31 19:37 ` Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 01/10] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
                     ` (9 more replies)
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                   ` (34 subsequent siblings)
  44 siblings, 10 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:37 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

This series cleans up the rmap intent code before we start adding
support for realtime devices.  Similar to previous intent cleanup
patchsets, we start transforming the tracepoints so that the data
extraction are done inside the tracepoint code, and then we start
passing the intent itself to the _finish_one function.  This reduces the
boxing and unboxing of parameters.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=rmap-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=rmap-intent-cleanups
---
 fs/xfs/libxfs/xfs_btree.c |    4 +
 fs/xfs/libxfs/xfs_btree.h |    2 
 fs/xfs/libxfs/xfs_rmap.c  |  268 ++++++++++++++++-----------------------------
 fs/xfs/libxfs/xfs_rmap.h  |   15 ++-
 fs/xfs/xfs_rmap_item.c    |  148 +++++++++++++------------
 fs/xfs/xfs_rmap_item.h    |    4 +
 fs/xfs/xfs_trace.c        |    1 
 fs/xfs/xfs_trace.h        |  187 +++++++++++++++++++++----------
 8 files changed, 316 insertions(+), 313 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (9 preceding siblings ...)
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
@ 2023-12-31 19:37 ` Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 01/39] xfs: prepare rmap btree cursor tracepoints for realtime Darrick J. Wong
                     ` (38 more replies)
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                   ` (33 subsequent siblings)
  44 siblings, 39 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:37 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

This is the latest revision of a patchset that adds to XFS kernel
support for reverse mapping for the realtime device.  This time around
I've fixed some of the bitrot that I've noticed over the past few
months, and most notably have converted rtrmapbt to use the metadata
inode directory feature instead of burning more space in the superblock.

At the beginning of the set are patches to implement storing B+tree
leaves in an inode root, since the realtime rmapbt is rooted in an
inode, unlike the regular rmapbt which is rooted in an AG block.
Prior to this, the only btree that could be rooted in the inode fork
was the block mapping btree; if all the extent records fit in the
inode, format would be switched from 'btree' to 'extents'.

The next few patches widen the reverse mapping routines to fit the
64-bit numbers required to store information about the realtime
device and establish a new b+tree type (rtrmapbt) for the realtime
variant of the rmapbt.  After that are a few patches to handle rooting
the rtrmapbt in a specific inode that's referenced by the superblock.

Finally, there are patches to implement GETFSMAP with the rtrmapbt and
scrub functionality for the rtrmapbt and rtbitmap; and then wire up the
online scrub functionality.  We also enhance EFIs to support tracking
freeing of realtime extents so that when rmap is turned on we can
maintain the same order of operations as the regular rmap code.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-rmap

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-rmap

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-rmap

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-rmap
---
 fs/xfs/Makefile                  |    3 
 fs/xfs/libxfs/xfs_bmap.c         |   23 +
 fs/xfs/libxfs/xfs_btree.c        |   78 +++
 fs/xfs/libxfs/xfs_btree.h        |    7 
 fs/xfs/libxfs/xfs_defer.h        |    1 
 fs/xfs/libxfs/xfs_format.h       |   24 +
 fs/xfs/libxfs/xfs_fs.h           |    6 
 fs/xfs/libxfs/xfs_fs_staging.h   |    1 
 fs/xfs/libxfs/xfs_health.h       |    4 
 fs/xfs/libxfs/xfs_inode_buf.c    |   26 +
 fs/xfs/libxfs/xfs_inode_fork.c   |   13 
 fs/xfs/libxfs/xfs_log_format.h   |    6 
 fs/xfs/libxfs/xfs_log_recover.h  |    2 
 fs/xfs/libxfs/xfs_ondisk.h       |    2 
 fs/xfs/libxfs/xfs_refcount.c     |    6 
 fs/xfs/libxfs/xfs_rmap.c         |  198 +++++++
 fs/xfs/libxfs/xfs_rmap.h         |   25 +
 fs/xfs/libxfs/xfs_rtgroup.c      |   12 
 fs/xfs/libxfs/xfs_rtgroup.h      |   20 +
 fs/xfs/libxfs/xfs_rtrmap_btree.c | 1030 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |  217 ++++++++
 fs/xfs/libxfs/xfs_sb.c           |    6 
 fs/xfs/libxfs/xfs_shared.h       |    2 
 fs/xfs/libxfs/xfs_swapext.c      |    4 
 fs/xfs/libxfs/xfs_trans_resv.c   |   12 
 fs/xfs/libxfs/xfs_trans_space.h  |   13 
 fs/xfs/libxfs/xfs_types.h        |    5 
 fs/xfs/scrub/alloc_repair.c      |   10 
 fs/xfs/scrub/bmap.c              |  133 ++++-
 fs/xfs/scrub/bmap_repair.c       |  128 +++++
 fs/xfs/scrub/common.c            |  165 ++++++
 fs/xfs/scrub/common.h            |   18 +
 fs/xfs/scrub/health.c            |    1 
 fs/xfs/scrub/inode.c             |   10 
 fs/xfs/scrub/inode_repair.c      |  116 ++++
 fs/xfs/scrub/metapath.c          |   27 +
 fs/xfs/scrub/newbt.c             |   43 ++
 fs/xfs/scrub/newbt.h             |    1 
 fs/xfs/scrub/reap.c              |   41 ++
 fs/xfs/scrub/reap.h              |    2 
 fs/xfs/scrub/repair.c            |  229 ++++++++
 fs/xfs/scrub/repair.h            |   28 +
 fs/xfs/scrub/rmap_repair.c       |   36 +
 fs/xfs/scrub/rtbitmap.c          |   80 +++
 fs/xfs/scrub/rtbitmap.h          |   59 ++
 fs/xfs/scrub/rtbitmap_repair.c   |  676 +++++++++++++++++++++++++
 fs/xfs/scrub/rtrmap.c            |  283 ++++++++++
 fs/xfs/scrub/rtrmap_repair.c     |  946 +++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtsummary_repair.c  |    3 
 fs/xfs/scrub/scrub.c             |   13 
 fs/xfs/scrub/scrub.h             |   14 +
 fs/xfs/scrub/stats.c             |    1 
 fs/xfs/scrub/tempfile.c          |   15 -
 fs/xfs/scrub/tempswap.h          |    2 
 fs/xfs/scrub/trace.c             |    1 
 fs/xfs/scrub/trace.h             |  249 +++++++++
 fs/xfs/scrub/xfbtree.c           |    3 
 fs/xfs/xfs_bmap_item.c           |   11 
 fs/xfs/xfs_buf_item_recover.c    |    4 
 fs/xfs/xfs_drain.c               |   93 +++
 fs/xfs/xfs_drain.h               |   28 +
 fs/xfs/xfs_extfree_item.c        |   14 -
 fs/xfs/xfs_fsmap.c               |  464 ++++++++++++-----
 fs/xfs/xfs_fsops.c               |   12 
 fs/xfs/xfs_health.c              |    4 
 fs/xfs/xfs_inode.c               |   19 +
 fs/xfs/xfs_inode_item.c          |    2 
 fs/xfs/xfs_inode_item_recover.c  |   33 +
 fs/xfs/xfs_log_recover.c         |    2 
 fs/xfs/xfs_mount.c               |    5 
 fs/xfs/xfs_mount.h               |   10 
 fs/xfs/xfs_rmap_item.c           |  254 +++++++++
 fs/xfs/xfs_rtalloc.c             |  246 +++++++++
 fs/xfs/xfs_rtalloc.h             |    5 
 fs/xfs/xfs_super.c               |    6 
 fs/xfs/xfs_trace.c               |   18 +
 fs/xfs/xfs_trace.h               |  136 ++++-
 77 files changed, 6111 insertions(+), 334 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.h
 create mode 100644 fs/xfs/scrub/rtrmap.c
 create mode 100644 fs/xfs/scrub/rtrmap_repair.c


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (10 preceding siblings ...)
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-31 19:37 ` Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 01/10] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
                     ` (9 more replies)
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                   ` (32 subsequent siblings)
  44 siblings, 10 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

This series cleans up the refcount intent code before we start adding
support for realtime devices.  Similar to previous intent cleanup
patchsets, we start transforming the tracepoints so that the data
extraction are done inside the tracepoint code, and then we start
passing the intent itself to the _finish_one function.  This reduces the
boxing and unboxing of parameters.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=refcount-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=refcount-intent-cleanups
---
 fs/xfs/libxfs/xfs_refcount.c |  150 ++++++++--------------------
 fs/xfs/libxfs/xfs_refcount.h |   11 +-
 fs/xfs/xfs_refcount_item.c   |  107 ++++++++++----------
 fs/xfs/xfs_refcount_item.h   |    5 +
 fs/xfs/xfs_trace.c           |    1 
 fs/xfs/xfs_trace.h           |  229 ++++++++++++++++++++----------------------
 6 files changed, 222 insertions(+), 281 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 13/15] xfs: reflink on the realtime device
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (11 preceding siblings ...)
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
@ 2023-12-31 19:37 ` Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 01/44] xfs: prepare refcount btree cursor tracepoints for realtime Darrick J. Wong
                     ` (43 more replies)
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                   ` (31 subsequent siblings)
  44 siblings, 44 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

This patchset enables use of the file data block sharing feature (i.e.
reflink) on the realtime device.  It follows the same basic sequence as
the realtime rmap series -- first a few cleanups; then widening of the
API parameters; and introduction of the new btree format and inode fork
format.  Next comes enabling CoW and remapping for the rt device; new
scrub, repair, and health reporting code; and at the end we implement
some code to lengthen write requests so that rt extents are always CoWed
fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-reflink
---
 fs/xfs/Makefile                      |    3 
 fs/xfs/libxfs/xfs_bmap.c             |   31 +
 fs/xfs/libxfs/xfs_btree.c            |    5 
 fs/xfs/libxfs/xfs_btree.h            |   12 -
 fs/xfs/libxfs/xfs_defer.h            |    1 
 fs/xfs/libxfs/xfs_format.h           |   25 +
 fs/xfs/libxfs/xfs_fs.h               |    6 
 fs/xfs/libxfs/xfs_fs_staging.h       |    1 
 fs/xfs/libxfs/xfs_health.h           |    4 
 fs/xfs/libxfs/xfs_inode_buf.c        |   36 +-
 fs/xfs/libxfs/xfs_inode_fork.c       |   13 +
 fs/xfs/libxfs/xfs_log_format.h       |    6 
 fs/xfs/libxfs/xfs_log_recover.h      |    2 
 fs/xfs/libxfs/xfs_ondisk.h           |    2 
 fs/xfs/libxfs/xfs_refcount.c         |  342 ++++++++++++--
 fs/xfs/libxfs/xfs_refcount.h         |   29 +
 fs/xfs/libxfs/xfs_rmap.c             |   14 +
 fs/xfs/libxfs/xfs_rtgroup.c          |   10 
 fs/xfs/libxfs/xfs_rtgroup.h          |    8 
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |  805 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |  196 ++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.c     |   28 +
 fs/xfs/libxfs/xfs_sb.c               |    8 
 fs/xfs/libxfs/xfs_shared.h           |    2 
 fs/xfs/libxfs/xfs_trans_resv.c       |   25 +
 fs/xfs/libxfs/xfs_types.h            |    5 
 fs/xfs/scrub/bmap.c                  |   31 +
 fs/xfs/scrub/bmap_repair.c           |   26 +
 fs/xfs/scrub/common.c                |   40 +-
 fs/xfs/scrub/common.h                |    5 
 fs/xfs/scrub/cow_repair.c            |  212 ++++++++-
 fs/xfs/scrub/health.c                |    1 
 fs/xfs/scrub/inode.c                 |   32 +
 fs/xfs/scrub/inode_repair.c          |   68 +++
 fs/xfs/scrub/metapath.c              |   14 +
 fs/xfs/scrub/quota.c                 |    8 
 fs/xfs/scrub/quota_repair.c          |    2 
 fs/xfs/scrub/reap.c                  |  227 ++++++++++
 fs/xfs/scrub/reap.h                  |    7 
 fs/xfs/scrub/refcount.c              |    2 
 fs/xfs/scrub/refcount_repair.c       |    4 
 fs/xfs/scrub/repair.c                |   27 +
 fs/xfs/scrub/repair.h                |    9 
 fs/xfs/scrub/rgb_bitmap.h            |   37 ++
 fs/xfs/scrub/rmap_repair.c           |   36 ++
 fs/xfs/scrub/rtb_bitmap.h            |   37 ++
 fs/xfs/scrub/rtbitmap.c              |    2 
 fs/xfs/scrub/rtbitmap_repair.c       |   24 +
 fs/xfs/scrub/rtrefcount.c            |  671 ++++++++++++++++++++++++++++
 fs/xfs/scrub/rtrefcount_repair.c     |  781 +++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtrmap.c                |   54 ++
 fs/xfs/scrub/rtrmap_repair.c         |  105 ++++
 fs/xfs/scrub/scrub.c                 |    7 
 fs/xfs/scrub/scrub.h                 |   12 +
 fs/xfs/scrub/stats.c                 |    1 
 fs/xfs/scrub/trace.h                 |  108 +++++
 fs/xfs/xfs_bmap_util.c               |   66 ++-
 fs/xfs/xfs_buf_item_recover.c        |    4 
 fs/xfs/xfs_fsmap.c                   |   22 +
 fs/xfs/xfs_fsops.c                   |    2 
 fs/xfs/xfs_health.c                  |    4 
 fs/xfs/xfs_inode.c                   |   13 +
 fs/xfs/xfs_inode_item.c              |   16 +
 fs/xfs/xfs_inode_item_recover.c      |    5 
 fs/xfs/xfs_ioctl.c                   |   21 +
 fs/xfs/xfs_log_recover.c             |    2 
 fs/xfs/xfs_mount.c                   |    7 
 fs/xfs/xfs_mount.h                   |    9 
 fs/xfs/xfs_quota.h                   |    6 
 fs/xfs/xfs_refcount_item.c           |  260 ++++++++++-
 fs/xfs/xfs_reflink.c                 |  231 ++++++++--
 fs/xfs/xfs_rtalloc.c                 |  141 ++++++
 fs/xfs/xfs_super.c                   |   19 +
 fs/xfs/xfs_trace.c                   |    9 
 fs/xfs/xfs_trace.h                   |  116 +++--
 fs/xfs/xfs_trans_dquot.c             |   11 
 76 files changed, 4843 insertions(+), 330 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.h
 create mode 100644 fs/xfs/scrub/rgb_bitmap.h
 create mode 100644 fs/xfs/scrub/rtb_bitmap.h
 create mode 100644 fs/xfs/scrub/rtrefcount.c
 create mode 100644 fs/xfs/scrub/rtrefcount_repair.c


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (12 preceding siblings ...)
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
@ 2023-12-31 19:38 ` Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function Darrick J. Wong
                     ` (8 more replies)
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
                   ` (30 subsequent siblings)
  44 siblings, 9 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:38 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

Now that we've landed support for reflink on the realtime device for
cases where the rt extent size is the same as the fs block size, enhance
the reflink code further to support cases where the rt extent size is a
power-of-two multiple of the fs block size.  This enables us to do data
block sharing (for example) for much larger allocation units by dirtying
pagecache around shared extents and expanding writeback to write back
shared extents fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink-extsize

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink-extsize

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink-extsize
---
 fs/dax.c                      |    5 +
 fs/remap_range.c              |   30 +++--
 fs/xfs/libxfs/xfs_bmap.c      |   22 ++++
 fs/xfs/libxfs/xfs_inode_buf.c |   20 +---
 fs/xfs/xfs_aops.c             |   57 +++++++++-
 fs/xfs/xfs_file.c             |  224 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_file.h             |    3 +
 fs/xfs/xfs_inode.h            |    6 +
 fs/xfs/xfs_iops.c             |   22 ++++
 fs/xfs/xfs_reflink.c          |  223 ++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_rtalloc.c          |    3 -
 fs/xfs/xfs_super.c            |   17 ++-
 fs/xfs/xfs_trace.h            |   39 +++++++
 include/linux/fs.h            |    3 -
 14 files changed, 627 insertions(+), 47 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (13 preceding siblings ...)
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
@ 2023-12-31 19:38 ` Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 1/6] xfs: attach dquots to rt metadata files when starting quota Darrick J. Wong
                     ` (5 more replies)
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                   ` (29 subsequent siblings)
  44 siblings, 6 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:38 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

Hi all,

At some point, I realized that I've refactored enough of the quota code
in XFS that I should evaluate whether or not quota actually works on
realtime volumes.  It turns out that with two exceptions, it actually
does seem to work properly!  There are three broken pieces that I've
found so far: chown doesn't work, the quota accounting goes wrong when
the rt bitmap changes size, and the VFS quota ioctls don't report the
realtime warning counts or limits.  Hence this series fixes two things
in XFS and re-enables rt quota after a break of a couple decades.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-quotas

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-quotas

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-quotas
---
 fs/xfs/xfs_mount.c   |    4 ++-
 fs/xfs/xfs_qm.c      |   76 +++++++++++++++++++++++++++++++-------------------
 fs/xfs/xfs_qm_bhv.c  |   20 +++++++++----
 fs/xfs/xfs_quota.h   |    4 +--
 fs/xfs/xfs_rtalloc.c |   46 ++++++++++++++++++++----------
 fs/xfs/xfs_rtalloc.h |    3 ++
 fs/xfs/xfs_stats.c   |   12 ++++++--
 fs/xfs/xfs_super.c   |   11 +++----
 fs/xfs/xfs_trans.c   |   31 +++++++++++++++++++-
 9 files changed, 141 insertions(+), 66 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (14 preceding siblings ...)
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
@ 2023-12-31 19:51 ` Darrick J. Wong
  2023-12-31 23:22   ` [PATCH 01/28] xfs: hoist extent size helpers " Darrick J. Wong
                     ` (27 more replies)
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                   ` (28 subsequent siblings)
  44 siblings, 28 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:51 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Add libxfs code from the kernel from the inode refactoring, then fix up
xfs_repair and mkfs to use library functions instead of open-coding
inode (re)creation.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=inode-refactor

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=inode-refactor
---
 configure.ac             |    1 
 db/iunlink.c             |  128 +-------
 db/namei.c               |   23 -
 include/builddefs.in     |    1 
 include/libxfs.h         |    1 
 include/xfs_inode.h      |   82 ++++-
 include/xfs_mount.h      |    1 
 include/xfs_trace.h      |    7 
 libxfs/Makefile          |    8 +
 libxfs/inode.c           |  280 ++++++++++++++++++
 libxfs/iunlink.c         |  163 ++++++++++
 libxfs/iunlink.h         |   24 ++
 libxfs/libxfs_api_defs.h |    9 +
 libxfs/libxfs_priv.h     |   52 +++
 libxfs/rdwr.c            |   95 ------
 libxfs/util.c            |  333 +--------------------
 libxfs/xfs_bmap.c        |   42 +++
 libxfs/xfs_bmap.h        |    3 
 libxfs/xfs_dir2.c        |  637 +++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_dir2.h        |   48 +++
 libxfs/xfs_ialloc.c      |   15 +
 libxfs/xfs_inode_util.c  |  720 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h  |   71 +++++
 libxfs/xfs_shared.h      |    7 
 libxfs/xfs_trans_inode.c |    2 
 m4/package_libcdev.m4    |   15 +
 mkfs/proto.c             |  100 +++++-
 repair/phase6.c          |  224 ++++----------
 28 files changed, 2347 insertions(+), 745 deletions(-)
 create mode 100644 libxfs/inode.c
 create mode 100644 libxfs/iunlink.c
 create mode 100644 libxfs/iunlink.h
 create mode 100644 libxfs/xfs_inode_util.c
 create mode 100644 libxfs/xfs_inode_util.h


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (15 preceding siblings ...)
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
@ 2023-12-31 19:52 ` Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 01/58] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
                     ` (57 more replies)
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                   ` (27 subsequent siblings)
  44 siblings, 58 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Add libxfs code from the kernel, then teach xfs_repair and mkfs to
use the metadir functions to find metadata inodes.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=metadir

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=metadir

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=metadir

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=metadir
---
 db/check.c                          |   25 +
 db/inode.c                          |    3 
 db/iunlink.c                        |    2 
 db/metadump.c                       |  114 +++-
 db/namei.c                          |   71 ++
 db/sb.c                             |   47 +-
 include/kmem.h                      |    5 
 include/libxfs.h                    |    2 
 include/xfs_inode.h                 |   13 
 include/xfs_mount.h                 |    3 
 include/xfs_trace.h                 |   16 +
 include/xfs_trans.h                 |    3 
 io/bulkstat.c                       |   16 -
 io/scrub.c                          |   62 ++
 libfrog/bulkstat.c                  |    3 
 libfrog/fsgeom.c                    |    4 
 libfrog/scrub.c                     |   34 +
 libfrog/scrub.h                     |    2 
 libxfs/Makefile                     |    4 
 libxfs/imeta_utils.c                |  320 ++++++++++
 libxfs/imeta_utils.h                |   29 +
 libxfs/init.c                       |   47 +-
 libxfs/inode.c                      |  132 ++++
 libxfs/libxfs_api_defs.h            |   27 +
 libxfs/libxfs_priv.h                |    4 
 libxfs/trans.c                      |   33 +
 libxfs/xfs_attr.c                   |    5 
 libxfs/xfs_bmap.c                   |    5 
 libxfs/xfs_format.h                 |   72 ++
 libxfs/xfs_fs.h                     |   29 +
 libxfs/xfs_health.h                 |    6 
 libxfs/xfs_ialloc.c                 |   56 +-
 libxfs/xfs_ialloc.h                 |    2 
 libxfs/xfs_imeta.c                  | 1076 +++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h                  |  110 ++++
 libxfs/xfs_inode_buf.c              |   73 ++
 libxfs/xfs_inode_buf.h              |    3 
 libxfs/xfs_inode_util.c             |    5 
 libxfs/xfs_ondisk.h                 |    1 
 libxfs/xfs_sb.c                     |   35 +
 libxfs/xfs_trans_resv.c             |  106 +++
 libxfs/xfs_trans_resv.h             |    3 
 libxfs/xfs_types.c                  |    7 
 man/man2/ioctl_xfs_fsgeometry.2     |    3 
 man/man2/ioctl_xfs_scrub_metadata.2 |   38 +
 man/man8/mkfs.xfs.8.in              |   11 
 man/man8/xfs_admin.8                |    8 
 man/man8/xfs_db.8                   |   23 +
 man/man8/xfs_io.8                   |   13 
 man/man8/xfs_protofile.8            |   33 +
 mdrestore/xfs_mdrestore.c           |    6 
 mkfs/Makefile                       |   10 
 mkfs/lts_4.19.conf                  |    1 
 mkfs/lts_5.10.conf                  |    1 
 mkfs/lts_5.15.conf                  |    1 
 mkfs/proto.c                        |  227 +++++--
 mkfs/xfs_mkfs.c                     |   26 +
 mkfs/xfs_protofile.in               |  152 +++++
 repair/agheader.c                   |   19 -
 repair/dino_chunks.c                |   58 ++
 repair/dinode.c                     |  173 +++++-
 repair/dinode.h                     |    6 
 repair/dir2.c                       |  123 +++-
 repair/globals.c                    |    4 
 repair/globals.h                    |    4 
 repair/incore.h                     |   50 +-
 repair/incore_ino.c                 |    1 
 repair/phase1.c                     |    2 
 repair/phase2.c                     |   76 ++
 repair/phase4.c                     |   21 +
 repair/phase5.c                     |    7 
 repair/phase6.c                     |  910 ++++++++++++++++++++++++------
 repair/pptr.c                       |   51 ++
 repair/pptr.h                       |    2 
 repair/protos.h                     |    6 
 repair/quotacheck.c                 |    5 
 repair/sb.c                         |    3 
 repair/scan.c                       |   43 +
 repair/scan.h                       |    7 
 repair/xfs_repair.c                 |   98 +++
 scrub/inodes.c                      |   11 
 scrub/inodes.h                      |    5 
 scrub/phase3.c                      |    7 
 scrub/phase5.c                      |  102 +++
 scrub/phase6.c                      |    2 
 scrub/scrub.c                       |   18 +
 scrub/scrub.h                       |    7 
 spaceman/health.c                   |    2 
 88 files changed, 4531 insertions(+), 460 deletions(-)
 create mode 100644 libxfs/imeta_utils.c
 create mode 100644 libxfs/imeta_utils.h
 create mode 100644 libxfs/xfs_imeta.c
 create mode 100644 libxfs/xfs_imeta.h
 create mode 100644 man/man8/xfs_protofile.8
 create mode 100644 mkfs/xfs_protofile.in


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (16 preceding siblings ...)
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
@ 2023-12-31 19:52 ` Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 1/8] xfs_db: support passing the realtime device to the debugger Darrick J. Wong
                     ` (7 more replies)
  2023-12-31 19:52 ` [PATCHSET v2.0 04/17] xfs_metadump: support external devices Darrick J. Wong
                   ` (26 subsequent siblings)
  44 siblings, 8 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Before we start modernizing the realtime device, let's first make a few
improvements to the XFS debugger to make our lives easier.  First up is
making it so that users can point the debugger at the block device
containing the realtime section, and augmenting the io cursor code to be
able to read blocks from the rt device.  Next, we add a new geometry
conversion command (rtconvert) to make it easier to go back and forth
between rt blocks, rt extents, and the corresponding locations within
the rt bitmap and summary files.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=debug-realtime-geometry
---
 db/block.c        |  167 +++++++++++++++++++++-
 db/convert.c      |  409 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 db/faddr.c        |    4 -
 db/init.c         |    7 +
 db/io.c           |   39 ++++-
 db/io.h           |    3 
 db/xfs_admin.sh   |    4 -
 man/man8/xfs_db.8 |  129 +++++++++++++++++
 8 files changed, 722 insertions(+), 40 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 04/17] xfs_metadump: support external devices
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (17 preceding siblings ...)
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
@ 2023-12-31 19:52 ` Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 1/1] xfs_db: allow setting current address to log blocks Darrick J. Wong
  2023-12-31 19:52 ` [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking Darrick J. Wong
                   ` (25 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

This series augments the xfs_metadump and xfs_mdrestore utilities to
capture the contents of an external log in a metadump, and restore it on
the other end.  This will enable better debugging analysis of broken
filesystems, since it will now be possible to capture external log data.
This is a prequisite for the rt groups feature, since we'll also need to
capture the rt superblocks written to the rt device.

This also means we can capture the contents of external logs for better
analysis by support staff.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=metadump-external-devices

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=metadump-external-devices
---
 db/block.c        |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 man/man8/xfs_db.8 |   17 +++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (18 preceding siblings ...)
  2023-12-31 19:52 ` [PATCHSET v2.0 04/17] xfs_metadump: support external devices Darrick J. Wong
@ 2023-12-31 19:52 ` Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 1/2] xfs: refactor realtime " Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 2/2] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                   ` (24 subsequent siblings)
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Replace all the open-coded locking of realtime metadata inodes with a
single rtlock function that can lock all the pieces that the caller
wants in a single call.  This will be important for maintaining correct
locking order later when we start adding more realtime metadata inodes.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=refactor-rt-locking

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=refactor-rt-locking
---
 libxfs/libxfs_priv.h  |    1 +
 libxfs/xfs_bmap.c     |    7 ++----
 libxfs/xfs_rtbitmap.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtbitmap.h |   17 +++++++++++++++
 4 files changed, 77 insertions(+), 5 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (19 preceding siblings ...)
  2023-12-31 19:52 ` [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking Darrick J. Wong
@ 2023-12-31 19:53 ` Darrick J. Wong
  2023-12-27 13:00   ` [PATCH 48/52] xfs_scrub: trim realtime volumes too Darrick J. Wong
                     ` (51 more replies)
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
                   ` (23 subsequent siblings)
  44 siblings, 52 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Right now, the realtime section uses a single pair of metadata inodes to
store the free space information.  This presents a scalability problem
since every thread trying to allocate or free rt extents have to lock
these files.  It would be very useful if we could begin to tackle these
problems by sharding the realtime section, so create the notion of
realtime groups, which are similar to allocation groups on the data
section.

While we're at it, define a superblock to be stamped into the start of
each rt section.  This enables utilities such as blkid to identify block
devices containing realtime sections, and helpfully avoids the situation
where a file extent can cross an rtgroup boundary.

The best advantage for rtgroups will become evident later when we get to
adding rmap and reflink to the realtime volume, since the geometry
constraints are the same for rt groups and AGs.  Hence we can reuse all
that code directly.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-groups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-groups

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-groups

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-groups
---
 db/Makefile                           |    2 
 db/bit.c                              |   24 +
 db/bit.h                              |    1 
 db/check.c                            |   65 +++-
 db/command.c                          |    2 
 db/convert.c                          |   46 ++-
 db/field.c                            |   18 +
 db/field.h                            |   11 +
 db/fprint.c                           |   11 +
 db/inode.c                            |    9 -
 db/metadump.c                         |   61 +++
 db/rtgroup.c                          |  169 ++++++++++
 db/rtgroup.h                          |   21 +
 db/sb.c                               |  136 +++++++-
 db/type.c                             |   16 +
 db/type.h                             |   32 ++
 db/xfs_metadump.sh                    |    5 
 include/libxfs.h                      |    2 
 include/xfs_arch.h                    |    6 
 include/xfs_metadump.h                |    8 
 include/xfs_mount.h                   |   13 +
 include/xfs_trace.h                   |    7 
 include/xfs_trans.h                   |    1 
 io/Makefile                           |    2 
 io/aginfo.c                           |  215 ++++++++++++
 io/bmap.c                             |   30 +-
 io/fsmap.c                            |   23 +
 io/init.c                             |    1 
 io/io.h                               |    1 
 io/scrub.c                            |   40 ++
 libfrog/div64.h                       |    6 
 libfrog/fsgeom.c                      |   24 +
 libfrog/fsgeom.h                      |    4 
 libfrog/scrub.c                       |   10 +
 libfrog/scrub.h                       |    1 
 libfrog/util.c                        |   26 +
 libfrog/util.h                        |    3 
 libxfs/Makefile                       |    2 
 libxfs/defer_item.c                   |   17 +
 libxfs/init.c                         |   68 ++++
 libxfs/libxfs_api_defs.h              |    6 
 libxfs/libxfs_io.h                    |    1 
 libxfs/libxfs_priv.h                  |   25 +
 libxfs/rdwr.c                         |   17 +
 libxfs/topology.c                     |   42 ++
 libxfs/topology.h                     |    3 
 libxfs/trans.c                        |   29 ++
 libxfs/util.c                         |    7 
 libxfs/xfs_bmap.h                     |    5 
 libxfs/xfs_format.h                   |   96 +++++
 libxfs/xfs_fs.h                       |    9 -
 libxfs/xfs_fs_staging.h               |   17 +
 libxfs/xfs_health.h                   |   30 ++
 libxfs/xfs_ondisk.h                   |    2 
 libxfs/xfs_rtbitmap.c                 |  141 +++++++-
 libxfs/xfs_rtbitmap.h                 |   68 ++++
 libxfs/xfs_rtgroup.c                  |  582 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h                  |  252 ++++++++++++++
 libxfs/xfs_sb.c                       |  126 +++++++
 libxfs/xfs_shared.h                   |    4 
 libxfs/xfs_types.c                    |   46 ++-
 libxfs/xfs_types.h                    |    4 
 man/man2/ioctl_xfs_rtgroup_geometry.2 |   99 ++++++
 man/man2/ioctl_xfs_scrub_metadata.2   |   21 +
 man/man8/mkfs.xfs.8.in                |   44 ++
 man/man8/xfs_admin.8                  |    9 +
 man/man8/xfs_db.8                     |   17 +
 man/man8/xfs_io.8                     |   29 ++
 man/man8/xfs_mdrestore.8              |   10 +
 man/man8/xfs_metadump.8               |   11 +
 man/man8/xfs_spaceman.8               |    5 
 mdrestore/xfs_mdrestore.c             |   53 ++-
 mkfs/proto.c                          |  107 ++++++
 mkfs/xfs_mkfs.c                       |  281 ++++++++++++++++
 repair/agheader.c                     |    2 
 repair/globals.c                      |    1 
 repair/globals.h                      |    1 
 repair/incore.c                       |   22 +
 repair/phase2.c                       |   42 ++
 repair/phase3.c                       |    3 
 repair/phase6.c                       |   37 ++
 repair/rt.c                           |  186 ++++++++++-
 repair/rt.h                           |    3 
 repair/sb.c                           |   49 +++
 repair/xfs_repair.c                   |   22 +
 scrub/phase2.c                        |  100 ++++++
 scrub/phase7.c                        |    7 
 scrub/phase8.c                        |   36 ++
 scrub/repair.c                        |    1 
 scrub/scrub.c                         |    3 
 scrub/scrub.h                         |    9 +
 scrub/spacemap.c                      |   74 ++++
 scrub/xfs_scrub.c                     |    2 
 scrub/xfs_scrub.h                     |    1 
 spaceman/health.c                     |   59 +++
 95 files changed, 3840 insertions(+), 157 deletions(-)
 create mode 100644 db/rtgroup.c
 create mode 100644 db/rtgroup.h
 create mode 100644 io/aginfo.c
 create mode 100644 libxfs/xfs_rtgroup.c
 create mode 100644 libxfs/xfs_rtgroup.h
 create mode 100644 man/man2/ioctl_xfs_rtgroup_geometry.2


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (20 preceding siblings ...)
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-31 19:53 ` Darrick J. Wong
  2023-12-27 13:01   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
                     ` (13 more replies)
  2023-12-31 19:53 ` [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata Darrick J. Wong
                   ` (22 subsequent siblings)
  44 siblings, 14 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

This series prepares the btree code to support realtime reverse mapping
btrees by refactoring xfs_ifork_realloc to be fed a per-btree ops
structure so that it can handle multiple types of inode-rooted btrees.
It moves on to refactoring the btree code to use the new realloc
routines and to support storing btree rcords in the inode root, because
the current bmbt code does not support this.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=btree-ifork-records

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=btree-ifork-records
---
 db/bmap.c                   |   10 +
 db/bmap_inflate.c           |    2 
 db/bmroot.c                 |    8 -
 db/btheight.c               |   18 --
 db/check.c                  |    8 -
 db/frag.c                   |    8 -
 db/metadump.c               |   16 +-
 libxfs/xfs_alloc_btree.c    |    6 -
 libxfs/xfs_alloc_btree.h    |    3 
 libxfs/xfs_attr_leaf.c      |    8 -
 libxfs/xfs_bmap.c           |   59 +++----
 libxfs/xfs_bmap_btree.c     |   93 +++++++++-
 libxfs/xfs_bmap_btree.h     |  219 +++++++++++++++++--------
 libxfs/xfs_btree.c          |  382 +++++++++++++++++++++++++++++++++----------
 libxfs/xfs_btree.h          |    4 
 libxfs/xfs_btree_staging.c  |    4 
 libxfs/xfs_ialloc.c         |    4 
 libxfs/xfs_ialloc_btree.c   |    6 -
 libxfs/xfs_ialloc_btree.h   |    3 
 libxfs/xfs_inode_fork.c     |  163 ++++++++----------
 libxfs/xfs_inode_fork.h     |   27 +++
 libxfs/xfs_refcount_btree.c |    5 -
 libxfs/xfs_refcount_btree.h |    3 
 libxfs/xfs_rmap_btree.c     |    9 +
 libxfs/xfs_rmap_btree.h     |    3 
 libxfs/xfs_sb.c             |   16 +-
 libxfs/xfs_trans_resv.c     |    2 
 repair/bmap_repair.c        |    2 
 repair/dinode.c             |   10 +
 repair/phase5.c             |   16 +-
 repair/prefetch.c           |    8 -
 repair/scan.c               |    6 -
 32 files changed, 749 insertions(+), 382 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (21 preceding siblings ...)
  2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
@ 2023-12-31 19:53 ` Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
                   ` (21 subsequent siblings)
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

In preparation for adding reverse mapping and refcounting to the
realtime device, enhance the metadir code to reserve free space for
btree shape changes as delayed allocation blocks.

This effectively allows us to pre-allocate space for the rmap and
refcount btrees in the same manner as we do for the data device
counterparts, which is how we avoid ENOSPC failures when space is low
but we've already committed to a COW operation.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=reserve-rt-metadata-space

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=reserve-rt-metadata-space
---
 include/xfs_inode.h   |    5 +
 include/xfs_mount.h   |    1 
 include/xfs_trace.h   |    8 ++
 io/inject.c           |    1 
 libxfs/init.c         |   11 +++
 libxfs/libxfs_priv.h  |   11 +++
 libxfs/xfs_ag.c       |    4 -
 libxfs/xfs_ag_resv.c  |   25 ++----
 libxfs/xfs_ag_resv.h  |    2 -
 libxfs/xfs_errortag.h |    4 +
 libxfs/xfs_imeta.c    |  190 +++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h    |   11 +++
 libxfs/xfs_types.h    |    7 ++
 13 files changed, 255 insertions(+), 25 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (22 preceding siblings ...)
  2023-12-31 19:53 ` [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata Darrick J. Wong
@ 2023-12-31 19:54 ` Darrick J. Wong
  2023-12-27 13:05   ` [PATCH 1/8] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
                     ` (7 more replies)
  2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
                   ` (20 subsequent siblings)
  44 siblings, 8 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

This series cleans up some warts in the extent freeing log intent code.
We start by acknowledging that this mechanism does not have anything to
do with the bmap code by moving it to xfs_alloc.c and giving the
function a more descriptive name.  Then we clean up the tracepoints and
the _finish_one call paths to pass the intent structure around.  This
reduces the overhead when the tracepoints are disabled and will make
things much cleaner when we start adding realtime support in the next
patch.  I also incorporated a bunch of cleanups from Christoph Hellwig.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=extfree-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=extfree-intent-cleanups
---
 include/xfs_mount.h         |    3 +
 include/xfs_trace.h         |    5 +-
 libxfs/defer_item.c         |   91 +++++++++++++++++--------------------------
 libxfs/defer_item.h         |    6 +++
 libxfs/xfs_ag.c             |    2 -
 libxfs/xfs_alloc.c          |   92 +++++++++++--------------------------------
 libxfs/xfs_alloc.h          |   12 +++---
 libxfs/xfs_bmap.c           |   12 ++++--
 libxfs/xfs_bmap_btree.c     |    2 -
 libxfs/xfs_ialloc.c         |    5 +-
 libxfs/xfs_ialloc_btree.c   |    2 -
 libxfs/xfs_refcount.c       |    6 +--
 libxfs/xfs_refcount_btree.c |    2 -
 repair/bulkload.c           |    3 +
 14 files changed, 96 insertions(+), 147 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (23 preceding siblings ...)
  2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
@ 2023-12-31 19:54 ` Darrick J. Wong
  2023-12-27 13:07   ` [PATCH 1/3] xfs: support logging EFIs for realtime extents Darrick J. Wong
                     ` (2 more replies)
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
                   ` (19 subsequent siblings)
  44 siblings, 3 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

Realtime reverse mapping (and beyond that, realtime reflink) needs to be
able to defer file mapping and extent freeing work in much the same
manner as is required on the data volume.  Make the extent freeing log
items operate on rt extents in preparation for realtime rmap.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-extfree-intents

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-extfree-intents
---
 libxfs/defer_item.c      |   75 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_alloc.c       |   16 ++++++++--
 libxfs/xfs_alloc.h       |   17 +++++++++-
 libxfs/xfs_defer.c       |    6 ++++
 libxfs/xfs_defer.h       |    1 +
 libxfs/xfs_log_format.h  |    6 +++-
 libxfs/xfs_rtbitmap.c    |    4 ++
 logprint/log_misc.c      |    2 +
 logprint/log_print_all.c |    8 +++++
 logprint/log_redo.c      |   57 ++++++++++++++++++++++++++---------
 10 files changed, 172 insertions(+), 20 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (24 preceding siblings ...)
  2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
@ 2023-12-31 19:54 ` Darrick J. Wong
  2023-12-27 13:08   ` [PATCH 1/9] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
                     ` (8 more replies)
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
                   ` (18 subsequent siblings)
  44 siblings, 9 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

This series cleans up the rmap intent code before we start adding
support for realtime devices.  Similar to previous intent cleanup
patchsets, we start transforming the tracepoints so that the data
extraction are done inside the tracepoint code, and then we start
passing the intent itself to the _finish_one function.  This reduces the
boxing and unboxing of parameters.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=rmap-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=rmap-intent-cleanups
---
 libxfs/defer_item.c |   62 +++++++-----
 libxfs/defer_item.h |    4 +
 libxfs/xfs_btree.c  |    4 +
 libxfs/xfs_btree.h  |    2 
 libxfs/xfs_rmap.c   |  268 ++++++++++++++++++---------------------------------
 libxfs/xfs_rmap.h   |   15 ++-
 6 files changed, 150 insertions(+), 205 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (25 preceding siblings ...)
  2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
@ 2023-12-31 19:54 ` Darrick J. Wong
  2023-12-27 13:10   ` [PATCH 01/47] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
                     ` (46 more replies)
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
                   ` (17 subsequent siblings)
  44 siblings, 47 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: Christoph Hellwig, linux-xfs

Hi all,

Add libxfs code from the kernel, then teach the various utilities about
how to access realtime rmapbt information and rebuild it.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-rmap

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-rmap

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-rmap

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-rmap
---
 db/bmroot.c                           |  149 +++++
 db/bmroot.h                           |    2 
 db/btblock.c                          |  103 +++
 db/btblock.h                          |    5 
 db/btdump.c                           |   64 ++
 db/btheight.c                         |    5 
 db/check.c                            |  203 ++++++-
 db/field.c                            |   11 
 db/field.h                            |    5 
 db/fsmap.c                            |  164 +++++
 db/inode.c                            |  128 ++++
 db/inode.h                            |    4 
 db/metadump.c                         |  129 ++++
 db/type.c                             |    5 
 db/type.h                             |    1 
 include/libxfs.h                      |    2 
 include/xfs_mount.h                   |   21 +
 io/scrub.c                            |   41 +
 libfrog/scrub.c                       |   10 
 libxfs/Makefile                       |    2 
 libxfs/defer_item.c                   |  115 +++-
 libxfs/init.c                         |   20 -
 libxfs/libxfs_api_defs.h              |   19 +
 libxfs/rdwr.c                         |    2 
 libxfs/trans.c                        |    1 
 libxfs/xfbtree.c                      |    2 
 libxfs/xfs_bmap.c                     |   23 +
 libxfs/xfs_btree.c                    |   77 ++
 libxfs/xfs_btree.h                    |    7 
 libxfs/xfs_defer.h                    |    1 
 libxfs/xfs_format.h                   |   24 +
 libxfs/xfs_fs.h                       |    6 
 libxfs/xfs_fs_staging.h               |    1 
 libxfs/xfs_health.h                   |    4 
 libxfs/xfs_inode_buf.c                |   26 +
 libxfs/xfs_inode_fork.c               |   13 
 libxfs/xfs_log_format.h               |    6 
 libxfs/xfs_ondisk.h                   |    2 
 libxfs/xfs_refcount.c                 |    6 
 libxfs/xfs_rmap.c                     |  198 ++++++
 libxfs/xfs_rmap.h                     |   25 +
 libxfs/xfs_rtgroup.c                  |   12 
 libxfs/xfs_rtgroup.h                  |   20 +
 libxfs/xfs_rtrmap_btree.c             | 1028 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrmap_btree.h             |  217 +++++++
 libxfs/xfs_sb.c                       |    6 
 libxfs/xfs_shared.h                   |    2 
 libxfs/xfs_swapext.c                  |    4 
 libxfs/xfs_trans_resv.c               |   12 
 libxfs/xfs_trans_space.h              |   13 
 libxfs/xfs_types.h                    |    5 
 logprint/log_misc.c                   |    2 
 logprint/log_print_all.c              |    8 
 logprint/log_redo.c                   |   24 +
 man/man2/ioctl_xfs_rtgroup_geometry.2 |    3 
 man/man2/ioctl_xfs_scrub_metadata.2   |    8 
 man/man8/xfs_db.8                     |   63 ++
 man/man8/xfs_io.8                     |    3 
 mkfs/proto.c                          |   56 ++
 mkfs/xfs_mkfs.c                       |   90 +++
 repair/Makefile                       |    1 
 repair/agbtree.c                      |    5 
 repair/bmap_repair.c                  |  131 ++++
 repair/bulkload.c                     |   41 +
 repair/bulkload.h                     |    2 
 repair/dino_chunks.c                  |   13 
 repair/dinode.c                       |  381 ++++++++++--
 repair/dir2.c                         |    4 
 repair/globals.c                      |    6 
 repair/globals.h                      |    2 
 repair/incore.h                       |    1 
 repair/phase2.c                       |  100 +++
 repair/phase4.c                       |   12 
 repair/phase5.c                       |  116 ++++
 repair/phase6.c                       |  208 +++++++
 repair/rmap.c                         |  560 +++++++++++++++---
 repair/rmap.h                         |   17 -
 repair/rtrmap_repair.c                |  261 ++++++++
 repair/scan.c                         |  411 +++++++++++++
 repair/scan.h                         |   37 +
 repair/xfs_repair.c                   |    8 
 scrub/phase4.c                        |   54 ++
 scrub/phase5.c                        |   24 +
 scrub/repair.c                        |  159 +++++
 scrub/repair.h                        |    5 
 scrub/scrub.c                         |    3 
 scrub/scrub.h                         |    4 
 spaceman/health.c                     |   10 
 88 files changed, 5500 insertions(+), 284 deletions(-)
 create mode 100644 libxfs/xfs_rtrmap_btree.c
 create mode 100644 libxfs/xfs_rtrmap_btree.h
 create mode 100644 repair/rtrmap_repair.c


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (26 preceding siblings ...)
  2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-31 19:55 ` Darrick J. Wong
  2023-12-27 13:23   ` [PATCH 1/4] libxfs: resync libxfs_alloc_file_space interface with the kernel Darrick J. Wong
                     ` (3 more replies)
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
                   ` (16 subsequent siblings)
  44 siblings, 4 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Refactor the parts of mkfs and xfs_repair that open-code the process of
mapping disk space into files and writing data into them.  This will help
primarily with resetting of the realtime metadata, but is also used for
protofiles.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=bmap-utils
---
 include/libxfs.h |    6 +-
 libxfs/util.c    |  212 ++++++++++++++++++++++++++++++++++++++++++------------
 mkfs/proto.c     |  108 +++++-----------------------
 repair/phase6.c  |   76 +++----------------
 4 files changed, 197 insertions(+), 205 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (27 preceding siblings ...)
  2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
@ 2023-12-31 19:55 ` Darrick J. Wong
  2023-12-27 13:24   ` [PATCH 1/9] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
                     ` (8 more replies)
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
                   ` (15 subsequent siblings)
  44 siblings, 9 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

This series cleans up the refcount intent code before we start adding
support for realtime devices.  Similar to previous intent cleanup
patchsets, we start transforming the tracepoints so that the data
extraction are done inside the tracepoint code, and then we start
passing the intent itself to the _finish_one function.  This reduces the
boxing and unboxing of parameters.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=refcount-intent-cleanups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=refcount-intent-cleanups
---
 libxfs/defer_item.c   |   60 ++++++++++++--------
 libxfs/defer_item.h   |    5 ++
 libxfs/xfs_refcount.c |  150 +++++++++++++++----------------------------------
 libxfs/xfs_refcount.h |   11 ++--
 4 files changed, 93 insertions(+), 133 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (28 preceding siblings ...)
  2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
@ 2023-12-31 19:55 ` Darrick J. Wong
  2023-12-27 13:26   ` [PATCH 01/42] xfs: introduce realtime refcount btree definitions Darrick J. Wong
                     ` (41 more replies)
  2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
                   ` (14 subsequent siblings)
  44 siblings, 42 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

This patchset enables use of the file data block sharing feature (i.e.
reflink) on the realtime device.  It follows the same basic sequence as
the realtime rmap series -- first a few cleanups; then widening of the
API parameters; and introduction of the new btree format and inode fork
format.  Next comes enabling CoW and remapping for the rt device; new
scrub, repair, and health reporting code; and at the end we implement
some code to lengthen write requests so that rt extents are always CoWed
fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-reflink
---
 db/bmroot.c                           |  148 ++++++
 db/bmroot.h                           |    2 
 db/btblock.c                          |   53 ++
 db/btblock.h                          |    5 
 db/btdump.c                           |   11 
 db/btheight.c                         |    5 
 db/check.c                            |  326 ++++++++++++-
 db/field.c                            |   15 +
 db/field.h                            |    6 
 db/inode.c                            |   91 ++++
 db/inode.h                            |    2 
 db/metadump.c                         |  129 +++++
 db/type.c                             |    5 
 db/type.h                             |    1 
 include/libxfs.h                      |    1 
 include/xfs_mount.h                   |    9 
 libfrog/scrub.c                       |   10 
 libxfs/Makefile                       |    2 
 libxfs/defer_item.c                   |  101 ++++
 libxfs/init.c                         |   14 -
 libxfs/libxfs_api_defs.h              |   13 +
 libxfs/logitem.c                      |   14 +
 libxfs/xfs_bmap.c                     |   31 +
 libxfs/xfs_btree.c                    |    5 
 libxfs/xfs_btree.h                    |   12 
 libxfs/xfs_defer.h                    |    1 
 libxfs/xfs_format.h                   |   25 +
 libxfs/xfs_fs.h                       |    6 
 libxfs/xfs_fs_staging.h               |    1 
 libxfs/xfs_health.h                   |    4 
 libxfs/xfs_inode_buf.c                |   36 +
 libxfs/xfs_inode_fork.c               |   13 +
 libxfs/xfs_log_format.h               |    6 
 libxfs/xfs_ondisk.h                   |    2 
 libxfs/xfs_refcount.c                 |  341 ++++++++++++--
 libxfs/xfs_refcount.h                 |   29 +
 libxfs/xfs_rmap.c                     |   14 +
 libxfs/xfs_rtgroup.c                  |   10 
 libxfs/xfs_rtgroup.h                  |    8 
 libxfs/xfs_rtrefcount_btree.c         |  803 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtrefcount_btree.h         |  196 ++++++++
 libxfs/xfs_rtrmap_btree.c             |   28 +
 libxfs/xfs_sb.c                       |    8 
 libxfs/xfs_shared.h                   |    2 
 libxfs/xfs_trans_resv.c               |   25 +
 libxfs/xfs_types.h                    |    5 
 logprint/log_misc.c                   |    2 
 logprint/log_print_all.c              |    8 
 logprint/log_redo.c                   |   24 +
 man/man2/ioctl_xfs_rtgroup_geometry.2 |    3 
 man/man2/ioctl_xfs_scrub_metadata.2   |    9 
 man/man8/xfs_db.8                     |   49 ++
 mkfs/proto.c                          |   36 +
 mkfs/xfs_mkfs.c                       |   79 +++
 repair/Makefile                       |    1 
 repair/agbtree.c                      |    4 
 repair/dino_chunks.c                  |   11 
 repair/dinode.c                       |  267 ++++++++++-
 repair/dir2.c                         |    2 
 repair/incore.h                       |    1 
 repair/phase2.c                       |   83 +++
 repair/phase4.c                       |   30 +
 repair/phase5.c                       |    3 
 repair/phase6.c                       |  133 +++++
 repair/rmap.c                         |  353 ++++++++++++---
 repair/rmap.h                         |   19 +
 repair/rtrefcount_repair.c            |  256 +++++++++++
 repair/scan.c                         |  324 +++++++++++++
 repair/scan.h                         |   33 +
 scrub/repair.c                        |    1 
 spaceman/health.c                     |   10 
 71 files changed, 4041 insertions(+), 274 deletions(-)
 create mode 100644 libxfs/xfs_rtrefcount_btree.c
 create mode 100644 libxfs/xfs_rtrefcount_btree.h
 create mode 100644 repair/rtrefcount_repair.c


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (29 preceding siblings ...)
  2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
@ 2023-12-31 19:55 ` Darrick J. Wong
  2023-12-27 13:37   ` [PATCH 1/3] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
                     ` (2 more replies)
  2023-12-31 19:56 ` [PATCHSET v2.0 17/17] xfsprogs: enable quota for realtime voluems Darrick J. Wong
                   ` (13 subsequent siblings)
  44 siblings, 3 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

Now that we've landed support for reflink on the realtime device for
cases where the rt extent size is the same as the fs block size, enhance
the reflink code further to support cases where the rt extent size is a
power-of-two multiple of the fs block size.  This enables us to do data
block sharing (for example) for much larger allocation units by dirtying
pagecache around shared extents and expanding writeback to write back
shared extents fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink-extsize

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink-extsize

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink-extsize
---
 libxfs/init.c          |    7 -------
 libxfs/xfs_bmap.c      |   22 ++++++++++++++++++++++
 libxfs/xfs_inode_buf.c |   20 ++++++--------------
 mkfs/xfs_mkfs.c        |   37 -------------------------------------
 4 files changed, 28 insertions(+), 58 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 17/17] xfsprogs: enable quota for realtime voluems
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (30 preceding siblings ...)
  2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
@ 2023-12-31 19:56 ` Darrick J. Wong
  2023-12-27 13:38   ` [PATCH 1/1] xfs_quota: report warning limits for realtime space quotas Darrick J. Wong
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
                   ` (12 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 19:56 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

Hi all,

At some point, I realized that I've refactored enough of the quota code
in XFS that I should evaluate whether or not quota actually works on
realtime volumes.  It turns out that with two exceptions, it actually
does seem to work properly!  There are three broken pieces that I've
found so far: chown doesn't work, the quota accounting goes wrong when
the rt bitmap changes size, and the VFS quota ioctls don't report the
realtime warning counts or limits.  Hence this series fixes two things
in XFS and re-enables rt quota after a break of a couple decades.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-quotas

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-quotas

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-quotas
---
 include/xqm.h |    5 ++++-
 quota/state.c |    1 +
 2 files changed, 5 insertions(+), 1 deletion(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 1/9] fstests: test XFS metadata directories
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (31 preceding siblings ...)
  2023-12-31 19:56 ` [PATCHSET v2.0 17/17] xfsprogs: enable quota for realtime voluems Darrick J. Wong
@ 2023-12-31 20:00 ` Darrick J. Wong
  2023-12-27 13:50   ` [PATCH 01/11] xfs/122: fix metadirino Darrick J. Wong
                     ` (10 more replies)
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
                   ` (11 subsequent siblings)
  44 siblings, 11 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Adjust fstests as needed to support the XFS metadata directory feature,
and add some new tests for online fsck and fuzz testing of the ondisk
metadata.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=metadir

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=metadir

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=metadir

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=metadir
---
 common/filter      |    7 +
 common/repair      |    4 
 common/xfs         |   96 +++++++++++-
 tests/xfs/007      |   16 +-
 tests/xfs/030      |    1 
 tests/xfs/033      |    1 
 tests/xfs/050      |    5 +
 tests/xfs/122.out  |    1 
 tests/xfs/153      |    5 +
 tests/xfs/1546     |   37 ++++
 tests/xfs/1546.out |  316 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1547     |   37 ++++
 tests/xfs/1547.out |   36 ++++
 tests/xfs/1548     |   37 ++++
 tests/xfs/1548.out |  318 ++++++++++++++++++++++++++++++++++++++
 tests/xfs/1549     |   38 +++++
 tests/xfs/1549.out |  330 ++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1550     |   37 ++++
 tests/xfs/1550.out |  434 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1551     |   37 ++++
 tests/xfs/1551.out |   35 ++++
 tests/xfs/1552     |   37 ++++
 tests/xfs/1552.out |  343 +++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1553     |   38 +++++
 tests/xfs/1553.out |  370 ++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/178      |    1 
 tests/xfs/1856     |    2 
 tests/xfs/1874     |  132 ++++++++++++++++
 tests/xfs/1874.out |   16 ++
 tests/xfs/206      |    3 
 tests/xfs/299      |    1 
 tests/xfs/330      |    6 +
 tests/xfs/509      |   21 ++-
 tests/xfs/529      |    5 -
 tests/xfs/530      |    6 -
 tests/xfs/739      |    9 -
 tests/xfs/740      |    9 -
 tests/xfs/741      |    9 -
 tests/xfs/742      |    9 -
 tests/xfs/743      |    9 -
 tests/xfs/744      |    9 -
 tests/xfs/745      |    9 -
 tests/xfs/746      |    9 -
 43 files changed, 2803 insertions(+), 78 deletions(-)
 create mode 100755 tests/xfs/1546
 create mode 100644 tests/xfs/1546.out
 create mode 100755 tests/xfs/1547
 create mode 100644 tests/xfs/1547.out
 create mode 100755 tests/xfs/1548
 create mode 100644 tests/xfs/1548.out
 create mode 100755 tests/xfs/1549
 create mode 100644 tests/xfs/1549.out
 create mode 100755 tests/xfs/1550
 create mode 100644 tests/xfs/1550.out
 create mode 100755 tests/xfs/1551
 create mode 100644 tests/xfs/1551.out
 create mode 100755 tests/xfs/1552
 create mode 100644 tests/xfs/1552.out
 create mode 100755 tests/xfs/1553
 create mode 100644 tests/xfs/1553.out
 create mode 100755 tests/xfs/1874
 create mode 100644 tests/xfs/1874.out


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (32 preceding siblings ...)
  2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
@ 2023-12-31 20:00 ` Darrick J. Wong
  2023-12-27 13:53   ` [PATCH 01/17] common/populate: refactor caching of metadumps to a helper Darrick J. Wong
                     ` (16 more replies)
  2023-12-31 20:00 ` [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section Darrick J. Wong
                   ` (10 subsequent siblings)
  44 siblings, 17 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Right now, the realtime section uses a single pair of metadata inodes to
store the free space information.  This presents a scalability problem
since every thread trying to allocate or free rt extents have to lock
these files.  It would be very useful if we could begin to tackle these
problems by sharding the realtime section, so create the notion of
realtime groups, which are similar to allocation groups on the data
section.

While we're at it, define a superblock to be stamped into the start of
each rt section.  This enables utilities such as blkid to identify block
devices containing realtime sections, and helpfully avoids the situation
where a file extent can cross an rtgroup boundary.

The best advantage for rtgroups will become evident later when we get to
adding rmap and reflink to the realtime volume, since the geometry
constraints are the same for rt groups and AGs.  Hence we can reuse all
that code directly.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-groups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-groups

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-groups

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-groups
---
 common/ext4             |   17 +++++
 common/fuzzy            |   38 ++++++++++--
 common/populate         |   78 +++++++++++++-----------
 common/xfs              |  150 ++++++++++++++++++++++++++++++++++++++++++++---
 src/punch-alternating.c |   28 ++++++++-
 tests/xfs/114           |    4 +
 tests/xfs/122           |    2 -
 tests/xfs/122.out       |    8 +++
 tests/xfs/146           |    2 -
 tests/xfs/185           |    2 -
 tests/xfs/187           |    3 +
 tests/xfs/206           |    3 +
 tests/xfs/271           |    3 +
 tests/xfs/341           |    4 +
 tests/xfs/449           |    6 ++
 tests/xfs/556           |   16 +++--
 tests/xfs/581           |    2 -
 tests/xfs/720           |    2 -
 tests/xfs/795           |    2 -
 19 files changed, 298 insertions(+), 72 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (33 preceding siblings ...)
  2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-31 20:00 ` Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 1/2] xfs: refactor statfs field extraction Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 2/2] common/xfs: FITRIM now supports realtime volumes Darrick J. Wong
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
                   ` (9 subsequent siblings)
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

One thing that's been missing for a long time is the ability to tell
underlying storage that it can unmap the unused space on the realtime
device.  This short series exposes this functionality through FITRIM.
Callers that want ranged FITRIM should be aware that the realtime space
exists in the offset range after the data device.  However, it is
anticipated that most callers pass in offset=0 len=-1ULL and will not
notice or care.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-discard

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-discard
---
 common/xfs    |   46 ++++++++++++++++++++++++++++++++++++++++++++--
 tests/xfs/176 |    4 ++--
 tests/xfs/187 |    6 +++---
 tests/xfs/541 |    6 ++----
 4 files changed, 51 insertions(+), 11 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (34 preceding siblings ...)
  2023-12-31 20:00 ` [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section Darrick J. Wong
@ 2023-12-31 20:00 ` Darrick J. Wong
  2023-12-27 13:57   ` [PATCH 01/13] xfs: fix tests that try to access the realtime rmap inode Darrick J. Wong
                     ` (12 more replies)
  2023-12-31 20:01 ` [PATCHSET v2.0 5/9] fstests: establish baseline for realtime rmap fuzz tests Darrick J. Wong
                   ` (8 subsequent siblings)
  44 siblings, 13 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:00 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Fix a few regressions in fstests when rmap is enabled on the realtime
volume.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-rmap

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-rmap

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-rmap

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-rmap
---
 common/populate    |   36 +++++++++++++++++++++++++
 common/xfs         |   74 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/104      |    1 +
 tests/xfs/122.out  |    5 +---
 tests/xfs/1528     |   41 +++++++++++++++++++++++++++++
 tests/xfs/1528.out |    4 +++
 tests/xfs/1529     |   40 ++++++++++++++++++++++++++++
 tests/xfs/1529.out |    4 +++
 tests/xfs/1817     |   42 ++++++++++++++++++++++++++++++
 tests/xfs/1817.out |    2 +
 tests/xfs/1821     |   42 ++++++++++++++++++++++++++++++
 tests/xfs/1821.out |    2 +
 tests/xfs/1856     |   39 +++++++++++++++++++++++++++
 tests/xfs/1857     |   42 ++++++++++++++++++++++++++++++
 tests/xfs/1857.out |    2 +
 tests/xfs/272      |    2 +
 tests/xfs/276      |    2 +
 tests/xfs/277      |    2 +
 tests/xfs/291      |    3 +-
 tests/xfs/332      |    6 +---
 tests/xfs/332.out  |    2 -
 tests/xfs/333      |   45 --------------------------------
 tests/xfs/333.out  |    6 ----
 tests/xfs/337      |    2 +
 tests/xfs/338      |   30 ++++++++++++++++++---
 tests/xfs/339      |    5 ++--
 tests/xfs/340      |   25 ++++++++++++++----
 tests/xfs/341      |   12 +++-----
 tests/xfs/341.out  |    1 -
 tests/xfs/342      |    4 +--
 tests/xfs/343      |    2 +
 tests/xfs/406      |    6 +++-
 tests/xfs/407      |    6 +++-
 tests/xfs/408      |    7 ++++-
 tests/xfs/409      |    7 ++++-
 tests/xfs/443      |    9 ++++--
 tests/xfs/481      |    6 +++-
 tests/xfs/482      |    7 ++++-
 38 files changed, 466 insertions(+), 107 deletions(-)
 create mode 100755 tests/xfs/1528
 create mode 100644 tests/xfs/1528.out
 create mode 100755 tests/xfs/1529
 create mode 100644 tests/xfs/1529.out
 create mode 100755 tests/xfs/1817
 create mode 100644 tests/xfs/1817.out
 create mode 100755 tests/xfs/1821
 create mode 100644 tests/xfs/1821.out
 create mode 100755 tests/xfs/1857
 create mode 100644 tests/xfs/1857.out
 delete mode 100755 tests/xfs/333
 delete mode 100644 tests/xfs/333.out


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 5/9] fstests: establish baseline for realtime rmap fuzz tests
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (35 preceding siblings ...)
  2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
@ 2023-12-31 20:01 ` Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 1/1] fuzzy: create known output for rt rmap btree " Darrick J. Wong
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
                   ` (7 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Establish a baseline golden output for the realtime rmap fuzz tests.
This shouldn't be merged upstream because the output is very dependent
on the geometry of the filesystem that is created.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-rmap-baseline
---
 tests/xfs/1528.out |   22 ++++++++
 tests/xfs/1529.out |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/406.out  |   22 ++++++++
 tests/xfs/408.out  |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/409.out  |   15 ++++++
 tests/xfs/481.out  |   22 ++++++++
 tests/xfs/482.out  |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 495 insertions(+)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 6/9] fstests: reflink on the realtime device
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (36 preceding siblings ...)
  2023-12-31 20:01 ` [PATCHSET v2.0 5/9] fstests: establish baseline for realtime rmap fuzz tests Darrick J. Wong
@ 2023-12-31 20:01 ` Darrick J. Wong
  2023-12-27 14:01   ` [PATCH 1/9] xfs/122: update fields for realtime reflink Darrick J. Wong
                     ` (8 more replies)
  2023-12-31 20:01 ` [PATCHSET v2.0 7/9] fstests: establish baseline for realtime reflink fuzz tests Darrick J. Wong
                   ` (6 subsequent siblings)
  44 siblings, 9 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

This patchset enables use of the file data block sharing feature (i.e.
reflink) on the realtime device.  It follows the same basic sequence as
the realtime rmap series -- first a few cleanups; then widening of the
API parameters; and introduction of the new btree format and inode fork
format.  Next comes enabling CoW and remapping for the rt device; new
scrub, repair, and health reporting code; and at the end we implement
some code to lengthen write requests so that rt extents are always CoWed
fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-reflink
---
 common/populate       |   26 ++++++++++++++++++---
 common/xfs            |   15 ++++++++++++
 tests/generic/331     |   12 ++++++++-
 tests/generic/331.out |    2 +-
 tests/xfs/122.out     |    3 ++
 tests/xfs/131         |   48 --------------------------------------
 tests/xfs/131.out     |    5 ----
 tests/xfs/1538        |   41 ++++++++++++++++++++++++++++++++
 tests/xfs/1538.out    |    4 +++
 tests/xfs/1539        |   41 ++++++++++++++++++++++++++++++++
 tests/xfs/1539.out    |    4 +++
 tests/xfs/1540        |   41 ++++++++++++++++++++++++++++++++
 tests/xfs/1540.out    |    4 +++
 tests/xfs/1541        |   42 +++++++++++++++++++++++++++++++++
 tests/xfs/1541.out    |    4 +++
 tests/xfs/1542        |   41 ++++++++++++++++++++++++++++++++
 tests/xfs/1542.out    |    4 +++
 tests/xfs/1543        |   40 ++++++++++++++++++++++++++++++++
 tests/xfs/1543.out    |    4 +++
 tests/xfs/1544        |   40 ++++++++++++++++++++++++++++++++
 tests/xfs/1544.out    |    4 +++
 tests/xfs/1545        |   41 ++++++++++++++++++++++++++++++++
 tests/xfs/1545.out    |    4 +++
 tests/xfs/1818        |   43 ++++++++++++++++++++++++++++++++++
 tests/xfs/1818.out    |    2 ++
 tests/xfs/1819        |   43 ++++++++++++++++++++++++++++++++++
 tests/xfs/1819.out    |    2 ++
 tests/xfs/1856        |    3 ++
 tests/xfs/240         |   13 +++++++++-
 tests/xfs/240.out     |    2 +-
 tests/xfs/272         |   40 +++++++++++++++++++++-----------
 tests/xfs/274         |   62 ++++++++++++++++++++++++++++++++++---------------
 32 files changed, 585 insertions(+), 95 deletions(-)
 delete mode 100755 tests/xfs/131
 delete mode 100644 tests/xfs/131.out
 create mode 100755 tests/xfs/1538
 create mode 100644 tests/xfs/1538.out
 create mode 100755 tests/xfs/1539
 create mode 100644 tests/xfs/1539.out
 create mode 100755 tests/xfs/1540
 create mode 100644 tests/xfs/1540.out
 create mode 100755 tests/xfs/1541
 create mode 100644 tests/xfs/1541.out
 create mode 100755 tests/xfs/1542
 create mode 100644 tests/xfs/1542.out
 create mode 100755 tests/xfs/1543
 create mode 100644 tests/xfs/1543.out
 create mode 100755 tests/xfs/1544
 create mode 100644 tests/xfs/1544.out
 create mode 100755 tests/xfs/1545
 create mode 100644 tests/xfs/1545.out
 create mode 100755 tests/xfs/1818
 create mode 100644 tests/xfs/1818.out
 create mode 100755 tests/xfs/1819
 create mode 100644 tests/xfs/1819.out


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 7/9] fstests: establish baseline for realtime reflink fuzz tests
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (37 preceding siblings ...)
  2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
@ 2023-12-31 20:01 ` Darrick J. Wong
  2023-12-27 14:03   ` [PATCH 1/1] xfs: baseline golden output for rt refcount btree " Darrick J. Wong
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
                   ` (5 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Establish a baseline golden output for the realtime reflink fuzz tests.
This shouldn't be merged upstream because the output is very dependent
on the geometry of the filesystem that is created.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink-baseline
---
 tests/xfs/1539.out |    6 ++++++
 tests/xfs/1540.out |    6 ++++++
 tests/xfs/1541.out |    6 ++++++
 tests/xfs/1542.out |   10 ++++++++++
 tests/xfs/1543.out |    2 ++
 tests/xfs/1544.out |   12 ++++++++++++
 tests/xfs/1545.out |   12 ++++++++++++
 7 files changed, 54 insertions(+)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (38 preceding siblings ...)
  2023-12-31 20:01 ` [PATCHSET v2.0 7/9] fstests: establish baseline for realtime reflink fuzz tests Darrick J. Wong
@ 2023-12-31 20:01 ` Darrick J. Wong
  2023-12-27 14:04   ` [PATCH 1/5] xfs: make sure that CoW will write around when rextsize > 1 Darrick J. Wong
                     ` (4 more replies)
  2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
                   ` (4 subsequent siblings)
  44 siblings, 5 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:01 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

Now that we've landed support for reflink on the realtime device for
cases where the rt extent size is the same as the fs block size, enhance
the reflink code further to support cases where the rt extent size is a
power-of-two multiple of the fs block size.  This enables us to do data
block sharing (for example) for much larger allocation units by dirtying
pagecache around shared extents and expanding writeback to write back
shared extents fully.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink-extsize

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink-extsize

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink-extsize
---
 common/rc          |   23 +++++++
 common/reflink     |   27 ++++++++
 tests/generic/145  |    1 
 tests/generic/147  |    1 
 tests/generic/261  |    1 
 tests/generic/262  |    1 
 tests/generic/303  |    8 ++
 tests/generic/331  |    1 
 tests/generic/353  |    2 -
 tests/generic/517  |    1 
 tests/generic/657  |    1 
 tests/generic/658  |    1 
 tests/generic/659  |    1 
 tests/generic/660  |    1 
 tests/generic/663  |    1 
 tests/generic/664  |    1 
 tests/generic/665  |    1 
 tests/generic/670  |    1 
 tests/generic/672  |    1 
 tests/xfs/180      |    1 
 tests/xfs/182      |    1 
 tests/xfs/184      |    1 
 tests/xfs/1919     |  163 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1919.out |   84 +++++++++++++++++++++++++
 tests/xfs/192      |    1 
 tests/xfs/1926     |  177 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1926.out |    2 +
 tests/xfs/200      |    1 
 tests/xfs/204      |    1 
 tests/xfs/208      |    1 
 tests/xfs/315      |    1 
 tests/xfs/326      |    6 ++
 tests/xfs/420      |    3 +
 tests/xfs/421      |    3 +
 tests/xfs/792      |    1 
 35 files changed, 520 insertions(+), 2 deletions(-)
 create mode 100755 tests/xfs/1919
 create mode 100644 tests/xfs/1919.out
 create mode 100755 tests/xfs/1926
 create mode 100644 tests/xfs/1926.out


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 9/9] fstests: functional tests for rt quota
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (39 preceding siblings ...)
  2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
@ 2023-12-31 20:02 ` Darrick J. Wong
  2023-12-27 14:05   ` [PATCH 1/3] common: enable testing of realtime quota when supported Darrick J. Wong
                     ` (2 more replies)
  2023-12-31 20:03 ` [PATCHSET v2.0 1/4] xfs-documentation: document metadata directories Darrick J. Wong
                   ` (3 subsequent siblings)
  44 siblings, 3 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:02 UTC (permalink / raw)
  To: djwong, zlang; +Cc: guan, linux-xfs, fstests

Hi all,

The sole patch in this series sets up functional testing for quota on
the xfs realtime device.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

This has been running on the djcloud for months with no problems.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-quotas

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-quotas

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-quotas
---
 common/populate    |    2 -
 common/quota       |   64 ++++++++++++++++++--
 common/xfs         |   12 ++++
 tests/generic/219  |    1 
 tests/generic/230  |    1 
 tests/generic/305  |    1 
 tests/generic/326  |    1 
 tests/generic/327  |    1 
 tests/generic/328  |    1 
 tests/generic/566  |    4 +
 tests/generic/587  |    1 
 tests/generic/603  |    1 
 tests/generic/691  |    2 +
 tests/generic/710  |    4 +
 tests/xfs/050      |    2 +
 tests/xfs/106      |    1 
 tests/xfs/108      |    2 +
 tests/xfs/152      |    1 
 tests/xfs/153      |    2 +
 tests/xfs/161      |    2 +
 tests/xfs/1858     |  168 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/1858.out |   41 +++++++++++++
 tests/xfs/213      |    1 
 tests/xfs/214      |    1 
 tests/xfs/220      |    2 +
 tests/xfs/299      |    2 +
 tests/xfs/330      |    1 
 tests/xfs/434      |    1 
 tests/xfs/435      |    1 
 tests/xfs/441      |    1 
 tests/xfs/442      |    1 
 tests/xfs/508      |    2 +
 tests/xfs/511      |   10 +++
 tests/xfs/720      |    5 ++
 34 files changed, 333 insertions(+), 10 deletions(-)
 create mode 100755 tests/xfs/1858
 create mode 100644 tests/xfs/1858.out


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 1/4] xfs-documentation: document metadata directories
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (40 preceding siblings ...)
  2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
@ 2023-12-31 20:03 ` Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 1/1] design: document the changes required to handle " Darrick J. Wong
  2023-12-31 20:03 ` [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section Darrick J. Wong
                   ` (2 subsequent siblings)
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:03 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

Hi all,

This patch documents the metadata directory tree feature.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=metadir

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=metadir

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=metadir

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=metadir
---
 .../allocation_groups.asciidoc                     |   23 +++
 .../internal_inodes.asciidoc                       |  142 ++++++++++++++++++++
 .../XFS_Filesystem_Structure/ondisk_inode.asciidoc |    5 +
 3 files changed, 165 insertions(+), 5 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (41 preceding siblings ...)
  2023-12-31 20:03 ` [PATCHSET v2.0 1/4] xfs-documentation: document metadata directories Darrick J. Wong
@ 2023-12-31 20:03 ` Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 1/2] design: move discussion of realtime volumes to a separate section Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 2/2] design: document realtime groups Darrick J. Wong
  2023-12-31 20:04 ` [PATCHSET v2.0 3/4] xfs-documentation: realtime reverse-mapping support Darrick J. Wong
  2023-12-31 20:04 ` [PATCHSET v2.0 4/4] xfs-documentation: reflink on the realtime device Darrick J. Wong
  44 siblings, 2 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:03 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

Hi all,

Document changes to the ondisk format for realtime groups.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-groups

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-groups

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-groups

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-groups
---
 .../allocation_groups.asciidoc                     |   39 ++-
 .../internal_inodes.asciidoc                       |   36 ---
 design/XFS_Filesystem_Structure/magic.asciidoc     |    1 
 design/XFS_Filesystem_Structure/realtime.asciidoc  |  232 ++++++++++++++++++++
 .../xfs_filesystem_structure.asciidoc              |    2 
 5 files changed, 256 insertions(+), 54 deletions(-)
 create mode 100644 design/XFS_Filesystem_Structure/realtime.asciidoc


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 3/4] xfs-documentation: realtime reverse-mapping support
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (42 preceding siblings ...)
  2023-12-31 20:03 ` [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section Darrick J. Wong
@ 2023-12-31 20:04 ` Darrick J. Wong
  2023-12-27 14:08   ` [PATCH 1/1] design: document the revisions to the realtime rmap formats Darrick J. Wong
  2023-12-31 20:04 ` [PATCHSET v2.0 4/4] xfs-documentation: reflink on the realtime device Darrick J. Wong
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:04 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

Hi all,

This patch documents the realtime reverse mapping btree.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-rmap

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-rmap

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-rmap

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-rmap
---
 .../allocation_groups.asciidoc                     |    8 -
 .../internal_inodes.asciidoc                       |    4 
 .../journaling_log.asciidoc                        |    9 +
 design/XFS_Filesystem_Structure/realtime.asciidoc  |    5 
 design/XFS_Filesystem_Structure/rtrmapbt.asciidoc  |  216 ++++++++------------
 5 files changed, 105 insertions(+), 137 deletions(-)


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCHSET v2.0 4/4] xfs-documentation: reflink on the realtime device
  2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
                   ` (43 preceding siblings ...)
  2023-12-31 20:04 ` [PATCHSET v2.0 3/4] xfs-documentation: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-31 20:04 ` Darrick J. Wong
  2023-12-27 14:09   ` [PATCH 1/1] design: document changes for the realtime refcount btree Darrick J. Wong
  44 siblings, 1 reply; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 20:04 UTC (permalink / raw)
  To: darrick.wong, djwong; +Cc: linux-xfs

Hi all,

This patch documents the realtime reference count btree.

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

Comments and questions are, as always, welcome.

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=realtime-reflink

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=realtime-reflink

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=realtime-reflink

xfsdocs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-documentation.git/log/?h=realtime-reflink
---
 .../internal_inodes.asciidoc                       |    5 -
 .../journaling_log.asciidoc                        |    9 +
 design/XFS_Filesystem_Structure/magic.asciidoc     |    1 
 design/XFS_Filesystem_Structure/realtime.asciidoc  |    5 -
 .../XFS_Filesystem_Structure/rtrefcountbt.asciidoc |  173 ++++++++++++++++++++
 5 files changed, 190 insertions(+), 3 deletions(-)
 create mode 100644 design/XFS_Filesystem_Structure/rtrefcountbt.asciidoc


^ permalink raw reply	[flat|nested] 632+ messages in thread

* [PATCH 01/21] xfs: move inode copy-on-write predicates to xfs_inode.[ch]
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
@ 2023-12-31 21:01   ` Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 02/21] xfs: hoist extent size helpers to libxfs Darrick J. Wong
                     ` (19 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:01 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move these inode predicate functions to xfs_inode.[ch] since they're not
reflink functions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_inode.c   |    8 ++++++++
 fs/xfs/xfs_inode.h   |    7 +++++++
 fs/xfs/xfs_reflink.h |   10 ----------
 3 files changed, 15 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 01b100e28b541..199def25c0343 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4263,3 +4263,11 @@ xfs_inode_alloc_unitsize(
 
 	return XFS_FSB_TO_B(ip->i_mount, blocks);
 }
+
+/* Should we always be using copy on write for file writes? */
+bool
+xfs_is_always_cow_inode(
+	struct xfs_inode	*ip)
+{
+	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cbde77d711b49..f6c463ce46424 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -292,6 +292,13 @@ static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
 		xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
 }
 
+bool xfs_is_always_cow_inode(struct xfs_inode *ip);
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
 /*
  * Check if an inode has any data in the COW fork.  This might be often false
  * even for inodes with the reflink flag when there is no pending COW operation.
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 65c5dfe17ecf7..fb55e4ce49fa1 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,6 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
-static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
-{
-	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
-}
-
-static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
-{
-	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
-}
-
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
 int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/21] xfs: hoist extent size helpers to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 01/21] xfs: move inode copy-on-write predicates to xfs_inode.[ch] Darrick J. Wong
@ 2023-12-31 21:01   ` Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 03/21] xfs: hoist inode flag conversion functions " Darrick J. Wong
                     ` (18 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:01 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the extent size helpers to xfs_bmap.c in libxfs since they're used
there already.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   41 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_bmap.h |    3 +++
 fs/xfs/xfs_inode.c       |   43 -------------------------------------------
 fs/xfs/xfs_inode.h       |    3 ---
 fs/xfs/xfs_iops.c        |    1 +
 5 files changed, 45 insertions(+), 46 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d34354d2cdd49..dc77b1a59faf8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6363,3 +6363,44 @@ xfs_bmap_query_all(
 
 	return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
 }
+
+/* Helper function to extract extent size hint from inode */
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
+{
+	/*
+	 * No point in aligning allocations if we need to COW to actually
+	 * write to them.
+	 */
+	if (xfs_is_always_cow_inode(ip))
+		return 0;
+	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+		return ip->i_extsize;
+	if (XFS_IS_REALTIME_INODE(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+	return 0;
+}
+
+/*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two.  If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+	struct xfs_inode	*ip)
+{
+	xfs_extlen_t		a, b;
+
+	a = 0;
+	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+		a = ip->i_cowextsize;
+	b = xfs_get_extsz_hint(ip);
+
+	a = max(a, b);
+	if (a == 0)
+		return XFS_DEFAULT_COWEXTSZ_HINT;
+	return a;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index c9e297dba88d0..bd7f936262cc6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -294,4 +294,7 @@ typedef int (*xfs_bmap_query_range_fn)(
 int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
 		void *priv);
 
+xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
+
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 199def25c0343..0c8fe6437e31b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,49 +45,6 @@
 
 struct kmem_cache *xfs_inode_cache;
 
-/*
- * helper function to extract extent size hint from inode
- */
-xfs_extlen_t
-xfs_get_extsz_hint(
-	struct xfs_inode	*ip)
-{
-	/*
-	 * No point in aligning allocations if we need to COW to actually
-	 * write to them.
-	 */
-	if (xfs_is_always_cow_inode(ip))
-		return 0;
-	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
-		return ip->i_extsize;
-	if (XFS_IS_REALTIME_INODE(ip))
-		return ip->i_mount->m_sb.sb_rextsize;
-	return 0;
-}
-
-/*
- * Helper function to extract CoW extent size hint from inode.
- * Between the extent size hint and the CoW extent size hint, we
- * return the greater of the two.  If the value is zero (automatic),
- * use the default size.
- */
-xfs_extlen_t
-xfs_get_cowextsz_hint(
-	struct xfs_inode	*ip)
-{
-	xfs_extlen_t		a, b;
-
-	a = 0;
-	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
-		a = ip->i_cowextsize;
-	b = xfs_get_extsz_hint(ip);
-
-	a = max(a, b);
-	if (a == 0)
-		return XFS_DEFAULT_COWEXTSZ_HINT;
-	return a;
-}
-
 /*
  * These two are wrapper routines around the xfs_ilock() routine used to
  * centralize some grungy code.  They are used in places that wish to lock the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f6c463ce46424..91a5a77910b3d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -559,9 +559,6 @@ int		xfs_iflush_cluster(struct xfs_buf *);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
-xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
-xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
-
 int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp,
 		struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
 		xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 62b425129d11c..8f5b8f8973a5f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -26,6 +26,7 @@
 #include "xfs_ioctl.h"
 #include "xfs_xattr.h"
 #include "xfs_file.h"
+#include "xfs_bmap.h"
 
 #include <linux/posix_acl.h>
 #include <linux/security.h>


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/21] xfs: hoist inode flag conversion functions to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 01/21] xfs: move inode copy-on-write predicates to xfs_inode.[ch] Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 02/21] xfs: hoist extent size helpers to libxfs Darrick J. Wong
@ 2023-12-31 21:01   ` Darrick J. Wong
  2023-12-31 21:01   ` [PATCH 04/21] xfs: hoist project id get/set " Darrick J. Wong
                     ` (17 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:01 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Hoist the inode flag conversion functions into libxfs so that we can
keep them in sync.  Do this by creating a new xfs_inode_util.c file in
libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile                |    1 
 fs/xfs/libxfs/xfs_bmap.c       |    1 
 fs/xfs/libxfs/xfs_inode_util.c |  124 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |   14 +++++
 fs/xfs/xfs_inode.c             |   49 ----------------
 fs/xfs/xfs_inode.h             |    2 -
 fs/xfs/xfs_ioctl.c             |   60 -------------------
 7 files changed, 141 insertions(+), 110 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_inode_util.c
 create mode 100644 fs/xfs/libxfs/xfs_inode_util.h


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c4a950ad517c9..6f7b0683a46cd 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_iext_tree.o \
 				   xfs_inode_fork.o \
 				   xfs_inode_buf.o \
+				   xfs_inode_util.o \
 				   xfs_log_rlimit.o \
 				   xfs_ag_resv.o \
 				   xfs_parent.o \
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index dc77b1a59faf8..dd0229963ad97 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -39,6 +39,7 @@
 #include "xfs_health.h"
 #include "xfs_bmap_item.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_inode_util.h"
 
 struct kmem_cache		*xfs_bmap_intent_cache;
 
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
new file mode 100644
index 0000000000000..ed5e1a9b4b8c6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_inode_util.h"
+
+uint16_t
+xfs_flags2diflags(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	/* can't set PREALLOC this way, just preserve it */
+	uint16_t		di_flags =
+		(ip->i_diflags & XFS_DIFLAG_PREALLOC);
+
+	if (xflags & FS_XFLAG_IMMUTABLE)
+		di_flags |= XFS_DIFLAG_IMMUTABLE;
+	if (xflags & FS_XFLAG_APPEND)
+		di_flags |= XFS_DIFLAG_APPEND;
+	if (xflags & FS_XFLAG_SYNC)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if (xflags & FS_XFLAG_NOATIME)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if (xflags & FS_XFLAG_NODUMP)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if (xflags & FS_XFLAG_NODEFRAG)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (xflags & FS_XFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		if (xflags & FS_XFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (xflags & FS_XFLAG_NOSYMLINKS)
+			di_flags |= XFS_DIFLAG_NOSYMLINKS;
+		if (xflags & FS_XFLAG_EXTSZINHERIT)
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+		if (xflags & FS_XFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
+	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
+		if (xflags & FS_XFLAG_REALTIME)
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (xflags & FS_XFLAG_EXTSIZE)
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+	}
+
+	return di_flags;
+}
+
+uint64_t
+xfs_flags2diflags2(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	uint64_t		di_flags2 =
+		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
+				   XFS_DIFLAG2_BIGTIME |
+				   XFS_DIFLAG2_NREXT64));
+
+	if (xflags & FS_XFLAG_DAX)
+		di_flags2 |= XFS_DIFLAG2_DAX;
+	if (xflags & FS_XFLAG_COWEXTSIZE)
+		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+
+	return di_flags2;
+}
+
+uint32_t
+xfs_ip2xflags(
+	struct xfs_inode	*ip)
+{
+	uint32_t		flags = 0;
+
+	if (ip->i_diflags & XFS_DIFLAG_ANY) {
+		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
+			flags |= FS_XFLAG_REALTIME;
+		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
+			flags |= FS_XFLAG_PREALLOC;
+		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
+			flags |= FS_XFLAG_IMMUTABLE;
+		if (ip->i_diflags & XFS_DIFLAG_APPEND)
+			flags |= FS_XFLAG_APPEND;
+		if (ip->i_diflags & XFS_DIFLAG_SYNC)
+			flags |= FS_XFLAG_SYNC;
+		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
+			flags |= FS_XFLAG_NOATIME;
+		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
+			flags |= FS_XFLAG_NODUMP;
+		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
+			flags |= FS_XFLAG_RTINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+			flags |= FS_XFLAG_PROJINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
+			flags |= FS_XFLAG_NOSYMLINKS;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+			flags |= FS_XFLAG_EXTSIZE;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
+			flags |= FS_XFLAG_EXTSZINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
+			flags |= FS_XFLAG_NODEFRAG;
+		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
+			flags |= FS_XFLAG_FILESTREAM;
+	}
+
+	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
+		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+			flags |= FS_XFLAG_DAX;
+		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+			flags |= FS_XFLAG_COWEXTSIZE;
+	}
+
+	if (xfs_inode_has_attr_fork(ip))
+		flags |= FS_XFLAG_HASATTR;
+	return flags;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
new file mode 100644
index 0000000000000..6ad1898a0f73f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef	__XFS_INODE_UTIL_H__
+#define	__XFS_INODE_UTIL_H__
+
+uint16_t	xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags);
+uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
+uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
+uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
+
+#endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0c8fe6437e31b..3f69379bfef59 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -544,55 +544,6 @@ xfs_lock_two_inodes(
 	}
 }
 
-uint
-xfs_ip2xflags(
-	struct xfs_inode	*ip)
-{
-	uint			flags = 0;
-
-	if (ip->i_diflags & XFS_DIFLAG_ANY) {
-		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
-			flags |= FS_XFLAG_REALTIME;
-		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
-			flags |= FS_XFLAG_PREALLOC;
-		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
-			flags |= FS_XFLAG_IMMUTABLE;
-		if (ip->i_diflags & XFS_DIFLAG_APPEND)
-			flags |= FS_XFLAG_APPEND;
-		if (ip->i_diflags & XFS_DIFLAG_SYNC)
-			flags |= FS_XFLAG_SYNC;
-		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
-			flags |= FS_XFLAG_NOATIME;
-		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
-			flags |= FS_XFLAG_NODUMP;
-		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
-			flags |= FS_XFLAG_RTINHERIT;
-		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
-			flags |= FS_XFLAG_PROJINHERIT;
-		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
-			flags |= FS_XFLAG_NOSYMLINKS;
-		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
-			flags |= FS_XFLAG_EXTSIZE;
-		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
-			flags |= FS_XFLAG_EXTSZINHERIT;
-		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
-			flags |= FS_XFLAG_NODEFRAG;
-		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
-			flags |= FS_XFLAG_FILESTREAM;
-	}
-
-	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
-		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
-			flags |= FS_XFLAG_DAX;
-		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
-			flags |= FS_XFLAG_COWEXTSIZE;
-	}
-
-	if (xfs_inode_has_attr_fork(ip))
-		flags |= FS_XFLAG_HASATTR;
-	return flags;
-}
-
 /*
  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
  * is allowed, otherwise it has to be an exact match. If a CI match is found,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91a5a77910b3d..283b71965ef7c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -8,6 +8,7 @@
 
 #include "xfs_inode_buf.h"
 #include "xfs_inode_fork.h"
+#include "xfs_inode_util.h"
 
 /*
  * Kernel only inode definitions
@@ -545,7 +546,6 @@ bool		xfs_isilocked(struct xfs_inode *, uint);
 uint		xfs_ilock_data_map_shared(struct xfs_inode *);
 uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 
-uint		xfs_ip2xflags(struct xfs_inode *);
 int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
 int		xfs_itruncate_extents_flags(struct xfs_trans **,
 				struct xfs_inode *, int, xfs_fsize_t, int);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a0dfefdf4c491..184916bc6528a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1060,66 +1060,6 @@ xfs_fileattr_get(
 	return 0;
 }
 
-STATIC uint16_t
-xfs_flags2diflags(
-	struct xfs_inode	*ip,
-	unsigned int		xflags)
-{
-	/* can't set PREALLOC this way, just preserve it */
-	uint16_t		di_flags =
-		(ip->i_diflags & XFS_DIFLAG_PREALLOC);
-
-	if (xflags & FS_XFLAG_IMMUTABLE)
-		di_flags |= XFS_DIFLAG_IMMUTABLE;
-	if (xflags & FS_XFLAG_APPEND)
-		di_flags |= XFS_DIFLAG_APPEND;
-	if (xflags & FS_XFLAG_SYNC)
-		di_flags |= XFS_DIFLAG_SYNC;
-	if (xflags & FS_XFLAG_NOATIME)
-		di_flags |= XFS_DIFLAG_NOATIME;
-	if (xflags & FS_XFLAG_NODUMP)
-		di_flags |= XFS_DIFLAG_NODUMP;
-	if (xflags & FS_XFLAG_NODEFRAG)
-		di_flags |= XFS_DIFLAG_NODEFRAG;
-	if (xflags & FS_XFLAG_FILESTREAM)
-		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if (S_ISDIR(VFS_I(ip)->i_mode)) {
-		if (xflags & FS_XFLAG_RTINHERIT)
-			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (xflags & FS_XFLAG_NOSYMLINKS)
-			di_flags |= XFS_DIFLAG_NOSYMLINKS;
-		if (xflags & FS_XFLAG_EXTSZINHERIT)
-			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-		if (xflags & FS_XFLAG_PROJINHERIT)
-			di_flags |= XFS_DIFLAG_PROJINHERIT;
-	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
-		if (xflags & FS_XFLAG_REALTIME)
-			di_flags |= XFS_DIFLAG_REALTIME;
-		if (xflags & FS_XFLAG_EXTSIZE)
-			di_flags |= XFS_DIFLAG_EXTSIZE;
-	}
-
-	return di_flags;
-}
-
-STATIC uint64_t
-xfs_flags2diflags2(
-	struct xfs_inode	*ip,
-	unsigned int		xflags)
-{
-	uint64_t		di_flags2 =
-		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
-				   XFS_DIFLAG2_BIGTIME |
-				   XFS_DIFLAG2_NREXT64));
-
-	if (xflags & FS_XFLAG_DAX)
-		di_flags2 |= XFS_DIFLAG2_DAX;
-	if (xflags & FS_XFLAG_COWEXTSIZE)
-		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
-
-	return di_flags2;
-}
-
 static int
 xfs_ioctl_setattr_xflags(
 	struct xfs_trans	*tp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/21] xfs: hoist project id get/set functions to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:01   ` [PATCH 03/21] xfs: hoist inode flag conversion functions " Darrick J. Wong
@ 2023-12-31 21:01   ` Darrick J. Wong
  2023-12-31 21:02   ` [PATCH 05/21] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
                     ` (16 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:01 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the project id get and set functions into libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.c |   11 +++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |    2 ++
 fs/xfs/xfs_inode.h             |    9 ---------
 fs/xfs/xfs_linux.h             |    2 --
 4 files changed, 13 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index ed5e1a9b4b8c6..2624d18922c02 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -122,3 +122,14 @@ xfs_ip2xflags(
 		flags |= FS_XFLAG_HASATTR;
 	return flags;
 }
+
+#define XFS_PROJID_DEFAULT	0
+
+prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+	if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
+		return dp->i_projid;
+
+	return XFS_PROJID_DEFAULT;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index 6ad1898a0f73f..f7e4d5a8235dd 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -11,4 +11,6 @@ uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
 uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
 uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
 
+prid_t		xfs_get_initial_prid(struct xfs_inode *dp);
+
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 283b71965ef7c..f4937d57ad7da 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -271,15 +271,6 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
 	return ret;
 }
 
-static inline prid_t
-xfs_get_initial_prid(struct xfs_inode *dp)
-{
-	if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
-		return dp->i_projid;
-
-	return XFS_PROJID_DEFAULT;
-}
-
 static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
 {
 	return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 13511ff810d18..953466922ddf7 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -137,8 +137,6 @@ typedef __u32			xfs_nlink_t;
  */
 #define __this_address	({ __label__ __here; __here: barrier(); &&__here; })
 
-#define XFS_PROJID_DEFAULT	0
-
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 
 static inline void delay(long ticks)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/21] xfs: pack icreate initialization parameters into a separate structure
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:01   ` [PATCH 04/21] xfs: hoist project id get/set " Darrick J. Wong
@ 2023-12-31 21:02   ` Darrick J. Wong
  2023-12-31 21:02   ` [PATCH 06/21] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
                     ` (15 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:02 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Callers that want to create an inode currently pass all possible file
attribute values for the new inode into xfs_init_new_inode as ten
separate parameters.  This causes two code maintenance issues: first, we
have large multi-line call sites which programmers must read carefully
to make sure they did not accidentally invert a value.  Second, all
three file id parameters must be passed separately to the quota
functions; any discrepancy results in quota count errors.

Clean this up by creating a new icreate_args structure to hold all this
information, some helpers to initialize them properly, and make the
callers pass this structure through to the creation function, whose name
we shorten to xfs_icreate.  This eliminates the issues, enables us to
keep the inode init code in sync with userspace via libxfs, and is
needed for future metadata directory tree management.

(A subsequent cleanup will also fix the quota alloc calls.)

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.h |   32 ++++++++++++
 fs/xfs/scrub/tempfile.c        |   13 +++--
 fs/xfs/xfs_inode.c             |  106 +++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_inode.h             |   12 +++--
 fs/xfs/xfs_qm.c                |    9 +++
 fs/xfs/xfs_symlink.c           |   14 +++--
 6 files changed, 138 insertions(+), 48 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index f7e4d5a8235dd..a494f7c4a3fe0 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -13,4 +13,36 @@ uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
 
 prid_t		xfs_get_initial_prid(struct xfs_inode *dp);
 
+/*
+ * Initial ids, link count, device number, and mode of a new inode.
+ *
+ * Due to our only partial reliance on the VFS to propagate uid and gid values
+ * according to accepted Unix behaviors, callers must initialize mnt_userns to
+ * the appropriate namespace, uid to fsuid_into_mnt(), and gid to
+ * fsgid_into_mnt() to get the correct inheritance behaviors when
+ * XFS_MOUNT_GRPID is set.  Use the xfs_ialloc_inherit_args() helper.
+ *
+ * To override the default ids, use the FORCE flags defined below.
+ */
+struct xfs_icreate_args {
+	struct mnt_idmap	*idmap;
+
+	struct xfs_inode	*pip;	/* parent inode or null */
+
+	kuid_t			uid;
+	kgid_t			gid;
+	prid_t			prid;
+
+	xfs_nlink_t		nlink;
+	dev_t			rdev;
+
+	umode_t			mode;
+
+#define XFS_ICREATE_ARGS_FORCE_UID	(1 << 0)
+#define XFS_ICREATE_ARGS_FORCE_GID	(1 << 1)
+#define XFS_ICREATE_ARGS_FORCE_MODE	(1 << 2)
+#define XFS_ICREATE_ARGS_INIT_XATTRS	(1 << 3)
+	uint16_t		flags;
+};
+
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 43d48f1e331de..c326cf66dea5c 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -40,6 +40,7 @@ xrep_tempfile_create(
 	struct xfs_scrub	*sc,
 	uint16_t		mode)
 {
+	struct xfs_icreate_args	args = { .pip = sc->mp->m_rootip, };
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_trans	*tp = NULL;
 	struct xfs_dquot	*udqp = NULL;
@@ -60,12 +61,15 @@ xrep_tempfile_create(
 	ASSERT(sc->tp == NULL);
 	ASSERT(sc->tempip == NULL);
 
+	/* Force everything to have the root ids and mode we want. */
+	xfs_icreate_args_rootfile(&args, mp, mode, false);
+
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.  The temporary
 	 * inode should be completely root owned so that we don't fail due to
 	 * quota limits.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
 			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
@@ -87,14 +91,11 @@ xrep_tempfile_create(
 	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
 	if (error)
 		goto out_trans_cancel;
-	error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
-			0, false, &sc->tempip);
+	error = xfs_icreate(tp, ino, &args, &sc->tempip);
 	if (error)
 		goto out_trans_cancel;
 
-	/* Change the ownership of the inode to root. */
-	VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
-	VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
+	/* We don't touch file data, so drop the realtime flags. */
 	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
 	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3f69379bfef59..72d2441b65a78 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -683,18 +683,13 @@ xfs_inode_inherit_flags2(
  * Caller is responsible for unlocking the inode manually upon return
  */
 int
-xfs_init_new_inode(
-	struct mnt_idmap	*idmap,
+xfs_icreate(
 	struct xfs_trans	*tp,
-	struct xfs_inode	*pip,
 	xfs_ino_t		ino,
-	umode_t			mode,
-	xfs_nlink_t		nlink,
-	dev_t			rdev,
-	prid_t			prid,
-	bool			init_xattrs,
+	const struct xfs_icreate_args *args,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_inode	*pip = args->pip;
 	struct inode		*dir = pip ? VFS_I(pip) : NULL;
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_inode	*ip;
@@ -727,16 +722,16 @@ xfs_init_new_inode(
 
 	ASSERT(ip != NULL);
 	inode = VFS_I(ip);
-	set_nlink(inode, nlink);
-	inode->i_rdev = rdev;
-	ip->i_projid = prid;
+	set_nlink(inode, args->nlink);
+	inode->i_rdev = args->rdev;
+	ip->i_projid = args->prid;
 
 	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
-		inode_fsuid_set(inode, idmap);
+		inode_fsuid_set(inode, args->idmap);
 		inode->i_gid = dir->i_gid;
-		inode->i_mode = mode;
+		inode->i_mode = args->mode;
 	} else {
-		inode_init_owner(idmap, inode, dir, mode);
+		inode_init_owner(args->idmap, inode, dir, args->mode);
 	}
 
 	/*
@@ -745,9 +740,21 @@ xfs_init_new_inode(
 	 * (and only if the irix_sgid_inherit compatibility variable is set).
 	 */
 	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
-	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
+	    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
 		inode->i_mode &= ~S_ISGID;
 
+	/* struct copies */
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
+		inode->i_uid = args->uid;
+	else
+		ASSERT(uid_eq(inode->i_uid, args->uid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
+		inode->i_gid = args->gid;
+	else if (!pip || !XFS_INHERIT_GID(pip))
+		ASSERT(gid_eq(inode->i_gid, args->gid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
+		inode->i_mode = args->mode;
+
 	ip->i_disk_size = 0;
 	ip->i_df.if_nextents = 0;
 	ASSERT(ip->i_nblocks == 0);
@@ -766,7 +773,7 @@ xfs_init_new_inode(
 	}
 
 	flags = XFS_ILOG_CORE;
-	switch (mode & S_IFMT) {
+	switch (args->mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
 	case S_IFBLK:
@@ -799,7 +806,8 @@ xfs_init_new_inode(
 	 * this saves us from needing to run a separate transaction to set the
 	 * fork offset in the immediate future.
 	 */
-	if (init_xattrs && xfs_has_attr(mp)) {
+	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
+	    xfs_has_attr(mp)) {
 		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
 		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	}
@@ -817,6 +825,47 @@ xfs_init_new_inode(
 	return 0;
 }
 
+/* Set up inode attributes for newly created children of a directory. */
+void
+xfs_icreate_args_inherit(
+	struct xfs_icreate_args	*args,
+	struct xfs_inode	*dp,
+	struct mnt_idmap	*idmap,
+	umode_t			mode,
+	bool			init_xattrs)
+{
+	args->idmap = idmap;
+	args->pip = dp;
+	args->uid = mapped_fsuid(idmap, &init_user_ns);
+	args->gid = mapped_fsgid(idmap, &init_user_ns);
+	args->prid = xfs_get_initial_prid(dp);
+	args->mode = mode;
+
+	/* Don't clobber the caller's flags */
+	if (init_xattrs)
+		args->flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
+}
+
+/* Set up inode attributes for newly created internal files. */
+void
+xfs_icreate_args_rootfile(
+	struct xfs_icreate_args	*args,
+	struct xfs_mount	*mp,
+	umode_t			mode,
+	bool			init_xattrs)
+{
+	args->idmap = &nop_mnt_idmap;
+	args->uid = GLOBAL_ROOT_UID;
+	args->gid = GLOBAL_ROOT_GID;
+	args->prid = 0;
+	args->mode = mode;
+	args->flags = XFS_ICREATE_ARGS_FORCE_UID |
+		      XFS_ICREATE_ARGS_FORCE_GID |
+		      XFS_ICREATE_ARGS_FORCE_MODE;
+	if (init_xattrs)
+		args->flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
+}
+
 /*
  * Decrement the link count on an inode & log the change.  If this causes the
  * link count to go to zero, move the inode to AGI unlinked list so that it can
@@ -946,13 +995,16 @@ xfs_create(
 	bool			init_xattrs,
 	xfs_inode_t		**ipp)
 {
+	struct xfs_icreate_args	args = {
+		.rdev		= rdev,
+		.nlink		= S_ISDIR(mode) ? 2 : 1,
+	};
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
 	int			error;
 	bool			unlock_dp_on_error = false;
-	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
@@ -968,13 +1020,12 @@ xfs_create(
 	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
 		return -EIO;
 
-	prid = xfs_get_initial_prid(dp);
+	xfs_icreate_args_inherit(&args, dp, idmap, mode, init_xattrs);
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
-			mapped_fsgid(idmap, &init_user_ns), prid,
+	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -1019,8 +1070,7 @@ xfs_create(
 	 */
 	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
 	if (!error)
-		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
-				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
+		error = xfs_icreate(tp, ino, &args, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1126,11 +1176,11 @@ xfs_create_tmpfile(
 	bool			init_xattrs,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_icreate_args	args = { NULL };
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
 	int			error;
-	prid_t                  prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
@@ -1141,13 +1191,12 @@ xfs_create_tmpfile(
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	prid = xfs_get_initial_prid(dp);
+	xfs_icreate_args_inherit(&args, dp, idmap, mode, init_xattrs);
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
-			mapped_fsgid(idmap, &init_user_ns), prid,
+	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -1163,8 +1212,7 @@ xfs_create_tmpfile(
 
 	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
 	if (!error)
-		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
-				0, 0, prid, init_xattrs, &ip);
+		error = xfs_icreate(tp, ino, &args, &ip);
 	if (error)
 		goto out_trans_cancel;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f4937d57ad7da..8ccf4cf049709 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -550,10 +550,8 @@ int		xfs_iflush_cluster(struct xfs_buf *);
 void		xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
 				struct xfs_inode *ip1, uint ip1_mode);
 
-int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp,
-		struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
-		xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
-		struct xfs_inode **ipp);
+int xfs_icreate(struct xfs_trans *tp, xfs_ino_t ino,
+		const struct xfs_icreate_args *args, struct xfs_inode **ipp);
 
 static inline int
 xfs_itruncate_extents(
@@ -661,4 +659,10 @@ void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
 # define xfs_dir_update_hook(dp, ip, delta, name)	((void)0)
 #endif /* CONFIG_XFS_LIVE_HOOKS */
 
+void xfs_icreate_args_inherit(struct xfs_icreate_args *args,
+		struct xfs_inode *dp, struct mnt_idmap *idmap, umode_t mode,
+		bool init_xattrs);
+void xfs_icreate_args_rootfile(struct xfs_icreate_args *args,
+		struct xfs_mount *mp, umode_t mode, bool init_xattrs);
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index c25d917487f0e..07d0d0231252a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -792,12 +792,17 @@ xfs_qm_qino_alloc(
 		return error;
 
 	if (need_alloc) {
+		struct xfs_icreate_args	args = {
+			.nlink		= 1,
+		};
 		xfs_ino_t	ino;
 
+		xfs_icreate_args_rootfile(&args, mp, S_IFREG,
+				xfs_has_parent(mp));
+
 		error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
 		if (!error)
-			error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino,
-					S_IFREG, 1, 0, 0, false, ipp);
+			error = xfs_icreate(tp, ino, &args, ipp);
 		if (error) {
 			xfs_trans_cancel(tp);
 			return error;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index ced29d8c48c0a..f40fa37302829 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -90,6 +90,9 @@ xfs_symlink(
 	umode_t			mode,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_icreate_args	args = {
+		.nlink		= 1,
+	};
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_trans	*tp = NULL;
 	struct xfs_inode	*ip = NULL;
@@ -97,7 +100,6 @@ xfs_symlink(
 	int			pathlen;
 	bool                    unlock_dp_on_error = false;
 	xfs_filblks_t		fs_blocks;
-	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
@@ -120,13 +122,13 @@ xfs_symlink(
 		return -ENAMETOOLONG;
 	ASSERT(pathlen > 0);
 
-	prid = xfs_get_initial_prid(dp);
+	xfs_icreate_args_inherit(&args, dp, idmap, S_IFLNK | (mode & ~S_IFMT),
+			xfs_has_parent(mp));
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
-			mapped_fsgid(idmap, &init_user_ns), prid,
+	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -169,9 +171,7 @@ xfs_symlink(
 	 */
 	error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
 	if (!error)
-		error = xfs_init_new_inode(idmap, tp, dp, ino,
-				S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
-				xfs_has_parent(mp), &ip);
+		error = xfs_icreate(tp, ino, &args, &ip);
 	if (error)
 		goto out_trans_cancel;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/21] xfs: implement atime updates in xfs_trans_ichgtime
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:02   ` [PATCH 05/21] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
@ 2023-12-31 21:02   ` Darrick J. Wong
  2023-12-31 21:02   ` [PATCH 07/21] xfs: use xfs_trans_ichgtime to set times when allocating inode Darrick J. Wong
                     ` (14 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:02 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable xfs_trans_ichgtime to change the inode access time so that we can
use this function to set inode times when allocating inodes instead of
open-coding it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_shared.h      |    1 +
 fs/xfs/libxfs/xfs_trans_inode.c |    2 ++
 2 files changed, 3 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 7509c1406a355..1d327685f6ee3 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -136,6 +136,7 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
 #define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
 #define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
 
 /* Computed inode geometry for the filesystem. */
 struct xfs_ino_geometry {
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 70e97ea6eee7c..94c235fff7461 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -68,6 +68,8 @@ xfs_trans_ichgtime(
 		inode_set_mtime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CHG)
 		inode_set_ctime_to_ts(inode, tv);
+	if (flags & XFS_ICHGTIME_ACCESS)
+		inode_set_atime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CREATE)
 		ip->i_crtime = tv;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/21] xfs: use xfs_trans_ichgtime to set times when allocating inode
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:02   ` [PATCH 06/21] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
@ 2023-12-31 21:02   ` Darrick J. Wong
  2023-12-31 21:02   ` [PATCH 08/21] xfs: split new inode creation into two pieces Darrick J. Wong
                     ` (13 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:02 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use xfs_trans_ichgtime to set the inode times when allocating an inode,
instead of open-coding them here.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_inode.c |   15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 72d2441b65a78..041d1634d7c19 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -693,10 +693,11 @@ xfs_icreate(
 	struct inode		*dir = pip ? VFS_I(pip) : NULL;
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_inode	*ip;
-	unsigned int		flags;
-	int			error;
-	struct timespec64	tv;
 	struct inode		*inode;
+	unsigned int		flags;
+	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
+					XFS_ICHGTIME_ACCESS;
+	int			error;
 
 	/*
 	 * Protect against obviously corrupt allocation btree records. Later
@@ -759,19 +760,17 @@ xfs_icreate(
 	ip->i_df.if_nextents = 0;
 	ASSERT(ip->i_nblocks == 0);
 
-	tv = inode_set_ctime_current(inode);
-	inode_set_mtime_to_ts(inode, tv);
-	inode_set_atime_to_ts(inode, tv);
-
 	ip->i_extsize = 0;
 	ip->i_diflags = 0;
 
 	if (xfs_has_v3inodes(mp)) {
 		inode_set_iversion(inode, 1);
 		ip->i_cowextsize = 0;
-		ip->i_crtime = tv;
+		times |= XFS_ICHGTIME_CREATE;
 	}
 
+	xfs_trans_ichgtime(tp, ip, times);
+
 	flags = XFS_ILOG_CORE;
 	switch (args->mode & S_IFMT) {
 	case S_IFIFO:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/21] xfs: split new inode creation into two pieces
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:02   ` [PATCH 07/21] xfs: use xfs_trans_ichgtime to set times when allocating inode Darrick J. Wong
@ 2023-12-31 21:02   ` Darrick J. Wong
  2023-12-31 21:03   ` [PATCH 09/21] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
                     ` (12 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:02 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

There are two parts to initializing a newly allocated inode: setting up
the incore structures, and initializing the new inode core based on the
parent inode and the current user's environment.  The initialization
code is not specific to the kernel, so we would like to share that with
userspace by hoisting it to libxfs.  Therefore, split xfs_icreate into
separate functions to prepare for the next few patches.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ialloc.c |   15 +++++++++
 fs/xfs/xfs_inode.c         |   76 ++++++++++++++++++++------------------------
 2 files changed, 50 insertions(+), 41 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 8b38a1a87954f..d58d700ed8a8b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1901,6 +1901,21 @@ xfs_dialloc(
 		}
 		return -ENOSPC;
 	}
+
+	/*
+	 * Protect against obviously corrupt allocation btree records. Later
+	 * xfs_iget checks will catch re-allocation of other active in-memory
+	 * and on-disk inodes. If we don't catch reallocating the parent inode
+	 * here we will deadlock in xfs_iget() so we have to do these checks
+	 * first.
+	 */
+	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
+		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+				XFS_SICK_AG_INOBT);
+		return -EFSCORRUPTED;
+	}
+
 	*new_ino = ino;
 	return 0;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 041d1634d7c19..a4e027d26ebc9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -676,53 +676,21 @@ xfs_inode_inherit_flags2(
 	}
 }
 
-/*
- * Initialise a newly allocated inode and return the in-core inode to the
- * caller locked exclusively.
- *
- * Caller is responsible for unlocking the inode manually upon return
- */
-int
-xfs_icreate(
+/* Initialise an inode's attributes. */
+static void
+xfs_inode_init(
 	struct xfs_trans	*tp,
-	xfs_ino_t		ino,
 	const struct xfs_icreate_args *args,
-	struct xfs_inode	**ipp)
+	struct xfs_inode	*ip)
 {
 	struct xfs_inode	*pip = args->pip;
 	struct inode		*dir = pip ? VFS_I(pip) : NULL;
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_inode	*ip;
-	struct inode		*inode;
+	struct inode		*inode = VFS_I(ip);
 	unsigned int		flags;
 	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
 					XFS_ICHGTIME_ACCESS;
-	int			error;
 
-	/*
-	 * Protect against obviously corrupt allocation btree records. Later
-	 * xfs_iget checks will catch re-allocation of other active in-memory
-	 * and on-disk inodes. If we don't catch reallocating the parent inode
-	 * here we will deadlock in xfs_iget() so we have to do these checks
-	 * first.
-	 */
-	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
-		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
-		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
-				XFS_SICK_AG_INOBT);
-		return -EFSCORRUPTED;
-	}
-
-	/*
-	 * Get the in-core inode with the lock held exclusively to prevent
-	 * others from looking at until we're done.
-	 */
-	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
-	if (error)
-		return error;
-
-	ASSERT(ip != NULL);
-	inode = VFS_I(ip);
 	set_nlink(inode, args->nlink);
 	inode->i_rdev = args->rdev;
 	ip->i_projid = args->prid;
@@ -811,11 +779,37 @@ xfs_icreate(
 		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	}
 
-	/*
-	 * Log the new values stuffed into the inode.
-	 */
-	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, flags);
+}
+
+/*
+ * Initialise a newly allocated inode and return the in-core inode to the
+ * caller locked exclusively.
+ *
+ * Caller is responsible for unlocking the inode manually upon return
+ */
+int
+xfs_icreate(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	const struct xfs_icreate_args *args,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	/*
+	 * Get the in-core inode with the lock held exclusively to prevent
+	 * others from looking at until we're done.
+	 */
+	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
+	if (error)
+		return error;
+
+	ASSERT(ip != NULL);
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_inode_init(tp, args, ip);
 
 	/* now that we have an i_mode we can setup the inode structure */
 	xfs_setup_inode(ip);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/21] xfs: hoist new inode initialization functions to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:02   ` [PATCH 08/21] xfs: split new inode creation into two pieces Darrick J. Wong
@ 2023-12-31 21:03   ` Darrick J. Wong
  2023-12-31 21:03   ` [PATCH 10/21] xfs: push xfs_icreate_args creation out of xfs_create* Darrick J. Wong
                     ` (11 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:03 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move all the code that initializes a new inode's attributes from the
icreate_args structure and the parent directory into libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.c |  202 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |   12 ++
 fs/xfs/libxfs/xfs_shared.h     |    8 --
 fs/xfs/xfs_inode.c             |  198 ---------------------------------------
 fs/xfs/xfs_trans.h             |    1 
 5 files changed, 215 insertions(+), 206 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 2624d18922c02..5ddcdd9087fed 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
+#include <linux/iversion.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@@ -13,6 +14,10 @@
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_inode_util.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -133,3 +138,200 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 
 	return XFS_PROJID_DEFAULT;
 }
+
+/* Propagate di_flags from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	unsigned int		di_flags = 0;
+	xfs_failaddr_t		failaddr;
+	umode_t			mode = VFS_I(ip)->i_mode;
+
+	if (S_ISDIR(mode)) {
+		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+			ip->i_extsize = pip->i_extsize;
+		}
+		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
+	} else if (S_ISREG(mode)) {
+		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+		    xfs_has_realtime(ip->i_mount))
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+			ip->i_extsize = pip->i_extsize;
+		}
+	}
+	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
+	    xfs_inherit_noatime)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
+	    xfs_inherit_nodump)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
+	    xfs_inherit_sync)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
+	    xfs_inherit_nosymlinks)
+		di_flags |= XFS_DIFLAG_NOSYMLINKS;
+	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
+	    xfs_inherit_nodefrag)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+
+	ip->i_diflags |= di_flags;
+
+	/*
+	 * Inode verifiers on older kernels only check that the extent size
+	 * hint is an integer multiple of the rt extent size on realtime files.
+	 * They did not check the hint alignment on a directory with both
+	 * rtinherit and extszinherit flags set.  If the misaligned hint is
+	 * propagated from a directory into a new realtime file, new file
+	 * allocations will fail due to math errors in the rt allocator and/or
+	 * trip the verifiers.  Validate the hint settings in the new file so
+	 * that we don't let broken hints propagate.
+	 */
+	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+			VFS_I(ip)->i_mode, ip->i_diflags);
+	if (failaddr) {
+		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+				   XFS_DIFLAG_EXTSZINHERIT);
+		ip->i_extsize = 0;
+	}
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags2(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	xfs_failaddr_t		failaddr;
+
+	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = pip->i_cowextsize;
+	}
+	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+	/* Don't let invalid cowextsize hints propagate. */
+	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+	if (failaddr) {
+		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = 0;
+	}
+}
+
+/* Initialise an inode's attributes. */
+void
+xfs_inode_init(
+	struct xfs_trans	*tp,
+	const struct xfs_icreate_args *args,
+	struct xfs_inode	*ip)
+{
+	struct xfs_inode	*pip = args->pip;
+	struct inode		*dir = pip ? VFS_I(pip) : NULL;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct inode		*inode = VFS_I(ip);
+	unsigned int		flags;
+	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
+					XFS_ICHGTIME_ACCESS;
+
+	set_nlink(inode, args->nlink);
+	inode->i_rdev = args->rdev;
+	ip->i_projid = args->prid;
+
+	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
+		inode_fsuid_set(inode, args->idmap);
+		inode->i_gid = dir->i_gid;
+		inode->i_mode = args->mode;
+	} else {
+		inode_init_owner(args->idmap, inode, dir, args->mode);
+	}
+
+	/*
+	 * If the group ID of the new file does not match the effective group
+	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
+	 * (and only if the irix_sgid_inherit compatibility variable is set).
+	 */
+	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+	    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
+		inode->i_mode &= ~S_ISGID;
+
+	/* struct copies */
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
+		inode->i_uid = args->uid;
+	else
+		ASSERT(uid_eq(inode->i_uid, args->uid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
+		inode->i_gid = args->gid;
+	else if (!pip || !XFS_INHERIT_GID(pip))
+		ASSERT(gid_eq(inode->i_gid, args->gid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
+		inode->i_mode = args->mode;
+
+	ip->i_disk_size = 0;
+	ip->i_df.if_nextents = 0;
+	ASSERT(ip->i_nblocks == 0);
+
+	ip->i_extsize = 0;
+	ip->i_diflags = 0;
+
+	if (xfs_has_v3inodes(mp)) {
+		inode_set_iversion(inode, 1);
+		ip->i_cowextsize = 0;
+		times |= XFS_ICHGTIME_CREATE;
+	}
+
+	xfs_trans_ichgtime(tp, ip, times);
+
+	flags = XFS_ILOG_CORE;
+	switch (args->mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+		flags |= XFS_ILOG_DEV;
+		break;
+	case S_IFREG:
+	case S_IFDIR:
+		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+			xfs_inode_inherit_flags(ip, pip);
+		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+			xfs_inode_inherit_flags2(ip, pip);
+		fallthrough;
+	case S_IFLNK:
+		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_bytes = 0;
+		ip->i_df.if_u1.if_root = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	/*
+	 * If we need to create attributes immediately after allocating the
+	 * inode, initialise an empty attribute fork right now. We use the
+	 * default fork offset for attributes here as we don't know exactly what
+	 * size or how many attributes we might be adding. We can do this
+	 * safely here because we know the data fork is completely empty and
+	 * this saves us from needing to run a separate transaction to set the
+	 * fork offset in the immediate future.
+	 */
+	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
+	    xfs_has_attr(mp)) {
+		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+	}
+
+	xfs_trans_log_inode(tp, ip, flags);
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index a494f7c4a3fe0..54d96e1aa9e5b 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -45,4 +45,16 @@ struct xfs_icreate_args {
 	uint16_t		flags;
 };
 
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
+void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
+
+void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
+		struct xfs_inode *ip);
+
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1d327685f6ee3..2cecebe018814 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -130,14 +130,6 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_REFC_BTREE_REF	1
 #define	XFS_SSB_REF		0
 
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
-#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
-#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
-#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
-
 /* Computed inode geometry for the filesystem. */
 struct xfs_ino_geometry {
 	/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4e027d26ebc9..a6ab37ba4f729 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,6 +42,7 @@
 #include "xfs_pnfs.h"
 #include "xfs_parent.h"
 #include "xfs_xattr.h"
+#include "xfs_inode_util.h"
 
 struct kmem_cache *xfs_inode_cache;
 
@@ -585,203 +586,6 @@ xfs_lookup(
 	return error;
 }
 
-/* Propagate di_flags from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags(
-	struct xfs_inode	*ip,
-	const struct xfs_inode	*pip)
-{
-	unsigned int		di_flags = 0;
-	xfs_failaddr_t		failaddr;
-	umode_t			mode = VFS_I(ip)->i_mode;
-
-	if (S_ISDIR(mode)) {
-		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
-			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-			ip->i_extsize = pip->i_extsize;
-		}
-		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
-			di_flags |= XFS_DIFLAG_PROJINHERIT;
-	} else if (S_ISREG(mode)) {
-		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
-		    xfs_has_realtime(ip->i_mount))
-			di_flags |= XFS_DIFLAG_REALTIME;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSIZE;
-			ip->i_extsize = pip->i_extsize;
-		}
-	}
-	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
-	    xfs_inherit_noatime)
-		di_flags |= XFS_DIFLAG_NOATIME;
-	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
-	    xfs_inherit_nodump)
-		di_flags |= XFS_DIFLAG_NODUMP;
-	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
-	    xfs_inherit_sync)
-		di_flags |= XFS_DIFLAG_SYNC;
-	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
-	    xfs_inherit_nosymlinks)
-		di_flags |= XFS_DIFLAG_NOSYMLINKS;
-	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
-	    xfs_inherit_nodefrag)
-		di_flags |= XFS_DIFLAG_NODEFRAG;
-	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
-		di_flags |= XFS_DIFLAG_FILESTREAM;
-
-	ip->i_diflags |= di_flags;
-
-	/*
-	 * Inode verifiers on older kernels only check that the extent size
-	 * hint is an integer multiple of the rt extent size on realtime files.
-	 * They did not check the hint alignment on a directory with both
-	 * rtinherit and extszinherit flags set.  If the misaligned hint is
-	 * propagated from a directory into a new realtime file, new file
-	 * allocations will fail due to math errors in the rt allocator and/or
-	 * trip the verifiers.  Validate the hint settings in the new file so
-	 * that we don't let broken hints propagate.
-	 */
-	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
-			VFS_I(ip)->i_mode, ip->i_diflags);
-	if (failaddr) {
-		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
-				   XFS_DIFLAG_EXTSZINHERIT);
-		ip->i_extsize = 0;
-	}
-}
-
-/* Propagate di_flags2 from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags2(
-	struct xfs_inode	*ip,
-	const struct xfs_inode	*pip)
-{
-	xfs_failaddr_t		failaddr;
-
-	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
-		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
-		ip->i_cowextsize = pip->i_cowextsize;
-	}
-	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
-		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
-
-	/* Don't let invalid cowextsize hints propagate. */
-	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
-			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
-	if (failaddr) {
-		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
-		ip->i_cowextsize = 0;
-	}
-}
-
-/* Initialise an inode's attributes. */
-static void
-xfs_inode_init(
-	struct xfs_trans	*tp,
-	const struct xfs_icreate_args *args,
-	struct xfs_inode	*ip)
-{
-	struct xfs_inode	*pip = args->pip;
-	struct inode		*dir = pip ? VFS_I(pip) : NULL;
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct inode		*inode = VFS_I(ip);
-	unsigned int		flags;
-	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
-					XFS_ICHGTIME_ACCESS;
-
-	set_nlink(inode, args->nlink);
-	inode->i_rdev = args->rdev;
-	ip->i_projid = args->prid;
-
-	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
-		inode_fsuid_set(inode, args->idmap);
-		inode->i_gid = dir->i_gid;
-		inode->i_mode = args->mode;
-	} else {
-		inode_init_owner(args->idmap, inode, dir, args->mode);
-	}
-
-	/*
-	 * If the group ID of the new file does not match the effective group
-	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
-	 * (and only if the irix_sgid_inherit compatibility variable is set).
-	 */
-	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
-	    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
-		inode->i_mode &= ~S_ISGID;
-
-	/* struct copies */
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
-		inode->i_uid = args->uid;
-	else
-		ASSERT(uid_eq(inode->i_uid, args->uid));
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
-		inode->i_gid = args->gid;
-	else if (!pip || !XFS_INHERIT_GID(pip))
-		ASSERT(gid_eq(inode->i_gid, args->gid));
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
-		inode->i_mode = args->mode;
-
-	ip->i_disk_size = 0;
-	ip->i_df.if_nextents = 0;
-	ASSERT(ip->i_nblocks == 0);
-
-	ip->i_extsize = 0;
-	ip->i_diflags = 0;
-
-	if (xfs_has_v3inodes(mp)) {
-		inode_set_iversion(inode, 1);
-		ip->i_cowextsize = 0;
-		times |= XFS_ICHGTIME_CREATE;
-	}
-
-	xfs_trans_ichgtime(tp, ip, times);
-
-	flags = XFS_ILOG_CORE;
-	switch (args->mode & S_IFMT) {
-	case S_IFIFO:
-	case S_IFCHR:
-	case S_IFBLK:
-	case S_IFSOCK:
-		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
-		flags |= XFS_ILOG_DEV;
-		break;
-	case S_IFREG:
-	case S_IFDIR:
-		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
-			xfs_inode_inherit_flags(ip, pip);
-		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
-			xfs_inode_inherit_flags2(ip, pip);
-		fallthrough;
-	case S_IFLNK:
-		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-		ip->i_df.if_bytes = 0;
-		ip->i_df.if_u1.if_root = NULL;
-		break;
-	default:
-		ASSERT(0);
-	}
-
-	/*
-	 * If we need to create attributes immediately after allocating the
-	 * inode, initialise an empty attribute fork right now. We use the
-	 * default fork offset for attributes here as we don't know exactly what
-	 * size or how many attributes we might be adding. We can do this
-	 * safely here because we know the data fork is completely empty and
-	 * this saves us from needing to run a separate transaction to set the
-	 * fork offset in the immediate future.
-	 */
-	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
-	    xfs_has_attr(mp)) {
-		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
-		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
-	}
-
-	xfs_trans_log_inode(tp, ip, flags);
-}
-
 /*
  * Initialise a newly allocated inode and return the in-core inode to the
  * caller locked exclusively.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3f7e3a09a49ff..3cffc3cb41814 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -224,7 +224,6 @@ void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
 bool		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
-void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
 void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void		xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
 				  uint);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/21] xfs: push xfs_icreate_args creation out of xfs_create*
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:03   ` [PATCH 09/21] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
@ 2023-12-31 21:03   ` Darrick J. Wong
  2023-12-31 21:03   ` [PATCH 11/21] xfs: wrap inode creation dqalloc calls Darrick J. Wong
                     ` (10 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:03 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the initialization of the xfs_icreate_args structure out of
xfs_create and xfs_create_tempfile into their callers so that we can set
the new inode's attributes in one place and pass that through instead of
open coding the collection of attributes all over the code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_inode.c |   65 ++++++++++++++++++++++++----------------------------
 fs/xfs/xfs_inode.h |    9 +++----
 fs/xfs/xfs_iops.c  |   46 ++++++++++++++++++++++---------------
 3 files changed, 62 insertions(+), 58 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a6ab37ba4f729..a95f05cd06ba6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -784,32 +784,26 @@ xfs_dir_hook_del(
 
 int
 xfs_create(
-	struct mnt_idmap	*idmap,
 	struct xfs_inode	*dp,
 	struct xfs_name		*name,
-	umode_t			mode,
-	dev_t			rdev,
-	bool			init_xattrs,
-	xfs_inode_t		**ipp)
+	const struct xfs_icreate_args *args,
+	struct xfs_inode	**ipp)
 {
-	struct xfs_icreate_args	args = {
-		.rdev		= rdev,
-		.nlink		= S_ISDIR(mode) ? 2 : 1,
-	};
-	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
-	int			error;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_trans_res	*tres;
+	struct xfs_parent_args	*ppargs;
+	xfs_ino_t		ino;
 	bool			unlock_dp_on_error = false;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
-	struct xfs_trans_res	*tres;
+	bool			is_dir = S_ISDIR(args->mode);
 	uint			resblks;
-	xfs_ino_t		ino;
-	struct xfs_parent_args	*ppargs;
+	int			error;
 
+	ASSERT(args->pip == dp);
 	trace_xfs_create(dp, name);
 
 	if (xfs_is_shutdown(mp))
@@ -817,12 +811,10 @@ xfs_create(
 	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
 		return -EIO;
 
-	xfs_icreate_args_inherit(&args, dp, idmap, mode, init_xattrs);
-
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
+	error = xfs_qm_vop_dqalloc(dp, args->uid, args->gid, args->prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -865,9 +857,9 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
 	if (!error)
-		error = xfs_icreate(tp, ino, &args, &ip);
+		error = xfs_icreate(tp, ino, args, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -967,33 +959,31 @@ xfs_create(
 
 int
 xfs_create_tmpfile(
-	struct mnt_idmap	*idmap,
 	struct xfs_inode	*dp,
-	umode_t			mode,
-	bool			init_xattrs,
+	const struct xfs_icreate_args *args,
 	struct xfs_inode	**ipp)
 {
-	struct xfs_icreate_args	args = { NULL };
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
-	int			error;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
 	struct xfs_trans_res	*tres;
-	uint			resblks;
 	xfs_ino_t		ino;
+	uint			resblks;
+	int			error;
+
+	ASSERT(args->nlink == 0);
+	ASSERT(args->pip == dp);
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	xfs_icreate_args_inherit(&args, dp, idmap, mode, init_xattrs);
-
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
+	error = xfs_qm_vop_dqalloc(dp, args->uid, args->gid, args->prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -1007,9 +997,9 @@ xfs_create_tmpfile(
 	if (error)
 		goto out_release_dquots;
 
-	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
 	if (!error)
-		error = xfs_icreate(tp, ino, &args, &ip);
+		error = xfs_icreate(tp, ino, args, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -2858,12 +2848,17 @@ xfs_rename_alloc_whiteout(
 	struct xfs_inode	*dp,
 	struct xfs_inode	**wip)
 {
+	struct xfs_icreate_args	args = {
+		.nlink		= 0,
+	};
 	struct xfs_inode	*tmpfile;
 	struct qstr		name;
 	int			error;
 
-	error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
-			xfs_has_parent(dp->i_mount), &tmpfile);
+	xfs_icreate_args_inherit(&args, dp, idmap, S_IFCHR | WHITEOUT_MODE,
+			xfs_has_parent(dp->i_mount));
+
+	error = xfs_create_tmpfile(dp, &args, &tmpfile);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8ccf4cf049709..f9530494dd873 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -512,12 +512,11 @@ int		xfs_release(struct xfs_inode *ip);
 int		xfs_inactive(struct xfs_inode *ip);
 int		xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
 			   struct xfs_inode **ipp, struct xfs_name *ci_name);
-int		xfs_create(struct mnt_idmap *idmap,
-			   struct xfs_inode *dp, struct xfs_name *name,
-			   umode_t mode, dev_t rdev, bool need_xattr,
+int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+			   const struct xfs_icreate_args *iargs,
 			   struct xfs_inode **ipp);
-int		xfs_create_tmpfile(struct mnt_idmap *idmap,
-			   struct xfs_inode *dp, umode_t mode, bool init_xattrs,
+int		xfs_create_tmpfile(struct xfs_inode *dp,
+			   const struct xfs_icreate_args *iargs,
 			   struct xfs_inode **ipp);
 int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8f5b8f8973a5f..90296b3c838aa 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -174,49 +174,59 @@ xfs_generic_create(
 	dev_t			rdev,
 	struct file		*tmpfile)	/* unnamed file */
 {
-	struct inode	*inode;
-	struct xfs_inode *ip = NULL;
-	struct posix_acl *default_acl, *acl;
-	struct xfs_name	name;
-	int		error;
+	struct xfs_icreate_args args = {
+		.rdev		= rdev,
+	};
+	struct inode		*inode;
+	struct xfs_inode	*ip = NULL;
+	struct posix_acl	*default_acl, *acl;
+	struct xfs_name		name;
+	int			error;
+
+	xfs_icreate_args_inherit(&args, XFS_I(dir), idmap, mode, false);
+	if (tmpfile)
+		args.nlink = 0;
+	else if (S_ISDIR(mode))
+		args.nlink = 2;
+	else
+		args.nlink = 1;
 
 	/*
 	 * Irix uses Missed'em'V split, but doesn't want to see
 	 * the upper 5 bits of (14bit) major.
 	 */
-	if (S_ISCHR(mode) || S_ISBLK(mode)) {
-		if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+	if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) {
+		if (unlikely(!sysv_valid_dev(args.rdev) ||
+			     MAJOR(args.rdev) & ~0x1ff))
 			return -EINVAL;
 	} else {
-		rdev = 0;
+		args.rdev = 0;
 	}
 
-	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	error = posix_acl_create(dir, &args.mode, &default_acl, &acl);
 	if (error)
 		return error;
 
 	/* Verify mode is valid also for tmpfile case */
-	error = xfs_dentry_mode_to_name(&name, dentry, mode);
+	error = xfs_dentry_mode_to_name(&name, dentry, args.mode);
 	if (unlikely(error))
 		goto out_free_acl;
 
 	if (!tmpfile) {
-		error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev,
-				xfs_create_need_xattr(dir, default_acl, acl),
-				&ip);
-	} else {
-		bool	init_xattrs = false;
+		if (xfs_create_need_xattr(dir, default_acl, acl))
+			args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
 
+		error = xfs_create(XFS_I(dir), &name, &args, &ip);
+	} else {
 		/*
 		 * If this temporary file will be linkable, set up the file
 		 * with an attr fork to receive a parent pointer.
 		 */
 		if (!(tmpfile->f_flags & O_EXCL) &&
 		    xfs_has_parent(XFS_I(dir)->i_mount))
-			init_xattrs = true;
+			args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
 
-		error = xfs_create_tmpfile(idmap, XFS_I(dir), mode,
-				init_xattrs, &ip);
+		error = xfs_create_tmpfile(XFS_I(dir), &args, &ip);
 	}
 	if (unlikely(error))
 		goto out_free_acl;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/21] xfs: wrap inode creation dqalloc calls
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:03   ` [PATCH 10/21] xfs: push xfs_icreate_args creation out of xfs_create* Darrick J. Wong
@ 2023-12-31 21:03   ` Darrick J. Wong
  2023-12-31 21:03   ` [PATCH 12/21] xfs: hoist xfs_iunlink to libxfs Darrick J. Wong
                     ` (9 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:03 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a helper that calls dqalloc to allocate and grab a reference to
dquots for the user, group, and project ids listed in an icreate
structure.  This simplifies the creat-related dqalloc callsites
scattered around the code base.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/tempfile.c |    9 ++++-----
 fs/xfs/xfs_inode.c      |   38 ++++++++++++++++++++++++++------------
 fs/xfs/xfs_inode.h      |    3 +++
 fs/xfs/xfs_symlink.c    |   10 ++++------
 4 files changed, 37 insertions(+), 23 deletions(-)


diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index c326cf66dea5c..6d207559df989 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -43,9 +43,9 @@ xrep_tempfile_create(
 	struct xfs_icreate_args	args = { .pip = sc->mp->m_rootip, };
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_trans	*tp = NULL;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_dquot	*udqp;
+	struct xfs_dquot	*gdqp;
+	struct xfs_dquot	*pdqp;
 	struct xfs_trans_res	*tres;
 	struct xfs_inode	*dp = mp->m_rootip;
 	xfs_ino_t		ino;
@@ -69,8 +69,7 @@ xrep_tempfile_create(
 	 * inode should be completely root owned so that we don't fail due to
 	 * quota limits.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
-			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+	error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a95f05cd06ba6..7d3c50be179dc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -782,6 +782,24 @@ xfs_dir_hook_del(
 }
 #endif /* CONFIG_XFS_LIVE_HOOKS */
 
+int
+xfs_icreate_dqalloc(
+	const struct xfs_icreate_args	*args,
+	struct xfs_dquot		**udqpp,
+	struct xfs_dquot		**gdqpp,
+	struct xfs_dquot		**pdqpp)
+{
+	unsigned int			flags = XFS_QMOPT_QUOTALL;
+
+	*udqpp = *gdqpp = *pdqpp = NULL;
+
+	if (!(args->flags & XFS_ICREATE_ARGS_FORCE_GID))
+		flags |= XFS_QMOPT_INHERIT;
+
+	return xfs_qm_vop_dqalloc(args->pip, args->uid, args->gid, args->prid,
+			flags, udqpp, gdqpp, pdqpp);
+}
+
 int
 xfs_create(
 	struct xfs_inode	*dp,
@@ -792,9 +810,9 @@ xfs_create(
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_dquot	*udqp;
+	struct xfs_dquot	*gdqp;
+	struct xfs_dquot	*pdqp;
 	struct xfs_trans_res	*tres;
 	struct xfs_parent_args	*ppargs;
 	xfs_ino_t		ino;
@@ -814,9 +832,7 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args->uid, args->gid, args->prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-			&udqp, &gdqp, &pdqp);
+	error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
@@ -966,9 +982,9 @@ xfs_create_tmpfile(
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_dquot	*udqp;
+	struct xfs_dquot	*gdqp;
+	struct xfs_dquot	*pdqp;
 	struct xfs_trans_res	*tres;
 	xfs_ino_t		ino;
 	uint			resblks;
@@ -983,9 +999,7 @@ xfs_create_tmpfile(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args->uid, args->gid, args->prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-			&udqp, &gdqp, &pdqp);
+	error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f9530494dd873..32c92e2a0fa25 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -663,5 +663,8 @@ void xfs_icreate_args_inherit(struct xfs_icreate_args *args,
 		bool init_xattrs);
 void xfs_icreate_args_rootfile(struct xfs_icreate_args *args,
 		struct xfs_mount *mp, umode_t mode, bool init_xattrs);
+int xfs_icreate_dqalloc(const struct xfs_icreate_args *args,
+		struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp,
+		struct xfs_dquot **pdqpp);
 
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f40fa37302829..a4872a6903e69 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -100,9 +100,9 @@ xfs_symlink(
 	int			pathlen;
 	bool                    unlock_dp_on_error = false;
 	xfs_filblks_t		fs_blocks;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_dquot	*udqp;
+	struct xfs_dquot	*gdqp;
+	struct xfs_dquot	*pdqp;
 	uint			resblks;
 	xfs_ino_t		ino;
 	struct xfs_parent_args	*ppargs;
@@ -128,9 +128,7 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, args.uid, args.gid, args.prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-			&udqp, &gdqp, &pdqp);
+	error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/21] xfs: hoist xfs_iunlink to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:03   ` [PATCH 11/21] xfs: wrap inode creation dqalloc calls Darrick J. Wong
@ 2023-12-31 21:03   ` Darrick J. Wong
  2023-12-31 21:04   ` [PATCH 13/21] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
                     ` (8 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:03 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move xfs_iunlink and xfs_iunlink_remove to libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.c |  282 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |    4 +
 fs/xfs/xfs_inode.c             |  280 ----------------------------------------
 fs/xfs/xfs_inode.h             |    5 -
 4 files changed, 289 insertions(+), 282 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 5ddcdd9087fed..be1fa47dc9c88 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -18,6 +18,10 @@
 #include "xfs_ialloc.h"
 #include "xfs_health.h"
 #include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -335,3 +339,281 @@ xfs_inode_init(
 
 	xfs_trans_log_inode(tp, ip, flags);
 }
+
+/*
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
+ *
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
+ */
+
+/*
+ * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode
+ * is not in cache.
+ */
+static int
+xfs_iunlink_update_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_inode	*ip;
+
+	/* No update necessary if we are at the end of the list. */
+	if (next_agino == NULLAGINO)
+		return 0;
+
+	ip = xfs_iunlink_lookup(pag, next_agino);
+	if (!ip)
+		return -ENOLINK;
+
+	ip->i_prev_unlinked = prev_agino;
+	return 0;
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	unsigned int		bucket_index,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		old_value;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
+
+	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
+			old_value, new_agino);
+
+	/*
+	 * We should never find the head of the list already set to the value
+	 * passed in because either we're adding or removing ourselves from the
+	 * head of the list.
+	 */
+	if (old_value == new_agino) {
+		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+	offset = offsetof(struct xfs_agi, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+}
+
+static int
+xfs_iunlink_insert_inode(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		next_agino;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
+
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
+	 */
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    !xfs_verify_agino_or_null(pag, next_agino)) {
+		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Update the prev pointer in the next inode to point back to this
+	 * inode.
+	 */
+	error = xfs_iunlink_update_backref(pag, agino, next_agino);
+	if (error == -ENOLINK)
+		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
+	if (error)
+		return error;
+
+	if (next_agino != NULLAGINO) {
+		/*
+		 * There is already another inode in the bucket, so point this
+		 * inode to the current head of the list.
+		 */
+		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+		if (error)
+			return error;
+		ip->i_next_unlinked = next_agino;
+	}
+
+	/* Point the head of the list to point to this inode. */
+	ip->i_prev_unlinked = NULLAGINO;
+	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI.  It will be pulled from this
+ * list when the inode is freed.
+ */
+int
+xfs_iunlink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_perag	*pag;
+	struct xfs_buf		*agibp;
+	int			error;
+
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(pag, tp, &agibp);
+	if (error)
+		goto out;
+
+	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
+static int
+xfs_iunlink_remove_inode(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	xfs_agino_t		head_agino;
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
+
+	trace_xfs_iunlink_remove(ip);
+
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
+	 */
+	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(pag, head_agino)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				agi, sizeof(*agi));
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Set our inode's next_unlinked pointer to NULL and then return
+	 * the old pointer value so that we can update whatever was previous
+	 * to us in the list to point to whatever was next in the list.
+	 */
+	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+	if (error)
+		return error;
+
+	/*
+	 * Update the prev pointer in the next inode to point back to previous
+	 * inode in the chain.
+	 */
+	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+			ip->i_next_unlinked);
+	if (error == -ENOLINK)
+		error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+				ip->i_next_unlinked);
+	if (error)
+		return error;
+
+	if (head_agino != agino) {
+		struct xfs_inode	*prev_ip;
+
+		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+		if (!prev_ip) {
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+			return -EFSCORRUPTED;
+		}
+
+		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+				ip->i_next_unlinked);
+		prev_ip->i_next_unlinked = ip->i_next_unlinked;
+	} else {
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+				ip->i_next_unlinked);
+	}
+
+	ip->i_next_unlinked = NULLAGINO;
+	ip->i_prev_unlinked = 0;
+	return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+int
+xfs_iunlink_remove(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	struct xfs_buf		*agibp;
+	int			error;
+
+	trace_xfs_iunlink_remove(ip);
+
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(pag, tp, &agibp);
+	if (error)
+		return error;
+
+	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index 54d96e1aa9e5b..0406401d21b76 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -57,4 +57,8 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
 void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 		struct xfs_inode *ip);
 
+int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+		struct xfs_inode *ip);
+
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7d3c50be179dc..798c4fdf700c3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1775,39 +1775,6 @@ xfs_inactive(
 	return error;
 }
 
-/*
- * In-Core Unlinked List Lookups
- * =============================
- *
- * Every inode is supposed to be reachable from some other piece of metadata
- * with the exception of the root directory.  Inodes with a connection to a
- * file descriptor but not linked from anywhere in the on-disk directory tree
- * are collectively known as unlinked inodes, though the filesystem itself
- * maintains links to these inodes so that on-disk metadata are consistent.
- *
- * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
- * header contains a number of buckets that point to an inode, and each inode
- * record has a pointer to the next inode in the hash chain.  This
- * singly-linked list causes scaling problems in the iunlink remove function
- * because we must walk that list to find the inode that points to the inode
- * being removed from the unlinked hash bucket list.
- *
- * Hence we keep an in-memory double linked list to link each inode on an
- * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
- * based lists would require having 64 list heads in the perag, one for each
- * list. This is expensive in terms of memory (think millions of AGs) and cache
- * misses on lookups. Instead, use the fact that inodes on the unlinked list
- * must be referenced at the VFS level to keep them on the list and hence we
- * have an existence guarantee for inodes on the unlinked list.
- *
- * Given we have an existence guarantee, we can use lockless inode cache lookups
- * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
- * for the double linked unlinked list, and we don't need any extra locking to
- * keep the list safe as all manipulations are done under the AGI buffer lock.
- * Keeping the list up to date does not require memory allocation, just finding
- * the XFS inode and updating the next/prev unlinked list aginos.
- */
-
 /*
  * Find an inode on the unlinked list. This does not take references to the
  * inode as we have existence guarantees by holding the AGI buffer lock and that
@@ -1842,76 +1809,12 @@ xfs_iunlink_lookup(
 	return ip;
 }
 
-/*
- * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode
- * is not in cache.
- */
-static int
-xfs_iunlink_update_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		prev_agino,
-	xfs_agino_t		next_agino)
-{
-	struct xfs_inode	*ip;
-
-	/* No update necessary if we are at the end of the list. */
-	if (next_agino == NULLAGINO)
-		return 0;
-
-	ip = xfs_iunlink_lookup(pag, next_agino);
-	if (!ip)
-		return -ENOLINK;
-
-	ip->i_prev_unlinked = prev_agino;
-	return 0;
-}
-
-/*
- * Point the AGI unlinked bucket at an inode and log the results.  The caller
- * is responsible for validating the old value.
- */
-STATIC int
-xfs_iunlink_update_bucket(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_buf		*agibp,
-	unsigned int		bucket_index,
-	xfs_agino_t		new_agino)
-{
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		old_value;
-	int			offset;
-
-	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
-
-	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
-			old_value, new_agino);
-
-	/*
-	 * We should never find the head of the list already set to the value
-	 * passed in because either we're adding or removing ourselves from the
-	 * head of the list.
-	 */
-	if (old_value == new_agino) {
-		xfs_buf_mark_corrupt(agibp);
-		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
-		return -EFSCORRUPTED;
-	}
-
-	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
-	offset = offsetof(struct xfs_agi, agi_unlinked) +
-			(sizeof(xfs_agino_t) * bucket_index);
-	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
-	return 0;
-}
-
 /*
  * Load the inode @next_agino into the cache and set its prev_unlinked pointer
  * to @prev_agino.  Caller must hold the AGI to synchronize with other changes
  * to the unlinked list.
  */
-STATIC int
+int
 xfs_iunlink_reload_next(
 	struct xfs_trans	*tp,
 	struct xfs_buf		*agibp,
@@ -1967,187 +1870,6 @@ xfs_iunlink_reload_next(
 	return error;
 }
 
-static int
-xfs_iunlink_insert_inode(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_buf		*agibp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		next_agino;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	int			error;
-
-	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
-	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (next_agino == agino ||
-	    !xfs_verify_agino_or_null(pag, next_agino)) {
-		xfs_buf_mark_corrupt(agibp);
-		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
-		return -EFSCORRUPTED;
-	}
-
-	/*
-	 * Update the prev pointer in the next inode to point back to this
-	 * inode.
-	 */
-	error = xfs_iunlink_update_backref(pag, agino, next_agino);
-	if (error == -ENOLINK)
-		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
-	if (error)
-		return error;
-
-	if (next_agino != NULLAGINO) {
-		/*
-		 * There is already another inode in the bucket, so point this
-		 * inode to the current head of the list.
-		 */
-		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
-		if (error)
-			return error;
-		ip->i_next_unlinked = next_agino;
-	}
-
-	/* Point the head of the list to point to this inode. */
-	ip->i_prev_unlinked = NULLAGINO;
-	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
-}
-
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI.  It will be pulled from this
- * list when the inode is freed.
- */
-int
-xfs_iunlink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_perag	*pag;
-	struct xfs_buf		*agibp;
-	int			error;
-
-	ASSERT(VFS_I(ip)->i_nlink == 0);
-	ASSERT(VFS_I(ip)->i_mode != 0);
-	trace_xfs_iunlink(ip);
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(pag, tp, &agibp);
-	if (error)
-		goto out;
-
-	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
-out:
-	xfs_perag_put(pag);
-	return error;
-}
-
-static int
-xfs_iunlink_remove_inode(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_buf		*agibp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		head_agino;
-	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	int			error;
-
-	trace_xfs_iunlink_remove(ip);
-
-	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the head pointer isn't garbage.
-	 */
-	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (!xfs_verify_agino(pag, head_agino)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
-				agi, sizeof(*agi));
-		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
-		return -EFSCORRUPTED;
-	}
-
-	/*
-	 * Set our inode's next_unlinked pointer to NULL and then return
-	 * the old pointer value so that we can update whatever was previous
-	 * to us in the list to point to whatever was next in the list.
-	 */
-	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
-	if (error)
-		return error;
-
-	/*
-	 * Update the prev pointer in the next inode to point back to previous
-	 * inode in the chain.
-	 */
-	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
-			ip->i_next_unlinked);
-	if (error == -ENOLINK)
-		error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
-				ip->i_next_unlinked);
-	if (error)
-		return error;
-
-	if (head_agino != agino) {
-		struct xfs_inode	*prev_ip;
-
-		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
-		if (!prev_ip) {
-			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
-			return -EFSCORRUPTED;
-		}
-
-		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
-				ip->i_next_unlinked);
-		prev_ip->i_next_unlinked = ip->i_next_unlinked;
-	} else {
-		/* Point the head of the list to the next unlinked inode. */
-		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
-				ip->i_next_unlinked);
-	}
-
-	ip->i_next_unlinked = NULLAGINO;
-	ip->i_prev_unlinked = 0;
-	return error;
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-int
-xfs_iunlink_remove(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip)
-{
-	struct xfs_buf		*agibp;
-	int			error;
-
-	trace_xfs_iunlink_remove(ip);
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(pag, tp, &agibp);
-	if (error)
-		return error;
-
-	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
-}
-
 /*
  * Look up the inode number specified and if it is not already marked XFS_ISTALE
  * mark it stale. We should only find clean inodes in this lookup that aren't
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 32c92e2a0fa25..95e7a73d90e58 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -604,10 +604,9 @@ extern struct kmem_cache	*xfs_inode_cache;
 
 bool xfs_inode_needs_inactive(struct xfs_inode *ip);
 
-int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
-int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
-		struct xfs_inode *ip);
 struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino);
+int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp,
+		xfs_agino_t prev_agino, xfs_agino_t next_agino);
 
 void xfs_end_io(struct work_struct *work);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/21] xfs: hoist xfs_{bump,drop}link to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:03   ` [PATCH 12/21] xfs: hoist xfs_iunlink to libxfs Darrick J. Wong
@ 2023-12-31 21:04   ` Darrick J. Wong
  2023-12-31 21:04   ` [PATCH 14/21] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
                     ` (7 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:04 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move xfs_bumplink and xfs_droplink to libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.c |   53 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |    2 ++
 fs/xfs/xfs_inode.c             |   53 ----------------------------------------
 fs/xfs/xfs_inode.h             |    2 --
 4 files changed, 55 insertions(+), 55 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index be1fa47dc9c88..3de478eb5a83a 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -617,3 +617,56 @@ xfs_iunlink_remove(
 
 	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
 }
+
+/*
+ * Decrement the link count on an inode & log the change.  If this causes the
+ * link count to go to zero, move the inode to AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
+ */
+int
+xfs_droplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	if (inode->i_nlink == 0) {
+		xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count dropped below zero.  Pinning link count.",
+				ip->i_ino);
+		set_nlink(inode, XFS_NLINK_PINNED);
+	}
+	if (inode->i_nlink != XFS_NLINK_PINNED)
+		drop_nlink(inode);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (inode->i_nlink)
+		return 0;
+
+	return xfs_iunlink(tp, ip);
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+void
+xfs_bumplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	if (inode->i_nlink == XFS_NLINK_PINNED - 1)
+		xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count exceeded maximum.  Pinning link count.",
+				ip->i_ino);
+	if (inode->i_nlink != XFS_NLINK_PINNED)
+		inc_nlink(inode);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index 0406401d21b76..a2a1796a72ff3 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -60,5 +60,7 @@ void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
 int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
 		struct xfs_inode *ip);
+int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
 
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 798c4fdf700c3..0e37bbc5b6784 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -663,59 +663,6 @@ xfs_icreate_args_rootfile(
 		args->flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
 }
 
-/*
- * Decrement the link count on an inode & log the change.  If this causes the
- * link count to go to zero, move the inode to AGI unlinked list so that it can
- * be freed when the last active reference goes away via xfs_inactive().
- */
-int
-xfs_droplink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	if (inode->i_nlink == 0) {
-		xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count dropped below zero.  Pinning link count.",
-				ip->i_ino);
-		set_nlink(inode, XFS_NLINK_PINNED);
-	}
-	if (inode->i_nlink != XFS_NLINK_PINNED)
-		drop_nlink(inode);
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	if (inode->i_nlink)
-		return 0;
-
-	return xfs_iunlink(tp, ip);
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-void
-xfs_bumplink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	if (inode->i_nlink == XFS_NLINK_PINNED - 1)
-		xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count exceeded maximum.  Pinning link count.",
-				ip->i_ino);
-	if (inode->i_nlink != XFS_NLINK_PINNED)
-		inc_nlink(inode);
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
 #ifdef CONFIG_XFS_LIVE_HOOKS
 /*
  * Use a static key here to reduce the overhead of directory live update hooks.
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 95e7a73d90e58..6433492e7301f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -613,8 +613,6 @@ void xfs_end_io(struct work_struct *work);
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
-int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
-void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
 void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode);
 void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/21] xfs: create libxfs helper to link a new inode into a directory
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:04   ` [PATCH 13/21] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
@ 2023-12-31 21:04   ` Darrick J. Wong
  2023-12-31 21:04   ` [PATCH 15/21] xfs: create libxfs helper to link an existing " Darrick J. Wong
                     ` (6 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:04 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to link a newly created inode into a
directory.  The upcoming metadata directory feature will need this to
create a metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |   47 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_dir2.h |   12 ++++++++++
 fs/xfs/xfs_inode.c       |   53 +++++++++++++++-------------------------------
 fs/xfs/xfs_symlink.c     |   43 ++++++++++++++++---------------------
 4 files changed, 95 insertions(+), 60 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 525b23a3800b6..16dfe869ef2de 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,6 +19,9 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_health.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_parent.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -782,3 +785,47 @@ xfs_dir2_compname(
 		return xfs_ascii_ci_compname(args, name, len);
 	return xfs_da_compname(args, name, len);
 }
+
+/*
+ * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip
+ * into @dp under the given @name.  If @ip is a directory, it will be
+ * initialized.  Both inodes must have the ILOCK held and the transaction must
+ * have sufficient blocks reserved.
+ */
+int
+xfs_dir_create_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+	if (error) {
+		ASSERT(error != -ENOSPC);
+		return error;
+	}
+
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			return error;
+
+		xfs_bumplink(tp, dp);
+	}
+
+	/*
+	 * If we have parent pointers, we need to add the attribute containing
+	 * the parent information now.
+	 */
+	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index ca1949ed4f5e8..71a8d8e8a8e7b 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -293,4 +293,16 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
 	return c;
 }
 
+struct xfs_parent_args;
+
+struct xfs_dir_update {
+	struct xfs_inode	*dp;
+	const struct xfs_name	*name;
+	struct xfs_inode	*ip;
+	struct xfs_parent_args	*ppargs;
+};
+
+int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
+
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0e37bbc5b6784..401bd2a455196 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -754,14 +754,16 @@ xfs_create(
 	const struct xfs_icreate_args *args,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_dir_update	du = {
+		.dp		= dp,
+		.name		= name,
+	};
 	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
 	struct xfs_dquot	*udqp;
 	struct xfs_dquot	*gdqp;
 	struct xfs_dquot	*pdqp;
 	struct xfs_trans_res	*tres;
-	struct xfs_parent_args	*ppargs;
 	xfs_ino_t		ino;
 	bool			unlock_dp_on_error = false;
 	bool			is_dir = S_ISDIR(args->mode);
@@ -791,7 +793,7 @@ xfs_create(
 		tres = &M_RES(mp)->tr_create;
 	}
 
-	error = xfs_parent_start(mp, &ppargs);
+	error = xfs_parent_start(mp, &du.ppargs);
 	if (error)
 		goto out_release_dquots;
 
@@ -822,7 +824,7 @@ xfs_create(
 	 */
 	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
 	if (!error)
-		error = xfs_icreate(tp, ino, args, &ip);
+		error = xfs_icreate(tp, ino, args, &du.ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -835,28 +837,7 @@ xfs_create(
 	 */
 	xfs_trans_ijoin(tp, dp, 0);
 
-	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-					resblks - XFS_IALLOC_SPACE_RES(mp));
-	if (error) {
-		ASSERT(error != -ENOSPC);
-		goto out_trans_cancel;
-	}
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
-	if (is_dir) {
-		error = xfs_dir_init(tp, ip, dp);
-		if (error)
-			goto out_trans_cancel;
-
-		xfs_bumplink(tp, dp);
-	}
-
-	/*
-	 * If we have parent pointers, we need to add the attribute containing
-	 * the parent information now.
-	 */
-	error = xfs_parent_add(tp, ppargs, dp, name, ip);
+	error = xfs_dir_create_child(tp, resblks, &du);
 	if (error)
 		goto out_trans_cancel;
 
@@ -864,7 +845,7 @@ xfs_create(
 	 * Create ip with a reference from dp, and add '.' and '..' references
 	 * if it's a directory.
 	 */
-	xfs_dir_update_hook(dp, ip, 1, name);
+	xfs_dir_update_hook(dp, du.ip, 1, name);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -879,7 +860,7 @@ xfs_create(
 	 * These ids of the inode couldn't have changed since the new
 	 * inode has been locked ever since it was created.
 	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+	xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
 
 	error = xfs_trans_commit(tp);
 	if (error)
@@ -889,10 +870,10 @@ xfs_create(
 	xfs_qm_dqrele(gdqp);
 	xfs_qm_dqrele(pdqp);
 
-	*ipp = ip;
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	*ipp = du.ip;
+	xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
 	return 0;
 
  out_trans_cancel:
@@ -903,13 +884,13 @@ xfs_create(
 	 * setup of the inode and release the inode.  This prevents recursive
 	 * transactions and deadlocks from xfs_inactive.
 	 */
-	if (ip) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_finish_inode_setup(ip);
-		xfs_irele(ip);
+	if (du.ip) {
+		xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
+		xfs_finish_inode_setup(du.ip);
+		xfs_irele(du.ip);
 	}
  out_parent:
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
  out_release_dquots:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a4872a6903e69..a923160a7dae0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -93,9 +93,12 @@ xfs_symlink(
 	struct xfs_icreate_args	args = {
 		.nlink		= 1,
 	};
+	struct xfs_dir_update	du = {
+		.dp		= dp,
+		.name		= link_name,
+	};
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_trans	*tp = NULL;
-	struct xfs_inode	*ip = NULL;
 	int			error = 0;
 	int			pathlen;
 	bool                    unlock_dp_on_error = false;
@@ -105,7 +108,6 @@ xfs_symlink(
 	struct xfs_dquot	*pdqp;
 	uint			resblks;
 	xfs_ino_t		ino;
-	struct xfs_parent_args	*ppargs;
 
 	*ipp = NULL;
 
@@ -144,7 +146,7 @@ xfs_symlink(
 		fs_blocks = xfs_symlink_blocks(mp, pathlen);
 	resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks);
 
-	error = xfs_parent_start(mp, &ppargs);
+	error = xfs_parent_start(mp, &du.ppargs);
 	if (error)
 		goto out_release_dquots;
 
@@ -169,7 +171,7 @@ xfs_symlink(
 	 */
 	error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
 	if (!error)
-		error = xfs_icreate(tp, ino, &args, &ip);
+		error = xfs_icreate(tp, ino, &args, &du.ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -185,31 +187,24 @@ xfs_symlink(
 	/*
 	 * Also attach the dquot(s) to it, if applicable.
 	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+	xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
 
 	resblks -= XFS_IALLOC_SPACE_RES(mp);
-	error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
+	error = xfs_symlink_write_target(tp, du.ip, target_path, pathlen,
 			fs_blocks, resblks);
 	if (error)
 		goto out_trans_cancel;
 	resblks -= fs_blocks;
-	i_size_write(VFS_I(ip), ip->i_disk_size);
+	i_size_write(VFS_I(du.ip), du.ip->i_disk_size);
 
 	/*
 	 * Create the directory entry for the symlink.
 	 */
-	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, resblks);
+	error = xfs_dir_create_child(tp, resblks, &du);
 	if (error)
 		goto out_trans_cancel;
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
-	/* Add parent pointer for the new symlink. */
-	error = xfs_parent_add(tp, ppargs, dp, link_name, ip);
-	if (error)
-		goto out_trans_cancel;
-
-	xfs_dir_update_hook(dp, ip, 1, link_name);
+	xfs_dir_update_hook(dp, du.ip, 1, link_name);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -227,10 +222,10 @@ xfs_symlink(
 	xfs_qm_dqrele(gdqp);
 	xfs_qm_dqrele(pdqp);
 
-	*ipp = ip;
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	*ipp = du.ip;
+	xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
 	return 0;
 
 out_trans_cancel:
@@ -241,13 +236,13 @@ xfs_symlink(
 	 * setup of the inode and release the inode.  This prevents recursive
 	 * transactions and deadlocks from xfs_inactive.
 	 */
-	if (ip) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_finish_inode_setup(ip);
-		xfs_irele(ip);
+	if (du.ip) {
+		xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
+		xfs_finish_inode_setup(du.ip);
+		xfs_irele(du.ip);
 	}
 out_parent:
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
 out_release_dquots:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/21] xfs: create libxfs helper to link an existing inode into a directory
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 21:04   ` [PATCH 14/21] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
@ 2023-12-31 21:04   ` Darrick J. Wong
  2023-12-31 21:04   ` [PATCH 16/21] xfs: hoist inode free function to libxfs Darrick J. Wong
                     ` (5 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:04 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to link an existing inode into a directory.
The upcoming metadata directory feature will need this to create a
metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |   65 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_dir2.h |    4 ++-
 fs/xfs/xfs_inode.c       |   50 +++++++----------------------------
 3 files changed, 75 insertions(+), 44 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 16dfe869ef2de..91f5a10176248 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -22,6 +22,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_parent.h"
+#include "xfs_ag.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -562,9 +563,9 @@ xfs_dir_replace(
  */
 int
 xfs_dir_canenter(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name)		/* name of entry to add */
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name)		/* name of entry to add */
 {
 	return xfs_dir_createname(tp, dp, name, 0, 0);
 }
@@ -829,3 +830,61 @@ xfs_dir_create_child(
 	 */
 	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Given a directory @dp, an existing non-directory inode @ip, and a @name,
+ * link @ip into @dp under the given @name.  Both inodes must have the ILOCK
+ * held.
+ */
+int
+xfs_dir_add_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+	ASSERT(!S_ISDIR(VFS_I(ip)->i_mode));
+
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, name);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Handle initial link state of O_TMPFILE inode
+	 */
+	if (VFS_I(ip)->i_nlink == 0) {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+		error = xfs_iunlink_remove(tp, pag, ip);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
+	}
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+	if (error)
+		return error;
+
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	xfs_bumplink(tp, ip);
+
+	/*
+	 * If we have parent pointers, we now need to add the parent record to
+	 * the attribute fork of the inode. If this is the initial parent
+	 * attribute, we need to create it correctly, otherwise we can just add
+	 * the parent to the inode.
+	 */
+	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 71a8d8e8a8e7b..1215d5d1ebe6c 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -61,7 +61,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
 				const struct xfs_name *name, xfs_ino_t inum,
 				xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				struct xfs_name *name);
+				const struct xfs_name *name);
 
 /*
  * Direct call from the bmap code, bypassing the generic directory layer.
@@ -304,5 +304,7 @@ struct xfs_dir_update {
 
 int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
+int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 401bd2a455196..70d13629071f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -998,11 +998,15 @@ xfs_link(
 	struct xfs_inode	*sip,
 	struct xfs_name		*target_name)
 {
+	struct xfs_dir_update	du = {
+		.dp		= tdp,
+		.name		= target_name,
+		.ip		= sip,
+	};
 	struct xfs_mount	*mp = tdp->i_mount;
 	struct xfs_trans	*tp;
 	int			error, nospace_error = 0;
 	int			resblks;
-	struct xfs_parent_args	*ppargs;
 
 	trace_xfs_link(tdp, target_name);
 
@@ -1021,7 +1025,7 @@ xfs_link(
 	if (error)
 		goto std_return;
 
-	error = xfs_parent_start(mp, &ppargs);
+	error = xfs_parent_start(mp, &du.ppargs);
 	if (error)
 		goto std_return;
 
@@ -1036,7 +1040,7 @@ xfs_link(
 	 * pointers are enabled because we can't back out if the xattrs must
 	 * grow.
 	 */
-	if (ppargs && nospace_error) {
+	if (du.ppargs && nospace_error) {
 		error = nospace_error;
 		goto error_return;
 	}
@@ -1052,41 +1056,7 @@ xfs_link(
 		goto error_return;
 	}
 
-	if (!resblks) {
-		error = xfs_dir_canenter(tp, tdp, target_name);
-		if (error)
-			goto error_return;
-	}
-
-	/*
-	 * Handle initial link state of O_TMPFILE inode
-	 */
-	if (VFS_I(sip)->i_nlink == 0) {
-		struct xfs_perag	*pag;
-
-		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
-		error = xfs_iunlink_remove(tp, pag, sip);
-		xfs_perag_put(pag);
-		if (error)
-			goto error_return;
-	}
-
-	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-				   resblks);
-	if (error)
-		goto error_return;
-	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
-	xfs_bumplink(tp, sip);
-
-	/*
-	 * If we have parent pointers, we now need to add the parent record to
-	 * the attribute fork of the inode. If this is the initial parent
-	 * attribute, we need to create it correctly, otherwise we can just add
-	 * the parent to the inode.
-	 */
-	error = xfs_parent_add(tp, ppargs, tdp, target_name, sip);
+	error = xfs_dir_add_child(tp, resblks, &du);
 	if (error)
 		goto error_return;
 
@@ -1103,7 +1073,7 @@ xfs_link(
 	error = xfs_trans_commit(tp);
 	xfs_iunlock(tdp, XFS_ILOCK_EXCL);
 	xfs_iunlock(sip, XFS_ILOCK_EXCL);
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
 	return error;
 
  error_return:
@@ -1111,7 +1081,7 @@ xfs_link(
 	xfs_iunlock(tdp, XFS_ILOCK_EXCL);
 	xfs_iunlock(sip, XFS_ILOCK_EXCL);
  out_parent:
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
  std_return:
 	if (error == -ENOSPC && nospace_error)
 		error = nospace_error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/21] xfs: hoist inode free function to libxfs
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 21:04   ` [PATCH 15/21] xfs: create libxfs helper to link an existing " Darrick J. Wong
@ 2023-12-31 21:04   ` Darrick J. Wong
  2023-12-31 21:05   ` [PATCH 17/21] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
                     ` (4 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:04 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a libxfs helper function that marks an inode free on disk.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_util.c |   51 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.h |    5 ++++
 fs/xfs/xfs_inode.c             |   35 +--------------------------
 3 files changed, 57 insertions(+), 34 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 3de478eb5a83a..8f85c74603dba 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -22,6 +22,7 @@
 #include "xfs_trace.h"
 #include "xfs_ag.h"
 #include "xfs_iunlink_item.h"
+#include "xfs_inode_item.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -670,3 +671,53 @@ xfs_bumplink(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
+
+/* Mark an inode free on disk. */
+int
+xfs_dir_ifree(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	struct xfs_icluster	*xic)
+{
+	int			error;
+
+	/*
+	 * Free the inode first so that we guarantee that the AGI lock is going
+	 * to be taken before we remove the inode from the unlinked list. This
+	 * makes the AGI lock -> unlinked list modification order the same as
+	 * used in O_TMPFILE creation.
+	 */
+	error = xfs_difree(tp, pag, ip->i_ino, xic);
+	if (error)
+		return error;
+
+	error = xfs_iunlink_remove(tp, pag, ip);
+	if (error)
+		return error;
+
+	/*
+	 * Free any local-format data sitting around before we reset the
+	 * data fork to extents format.  Note that the attr fork data has
+	 * already been freed by xfs_attr_inactive.
+	 */
+	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		kmem_free(ip->i_df.if_u1.if_data);
+		ip->i_df.if_u1.if_data = NULL;
+		ip->i_df.if_bytes = 0;
+	}
+
+	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */
+	ip->i_diflags = 0;
+	ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
+	ip->i_forkoff = 0;		/* mark the attr fork not in use */
+	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+	/*
+	 * Bump the generation count so no one will be confused
+	 * by reincarnations of this inode.
+	 */
+	VFS_I(ip)->i_generation++;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
index a2a1796a72ff3..521f044198c4a 100644
--- a/fs/xfs/libxfs/xfs_inode_util.h
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -6,6 +6,8 @@
 #ifndef	__XFS_INODE_UTIL_H__
 #define	__XFS_INODE_UTIL_H__
 
+struct xfs_icluster;
+
 uint16_t	xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags);
 uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
 uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
@@ -57,6 +59,9 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
 void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 		struct xfs_inode *ip);
 
+int xfs_dir_ifree(struct xfs_trans *tp, struct xfs_perag *pag,
+		struct xfs_inode *ip, struct xfs_icluster *xic);
+
 int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
 int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
 		struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 70d13629071f5..11f7c2e78c8d9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1971,36 +1971,10 @@ xfs_ifree(
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
-	/*
-	 * Free the inode first so that we guarantee that the AGI lock is going
-	 * to be taken before we remove the inode from the unlinked list. This
-	 * makes the AGI lock -> unlinked list modification order the same as
-	 * used in O_TMPFILE creation.
-	 */
-	error = xfs_difree(tp, pag, ip->i_ino, &xic);
+	error = xfs_dir_ifree(tp, pag, ip, &xic);
 	if (error)
 		goto out;
 
-	error = xfs_iunlink_remove(tp, pag, ip);
-	if (error)
-		goto out;
-
-	/*
-	 * Free any local-format data sitting around before we reset the
-	 * data fork to extents format.  Note that the attr fork data has
-	 * already been freed by xfs_attr_inactive.
-	 */
-	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
-		kmem_free(ip->i_df.if_u1.if_data);
-		ip->i_df.if_u1.if_data = NULL;
-		ip->i_df.if_bytes = 0;
-	}
-
-	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */
-	ip->i_diflags = 0;
-	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
-	ip->i_forkoff = 0;		/* mark the attr fork not in use */
-	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 	if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
 		xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
 
@@ -2009,13 +1983,6 @@ xfs_ifree(
 	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
 	spin_unlock(&iip->ili_lock);
 
-	/*
-	 * Bump the generation count so no one will be confused
-	 * by reincarnations of this inode.
-	 */
-	VFS_I(ip)->i_generation++;
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
 	if (xic.deleted)
 		error = xfs_ifree_cluster(tp, pag, ip, &xic);
 out:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/21] xfs: create libxfs helper to remove an existing inode/name from a directory
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 21:04   ` [PATCH 16/21] xfs: hoist inode free function to libxfs Darrick J. Wong
@ 2023-12-31 21:05   ` Darrick J. Wong
  2023-12-31 21:05   ` [PATCH 18/21] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
                     ` (3 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:05 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to remove a (name, inode) entry from a
directory.  The upcoming metadata directory feature will need this to
create a metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |   75 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_dir2.h |    2 +
 fs/xfs/xfs_inode.c       |   72 ++++++--------------------------------------
 3 files changed, 86 insertions(+), 63 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 91f5a10176248..dfc79d75b3dd6 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -888,3 +888,78 @@ xfs_dir_add_child(
 	 */
 	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip)
+ * entry from the directory.  Both inodes must have the ILOCK held.
+ */
+int
+xfs_dir_remove_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+
+	/*
+	 * If we're removing a directory perform some additional validation.
+	 */
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		ASSERT(VFS_I(ip)->i_nlink >= 2);
+		if (VFS_I(ip)->i_nlink != 2)
+			return -ENOTEMPTY;
+		if (!xfs_dir_isempty(ip))
+			return -ENOTEMPTY;
+
+		/* Drop the link from ip's "..".  */
+		error = xfs_droplink(tp, dp);
+		if (error)
+			return error;
+
+		/* Drop the "." link from ip to self.  */
+		error = xfs_droplink(tp, ip);
+		if (error)
+			return error;
+
+		/*
+		 * Point the unlinked child directory's ".." entry to the root
+		 * directory to eliminate back-references to inodes that may
+		 * get freed before the child directory is closed.  If the fs
+		 * gets shrunk, this can lead to dirent inode validation errors.
+		 */
+		if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+			error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+					tp->t_mountp->m_sb.sb_rootino, 0);
+			if (error)
+				return error;
+		}
+	} else {
+		/*
+		 * When removing a non-directory we need to log the parent
+		 * inode here.  For a directory this is done implicitly
+		 * by the xfs_droplink call for the ".." entry.
+		 */
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/* Drop the link from dp to ip. */
+	error = xfs_droplink(tp, ip);
+	if (error)
+		return error;
+
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
+	if (error) {
+		ASSERT(error != -ENOENT);
+		return error;
+	}
+
+	/* Remove parent pointer. */
+	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 1215d5d1ebe6c..8c8b55b487d67 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -306,5 +306,7 @@ int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
 int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
+int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 11f7c2e78c8d9..a00579cd6683b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2066,13 +2066,17 @@ xfs_remove(
 	struct xfs_name		*name,
 	struct xfs_inode	*ip)
 {
+	struct xfs_dir_update	du = {
+		.dp		= dp,
+		.name		= name,
+		.ip		= ip,
+	};
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_trans	*tp = NULL;
 	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
 	int			dontcare;
 	int                     error = 0;
 	uint			resblks;
-	struct xfs_parent_args	*ppargs;
 
 	trace_xfs_remove(dp, name);
 
@@ -2089,7 +2093,7 @@ xfs_remove(
 	if (error)
 		goto std_return;
 
-	error = xfs_parent_start(mp, &ppargs);
+	error = xfs_parent_start(mp, &du.ppargs);
 	if (error)
 		goto std_return;
 
@@ -2112,65 +2116,7 @@ xfs_remove(
 		goto out_parent;
 	}
 
-	/*
-	 * If we're removing a directory perform some additional validation.
-	 */
-	if (is_dir) {
-		ASSERT(VFS_I(ip)->i_nlink >= 2);
-		if (VFS_I(ip)->i_nlink != 2) {
-			error = -ENOTEMPTY;
-			goto out_trans_cancel;
-		}
-		if (!xfs_dir_isempty(ip)) {
-			error = -ENOTEMPTY;
-			goto out_trans_cancel;
-		}
-
-		/* Drop the link from ip's "..".  */
-		error = xfs_droplink(tp, dp);
-		if (error)
-			goto out_trans_cancel;
-
-		/* Drop the "." link from ip to self.  */
-		error = xfs_droplink(tp, ip);
-		if (error)
-			goto out_trans_cancel;
-
-		/*
-		 * Point the unlinked child directory's ".." entry to the root
-		 * directory to eliminate back-references to inodes that may
-		 * get freed before the child directory is closed.  If the fs
-		 * gets shrunk, this can lead to dirent inode validation errors.
-		 */
-		if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
-			error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
-					tp->t_mountp->m_sb.sb_rootino, 0);
-			if (error)
-				goto out_trans_cancel;
-		}
-	} else {
-		/*
-		 * When removing a non-directory we need to log the parent
-		 * inode here.  For a directory this is done implicitly
-		 * by the xfs_droplink call for the ".." entry.
-		 */
-		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-	}
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	/* Drop the link from dp to ip. */
-	error = xfs_droplink(tp, ip);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
-	if (error) {
-		ASSERT(error != -ENOENT);
-		goto out_trans_cancel;
-	}
-
-	/* Remove parent pointer. */
-	error = xfs_parent_remove(tp, ppargs, dp, name, ip);
+	error = xfs_dir_remove_child(tp, resblks, &du);
 	if (error)
 		goto out_trans_cancel;
 
@@ -2197,7 +2143,7 @@ xfs_remove(
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
 	return 0;
 
  out_trans_cancel:
@@ -2206,7 +2152,7 @@ xfs_remove(
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
  out_parent:
-	xfs_parent_finish(mp, ppargs);
+	xfs_parent_finish(mp, du.ppargs);
  std_return:
 	return error;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/21] xfs: create libxfs helper to exchange two directory entries
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 21:05   ` [PATCH 17/21] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
@ 2023-12-31 21:05   ` Darrick J. Wong
  2023-12-31 21:05   ` [PATCH 19/21] xfs: create libxfs helper to rename " Darrick J. Wong
                     ` (2 subsequent siblings)
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:05 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to exchange two directory entries.
The upcoming metadata directory feature will need this to replace a
metadata inode directory entry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |  117 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_dir2.h |    3 +
 fs/xfs/xfs_inode.c       |  108 ++++++------------------------------------
 3 files changed, 135 insertions(+), 93 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index dfc79d75b3dd6..f6b448d5fb0e4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -963,3 +963,120 @@ xfs_dir_remove_child(
 	/* Remove parent pointer. */
 	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2,
+ * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed.
+ * @ip1 and @ip2 need not be of the same type.
+ *
+ * All inodes must have the ILOCK held, and both entries must already exist.
+ */
+int
+xfs_dir_exchange_children(
+	struct xfs_trans	*tp,
+	struct xfs_dir_update	*du1,
+	struct xfs_dir_update	*du2,
+	unsigned int		spaceres)
+{
+	struct xfs_inode	*dp1 = du1->dp;
+	const struct xfs_name	*name1 = du1->name;
+	struct xfs_inode	*ip1 = du1->ip;
+	struct xfs_inode	*dp2 = du2->dp;
+	const struct xfs_name	*name2 = du2->name;
+	struct xfs_inode	*ip2 = du2->ip;
+	int			ip1_flags = 0;
+	int			ip2_flags = 0;
+	int			dp2_flags = 0;
+	int			error;
+
+	/* Swap inode number for dirent in first parent */
+	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
+	if (error)
+		return error;
+
+	/* Swap inode number for dirent in second parent */
+	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
+	if (error)
+		return error;
+
+	/*
+	 * If we're renaming one or more directories across different parents,
+	 * update the respective ".." entries (and link counts) to match the new
+	 * parents.
+	 */
+	if (dp1 != dp2) {
+		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+		if (S_ISDIR(VFS_I(ip2)->i_mode)) {
+			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+						dp1->i_ino, spaceres);
+			if (error)
+				return error;
+
+			/* transfer ip2 ".." reference to dp1 */
+			if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
+				error = xfs_droplink(tp, dp2);
+				if (error)
+					return error;
+				xfs_bumplink(tp, dp1);
+			}
+
+			/*
+			 * Although ip1 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		}
+
+		if (S_ISDIR(VFS_I(ip1)->i_mode)) {
+			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+						dp2->i_ino, spaceres);
+			if (error)
+				return error;
+
+			/* transfer ip1 ".." reference to dp2 */
+			if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
+				error = xfs_droplink(tp, dp1);
+				if (error)
+					return error;
+				xfs_bumplink(tp, dp2);
+			}
+
+			/*
+			 * Although ip2 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	if (ip1_flags) {
+		xfs_trans_ichgtime(tp, ip1, ip1_flags);
+		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	}
+	if (ip2_flags) {
+		xfs_trans_ichgtime(tp, ip2, ip2_flags);
+		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+	}
+	if (dp2_flags) {
+		xfs_trans_ichgtime(tp, dp2, dp2_flags);
+		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+	/* Schedule parent pointer replacements */
+	error = xfs_parent_replace(tp, du1->ppargs, dp1, name1, dp2, name2,
+			ip1);
+	if (error)
+		return error;
+
+	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
+			ip2);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 8c8b55b487d67..dbca60ec93462 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -309,4 +309,7 @@ int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
 int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
 
+int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1,
+		struct xfs_dir_update *du2, unsigned int spaceres);
+
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a00579cd6683b..1eb92c05d8b78 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2264,102 +2264,24 @@ xfs_cross_rename(
 	struct xfs_parent_args	*ip2_ppargs,
 	int			spaceres)
 {
-	int			error = 0;
-	int			ip1_flags = 0;
-	int			ip2_flags = 0;
-	int			dp2_flags = 0;
-
-	/* Swap inode number for dirent in first parent */
-	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
-	if (error)
-		goto out_trans_abort;
-
-	/* Swap inode number for dirent in second parent */
-	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
-	if (error)
-		goto out_trans_abort;
-
-	/*
-	 * If we're renaming one or more directories across different parents,
-	 * update the respective ".." entries (and link counts) to match the new
-	 * parents.
-	 */
-	if (dp1 != dp2) {
-		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
-
-		if (S_ISDIR(VFS_I(ip2)->i_mode)) {
-			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
-						dp1->i_ino, spaceres);
-			if (error)
-				goto out_trans_abort;
-
-			/* transfer ip2 ".." reference to dp1 */
-			if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
-				error = xfs_droplink(tp, dp2);
-				if (error)
-					goto out_trans_abort;
-				xfs_bumplink(tp, dp1);
-			}
-
-			/*
-			 * Although ip1 isn't changed here, userspace needs
-			 * to be warned about the change, so that applications
-			 * relying on it (like backup ones), will properly
-			 * notify the change
-			 */
-			ip1_flags |= XFS_ICHGTIME_CHG;
-			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
-		}
-
-		if (S_ISDIR(VFS_I(ip1)->i_mode)) {
-			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
-						dp2->i_ino, spaceres);
-			if (error)
-				goto out_trans_abort;
-
-			/* transfer ip1 ".." reference to dp2 */
-			if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
-				error = xfs_droplink(tp, dp1);
-				if (error)
-					goto out_trans_abort;
-				xfs_bumplink(tp, dp2);
-			}
-
-			/*
-			 * Although ip2 isn't changed here, userspace needs
-			 * to be warned about the change, so that applications
-			 * relying on it (like backup ones), will properly
-			 * notify the change
-			 */
-			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
-			ip2_flags |= XFS_ICHGTIME_CHG;
-		}
-	}
-
-	/* Schedule parent pointer replacements */
-	error = xfs_parent_replace(tp, ip1_ppargs, dp1, name1, dp2, name2, ip1);
+	struct xfs_dir_update	du1 = {
+		.dp		= dp1,
+		.name		= name1,
+		.ip		= ip1,
+		.ppargs		= ip1_ppargs,
+	};
+	struct xfs_dir_update	du2 = {
+		.dp		= dp2,
+		.name		= name2,
+		.ip		= ip2,
+		.ppargs		= ip2_ppargs,
+	};
+	int			error;
+
+	error = xfs_dir_exchange_children(tp, &du1, &du2, spaceres);
 	if (error)
 		goto out_trans_abort;
 
-	error = xfs_parent_replace(tp, ip2_ppargs, dp2, name2, dp1, name1, ip2);
-	if (error)
-		goto out_trans_abort;
-
-	if (ip1_flags) {
-		xfs_trans_ichgtime(tp, ip1, ip1_flags);
-		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
-	}
-	if (ip2_flags) {
-		xfs_trans_ichgtime(tp, ip2, ip2_flags);
-		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
-	}
-	if (dp2_flags) {
-		xfs_trans_ichgtime(tp, dp2, dp2_flags);
-		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
-	}
-	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
-
 	/*
 	 * Inform our hook clients that we've finished an exchange operation as
 	 * follows: removed the source and target files from their directories;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/21] xfs: create libxfs helper to rename two directory entries
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 21:05   ` [PATCH 18/21] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
@ 2023-12-31 21:05   ` Darrick J. Wong
  2023-12-31 21:05   ` [PATCH 20/21] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 21/21] xfs: get rid of trivial rename helpers Darrick J. Wong
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:05 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to rename two directory entries.  The
upcoming metadata directory feature will need this to replace a metadata
inode directory entry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |  217 +++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_dir2.h |    3 
 fs/xfs/xfs_inode.c       |  288 ++++++++++------------------------------------
 3 files changed, 282 insertions(+), 226 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index f6b448d5fb0e4..0ec653d1d5b8d 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -23,6 +23,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_parent.h"
 #include "xfs_ag.h"
+#include "xfs_ialloc.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -1080,3 +1081,219 @@ xfs_dir_exchange_children(
 	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
 			ip2);
 }
+
+/*
+ * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry
+ * @target_name in directory @target_dp point to @src_ip and remove the
+ * original entry, cleaning up everything left behind.
+ *
+ * Cleanup involves dropping a link count on @target_ip, and either removing
+ * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry
+ * with (@src_name, @wip) if a whiteout inode @wip is supplied.
+ *
+ * All inodes must have the ILOCK held.  We assume that if @src_ip is a
+ * directory then its '..' doesn't already point to @target_dp, and that @wip
+ * is a freshly allocated whiteout.
+ */
+int
+xfs_dir_rename_children(
+	struct xfs_trans	*tp,
+	struct xfs_dir_update	*du_src,
+	struct xfs_dir_update	*du_tgt,
+	unsigned int		spaceres,
+	struct xfs_dir_update	*du_wip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*src_dp = du_src->dp;
+	const struct xfs_name	*src_name = du_src->name;
+	struct xfs_inode	*src_ip = du_src->ip;
+	struct xfs_inode	*target_dp = du_tgt->dp;
+	const struct xfs_name	*target_name = du_tgt->name;
+	struct xfs_inode	*target_ip = du_tgt->ip;
+	bool			new_parent = (src_dp != target_dp);
+	bool			src_is_directory;
+	int			error;
+
+	src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
+
+	/*
+	 * Check for expected errors before we dirty the transaction
+	 * so we can return an error without a transaction abort.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If there's no space reservation, check the entry will
+		 * fit before actually inserting it.
+		 */
+		if (!spaceres) {
+			error = xfs_dir_canenter(tp, target_dp, target_name);
+			if (error)
+				return error;
+		}
+	} else {
+		/*
+		 * If target exists and it's a directory, check that whether
+		 * it can be destroyed.
+		 */
+		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+		    (!xfs_dir_isempty(target_ip) ||
+		     (VFS_I(target_ip)->i_nlink > 2)))
+			return -EEXIST;
+	}
+
+	/*
+	 * Directory entry creation below may acquire the AGF. Remove
+	 * the whiteout from the unlinked list first to preserve correct
+	 * AGI/AGF locking order. This dirties the transaction so failures
+	 * after this point will abort and log recovery will clean up the
+	 * mess.
+	 *
+	 * For whiteouts, we need to bump the link count on the whiteout
+	 * inode. After this point, we have a real link, clear the tmpfile
+	 * state flag from the inode so it doesn't accidentally get misused
+	 * in future.
+	 */
+	if (du_wip->ip) {
+		struct xfs_perag	*pag;
+
+		ASSERT(VFS_I(du_wip->ip)->i_nlink == 0);
+
+		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino));
+		error = xfs_iunlink_remove(tp, pag, du_wip->ip);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
+
+		xfs_bumplink(tp, du_wip->ip);
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If target does not exist and the rename crosses
+		 * directories, adjust the target directory link count
+		 * to account for the ".." reference from the new entry.
+		 */
+		error = xfs_dir_createname(tp, target_dp, target_name,
+					   src_ip->i_ino, spaceres);
+		if (error)
+			return error;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		if (new_parent && src_is_directory) {
+			xfs_bumplink(tp, target_dp);
+		}
+	} else { /* target_ip != NULL */
+		/*
+		 * Link the source inode under the target name.
+		 * If the source inode is a directory and we are moving
+		 * it across directories, its ".." entry will be
+		 * inconsistent until we replace that down below.
+		 *
+		 * In case there is already an entry with the same
+		 * name at the destination directory, remove it first.
+		 */
+		error = xfs_dir_replace(tp, target_dp, target_name,
+					src_ip->i_ino, spaceres);
+		if (error)
+			return error;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		/*
+		 * Decrement the link count on the target since the target
+		 * dir no longer points to it.
+		 */
+		error = xfs_droplink(tp, target_ip);
+		if (error)
+			return error;
+
+		if (src_is_directory) {
+			/*
+			 * Drop the link from the old "." entry.
+			 */
+			error = xfs_droplink(tp, target_ip);
+			if (error)
+				return error;
+		}
+	} /* target_ip != NULL */
+
+	/*
+	 * Remove the source.
+	 */
+	if (new_parent && src_is_directory) {
+		/*
+		 * Rewrite the ".." entry to point to the new
+		 * directory.
+		 */
+		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+					target_dp->i_ino, spaceres);
+		ASSERT(error != -EEXIST);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * We always want to hit the ctime on the source inode.
+	 *
+	 * This isn't strictly required by the standards since the source
+	 * inode isn't really being changed, but old unix file systems did
+	 * it and some incremental backup programs won't work without it.
+	 */
+	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+	/*
+	 * Adjust the link count on src_dp.  This is necessary when
+	 * renaming a directory, either within one parent when
+	 * the target existed, or across two parent directories.
+	 */
+	if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+		/*
+		 * Decrement link count on src_directory since the
+		 * entry that's moved no longer points to it.
+		 */
+		error = xfs_droplink(tp, src_dp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * For whiteouts, we only need to update the source dirent with the
+	 * inode number of the whiteout inode rather than removing it
+	 * altogether.
+	 */
+	if (du_wip->ip)
+		error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino,
+					spaceres);
+	else
+		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+					   spaceres);
+	if (error)
+		return error;
+
+	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+	if (new_parent)
+		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+	/* Schedule parent pointer updates. */
+	error = xfs_parent_add(tp, du_wip->ppargs, src_dp, src_name,
+			du_wip->ip);
+	if (error)
+		return error;
+
+	error = xfs_parent_replace(tp, du_src->ppargs, src_dp, src_name,
+			target_dp, target_name, src_ip);
+	if (error)
+		return error;
+
+	return xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
+			target_ip);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index dbca60ec93462..5e8b18f3f0036 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -311,5 +311,8 @@ int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
 
 int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1,
 		struct xfs_dir_update *du2, unsigned int spaceres);
+int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src,
+		struct xfs_dir_update *du_tgt, unsigned int spaceres,
+		struct xfs_dir_update *du_wip);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1eb92c05d8b78..1a87ccf4e0474 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2358,29 +2358,37 @@ xfs_rename_alloc_whiteout(
  */
 int
 xfs_rename(
-	struct mnt_idmap	*idmap,
-	struct xfs_inode	*src_dp,
-	struct xfs_name		*src_name,
-	struct xfs_inode	*src_ip,
-	struct xfs_inode	*target_dp,
-	struct xfs_name		*target_name,
-	struct xfs_inode	*target_ip,
-	unsigned int		flags)
+	struct mnt_idmap		*idmap,
+	struct xfs_inode		*src_dp,
+	struct xfs_name			*src_name,
+	struct xfs_inode		*src_ip,
+	struct xfs_inode		*target_dp,
+	struct xfs_name			*target_name,
+	struct xfs_inode		*target_ip,
+	unsigned int			flags)
 {
-	struct xfs_mount	*mp = src_dp->i_mount;
-	struct xfs_trans	*tp;
-	struct xfs_inode	*wip = NULL;		/* whiteout inode */
-	struct xfs_inode	*inodes[__XFS_SORT_INODES];
-	struct xfs_parent_args	*src_ppargs = NULL;
-	struct xfs_parent_args	*tgt_ppargs = NULL;
-	struct xfs_parent_args	*wip_ppargs = NULL;
-	int			i;
-	int			num_inodes = __XFS_SORT_INODES;
-	bool			new_parent = (src_dp != target_dp);
-	bool			src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
-	int			spaceres;
-	bool			retried = false;
-	int			error, nospace_error = 0;
+	struct xfs_dir_update		du_src = {
+		.dp			= src_dp,
+		.name			= src_name,
+		.ip			= src_ip,
+	};
+	struct xfs_dir_update		du_tgt = {
+		.dp			= target_dp,
+		.name			= target_name,
+		.ip			= target_ip,
+	};
+	struct xfs_dir_update		du_wip = { };
+	struct xfs_mount		*mp = src_dp->i_mount;
+	struct xfs_trans		*tp;
+	struct xfs_inode		*inodes[__XFS_SORT_INODES];
+	int				i;
+	int				num_inodes = __XFS_SORT_INODES;
+	bool				new_parent = (src_dp != target_dp);
+	bool				src_is_directory =
+						S_ISDIR(VFS_I(src_ip)->i_mode);
+	int				spaceres;
+	bool				retried = false;
+	int				error, nospace_error = 0;
 
 	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
 
@@ -2393,8 +2401,8 @@ xfs_rename(
 	 * appropriately.
 	 */
 	if (flags & RENAME_WHITEOUT) {
-		error = xfs_rename_alloc_whiteout(idmap, src_name,
-						  target_dp, &wip);
+		error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp,
+				&du_wip.ip);
 		if (error)
 			return error;
 
@@ -2402,21 +2410,21 @@ xfs_rename(
 		src_name->type = XFS_DIR3_FT_CHRDEV;
 	}
 
-	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
-				inodes, &num_inodes);
+	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, du_wip.ip,
+			inodes, &num_inodes);
 
-	error = xfs_parent_start(mp, &src_ppargs);
+	error = xfs_parent_start(mp, &du_src.ppargs);
 	if (error)
 		goto out_release_wip;
 
-	if (wip) {
-		error = xfs_parent_start(mp, &wip_ppargs);
+	if (du_wip.ip) {
+		error = xfs_parent_start(mp, &du_wip.ppargs);
 		if (error)
 			goto out_src_ppargs;
 	}
 
 	if (target_ip) {
-		error = xfs_parent_start(mp, &tgt_ppargs);
+		error = xfs_parent_start(mp, &du_tgt.ppargs);
 		if (error)
 			goto out_wip_ppargs;
 	}
@@ -2424,7 +2432,7 @@ xfs_rename(
 retry:
 	nospace_error = 0;
 	spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
-			target_name->len, wip != NULL);
+			target_name->len, du_wip.ip != NULL);
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		nospace_error = error;
@@ -2439,7 +2447,7 @@ xfs_rename(
 	 * We don't allow reservationless renaming when parent pointers are
 	 * enabled because we can't back out if the xattrs must grow.
 	 */
-	if (src_ppargs && nospace_error) {
+	if (du_src.ppargs && nospace_error) {
 		error = nospace_error;
 		xfs_trans_cancel(tp);
 		goto out_tgt_ppargs;
@@ -2471,8 +2479,8 @@ xfs_rename(
 	xfs_trans_ijoin(tp, src_ip, 0);
 	if (target_ip)
 		xfs_trans_ijoin(tp, target_ip, 0);
-	if (wip)
-		xfs_trans_ijoin(tp, wip, 0);
+	if (du_wip.ip)
+		xfs_trans_ijoin(tp, du_wip.ip, 0);
 
 	/*
 	 * If we are using project inheritance, we only allow renames
@@ -2488,8 +2496,8 @@ xfs_rename(
 	/* RENAME_EXCHANGE is unique from here on. */
 	if (flags & RENAME_EXCHANGE) {
 		error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
-				src_ppargs, target_dp, target_name, target_ip,
-				tgt_ppargs, spaceres);
+				du_src.ppargs, target_dp, target_name,
+				target_ip, du_tgt.ppargs, spaceres);
 		nospace_error = 0;
 		goto out_unlock;
 	}
@@ -2524,38 +2532,11 @@ xfs_rename(
 	 * We don't allow quotaless renaming when parent pointers are enabled
 	 * because we can't back out if the xattrs must grow.
 	 */
-	if (src_ppargs && nospace_error) {
+	if (du_src.ppargs && nospace_error) {
 		error = nospace_error;
 		goto out_trans_cancel;
 	}
 
-	/*
-	 * Check for expected errors before we dirty the transaction
-	 * so we can return an error without a transaction abort.
-	 */
-	if (target_ip == NULL) {
-		/*
-		 * If there's no space reservation, check the entry will
-		 * fit before actually inserting it.
-		 */
-		if (!spaceres) {
-			error = xfs_dir_canenter(tp, target_dp, target_name);
-			if (error)
-				goto out_trans_cancel;
-		}
-	} else {
-		/*
-		 * If target exists and it's a directory, check that whether
-		 * it can be destroyed.
-		 */
-		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
-		    (!xfs_dir_isempty(target_ip) ||
-		     (VFS_I(target_ip)->i_nlink > 2))) {
-			error = -EEXIST;
-			goto out_trans_cancel;
-		}
-	}
-
 	/*
 	 * Lock the AGI buffers we need to handle bumping the nlink of the
 	 * whiteout inode off the unlinked list and to handle dropping the
@@ -2567,7 +2548,7 @@ xfs_rename(
 	 * target_ip is either null or an empty directory.
 	 */
 	for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
-		if (inodes[i] == wip ||
+		if (inodes[i] == du_wip.ip ||
 		    (inodes[i] == target_ip &&
 		     (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
 			struct xfs_perag	*pag;
@@ -2582,165 +2563,20 @@ xfs_rename(
 		}
 	}
 
-	/*
-	 * Directory entry creation below may acquire the AGF. Remove
-	 * the whiteout from the unlinked list first to preserve correct
-	 * AGI/AGF locking order. This dirties the transaction so failures
-	 * after this point will abort and log recovery will clean up the
-	 * mess.
-	 *
-	 * For whiteouts, we need to bump the link count on the whiteout
-	 * inode. After this point, we have a real link, clear the tmpfile
-	 * state flag from the inode so it doesn't accidentally get misused
-	 * in future.
-	 */
-	if (wip) {
-		struct xfs_perag	*pag;
-
-		ASSERT(VFS_I(wip)->i_nlink == 0);
-
-		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
-		error = xfs_iunlink_remove(tp, pag, wip);
-		xfs_perag_put(pag);
-		if (error)
-			goto out_trans_cancel;
-
-		xfs_bumplink(tp, wip);
-		VFS_I(wip)->i_state &= ~I_LINKABLE;
-	}
-
-	/*
-	 * Set up the target.
-	 */
-	if (target_ip == NULL) {
-		/*
-		 * If target does not exist and the rename crosses
-		 * directories, adjust the target directory link count
-		 * to account for the ".." reference from the new entry.
-		 */
-		error = xfs_dir_createname(tp, target_dp, target_name,
-					   src_ip->i_ino, spaceres);
-		if (error)
-			goto out_trans_cancel;
-
-		xfs_trans_ichgtime(tp, target_dp,
-					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-		if (new_parent && src_is_directory) {
-			xfs_bumplink(tp, target_dp);
-		}
-	} else { /* target_ip != NULL */
-		/*
-		 * Link the source inode under the target name.
-		 * If the source inode is a directory and we are moving
-		 * it across directories, its ".." entry will be
-		 * inconsistent until we replace that down below.
-		 *
-		 * In case there is already an entry with the same
-		 * name at the destination directory, remove it first.
-		 */
-		error = xfs_dir_replace(tp, target_dp, target_name,
-					src_ip->i_ino, spaceres);
-		if (error)
-			goto out_trans_cancel;
-
-		xfs_trans_ichgtime(tp, target_dp,
-					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-		/*
-		 * Decrement the link count on the target since the target
-		 * dir no longer points to it.
-		 */
-		error = xfs_droplink(tp, target_ip);
-		if (error)
-			goto out_trans_cancel;
-
-		if (src_is_directory) {
-			/*
-			 * Drop the link from the old "." entry.
-			 */
-			error = xfs_droplink(tp, target_ip);
-			if (error)
-				goto out_trans_cancel;
-		}
-	} /* target_ip != NULL */
-
-	/*
-	 * Remove the source.
-	 */
-	if (new_parent && src_is_directory) {
-		/*
-		 * Rewrite the ".." entry to point to the new
-		 * directory.
-		 */
-		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
-					target_dp->i_ino, spaceres);
-		ASSERT(error != -EEXIST);
-		if (error)
-			goto out_trans_cancel;
-	}
-
-	/*
-	 * We always want to hit the ctime on the source inode.
-	 *
-	 * This isn't strictly required by the standards since the source
-	 * inode isn't really being changed, but old unix file systems did
-	 * it and some incremental backup programs won't work without it.
-	 */
-	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
-
-	/*
-	 * Adjust the link count on src_dp.  This is necessary when
-	 * renaming a directory, either within one parent when
-	 * the target existed, or across two parent directories.
-	 */
-	if (src_is_directory && (new_parent || target_ip != NULL)) {
+	error = xfs_dir_rename_children(tp, &du_src, &du_tgt, spaceres,
+			&du_wip);
+	if (error)
+		goto out_trans_cancel;
 
+	if (du_wip.ip) {
 		/*
-		 * Decrement link count on src_directory since the
-		 * entry that's moved no longer points to it.
+		 * Now we have a real link, clear the "I'm a tmpfile" state
+		 * flag from the inode so it doesn't accidentally get misused in
+		 * future.
 		 */
-		error = xfs_droplink(tp, src_dp);
-		if (error)
-			goto out_trans_cancel;
+		VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
 	}
 
-	/*
-	 * For whiteouts, we only need to update the source dirent with the
-	 * inode number of the whiteout inode rather than removing it
-	 * altogether.
-	 */
-	if (wip)
-		error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
-					spaceres);
-	else
-		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
-					   spaceres);
-
-	if (error)
-		goto out_trans_cancel;
-
-	/* Schedule parent pointer updates. */
-	error = xfs_parent_add(tp, wip_ppargs, src_dp, src_name, wip);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_parent_replace(tp, src_ppargs, src_dp, src_name, target_dp,
-			target_name, src_ip);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_parent_remove(tp, tgt_ppargs, target_dp, target_name,
-			target_ip);
-	if (error)
-		goto out_trans_cancel;
-
-	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-	if (new_parent)
-		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-
 	/*
 	 * Inform our hook clients that we've finished a rename operation as
 	 * follows: removed the source and target files from their directories;
@@ -2753,8 +2589,8 @@ xfs_rename(
 		xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
 	xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
 	xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
-	if (wip)
-		xfs_dir_update_hook(src_dp, wip, 1, src_name);
+	if (du_wip.ip)
+		xfs_dir_update_hook(src_dp, du_wip.ip, 1, src_name);
 
 	error = xfs_finish_rename(tp);
 	nospace_error = 0;
@@ -2765,14 +2601,14 @@ xfs_rename(
 out_unlock:
 	xfs_iunlock_rename(inodes, num_inodes);
 out_tgt_ppargs:
-	xfs_parent_finish(mp, tgt_ppargs);
+	xfs_parent_finish(mp, du_tgt.ppargs);
 out_wip_ppargs:
-	xfs_parent_finish(mp, wip_ppargs);
+	xfs_parent_finish(mp, du_wip.ppargs);
 out_src_ppargs:
-	xfs_parent_finish(mp, src_ppargs);
+	xfs_parent_finish(mp, du_src.ppargs);
 out_release_wip:
-	if (wip)
-		xfs_irele(wip);
+	if (du_wip.ip)
+		xfs_irele(du_wip.ip);
 	if (error == -ENOSPC && nospace_error)
 		error = nospace_error;
 	return error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/21] xfs: move dirent update hooks to xfs_dir2.c
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 21:05   ` [PATCH 19/21] xfs: create libxfs helper to rename " Darrick J. Wong
@ 2023-12-31 21:05   ` Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 21/21] xfs: get rid of trivial rename helpers Darrick J. Wong
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:05 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the directory entry update hook code to xfs_dir2 so that it is
mostly consolidated with the higher level directory functions.  Retain
the exports so that online fsck can still send notifications through the
hooks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2.c |  125 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_dir2.h |   24 +++++++++
 fs/xfs/scrub/common.c    |    1 
 fs/xfs/xfs_inode.c       |  108 ----------------------------------------
 fs/xfs/xfs_inode.h       |   25 ---------
 fs/xfs/xfs_symlink.c     |    2 -
 6 files changed, 145 insertions(+), 140 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 0ec653d1d5b8d..a8a611e60ed08 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -788,6 +788,72 @@ xfs_dir2_compname(
 	return xfs_da_compname(args, name, len);
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+	struct xfs_inode		*dp,
+	struct xfs_inode		*ip,
+	int				delta,
+	const struct xfs_name		*name)
+{
+	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+		struct xfs_dir_update_params	p = {
+			.dp		= dp,
+			.ip		= ip,
+			.delta		= delta,
+			.name		= name,
+		};
+		struct xfs_mount	*mp = ip->i_mount;
+
+		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+	}
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip
  * into @dp under the given @name.  If @ip is a directory, it will be
@@ -829,7 +895,12 @@ xfs_dir_create_child(
 	 * If we have parent pointers, we need to add the attribute containing
 	 * the parent information now.
 	 */
-	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, 1, name);
+	return 0;
 }
 
 /*
@@ -887,7 +958,12 @@ xfs_dir_add_child(
 	 * attribute, we need to create it correctly, otherwise we can just add
 	 * the parent to the inode.
 	 */
-	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, 1, name);
+	return 0;
 }
 
 /*
@@ -962,7 +1038,12 @@ xfs_dir_remove_child(
 	}
 
 	/* Remove parent pointer. */
-	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, -1, name);
+	return 0;
 }
 
 /*
@@ -1078,8 +1159,24 @@ xfs_dir_exchange_children(
 	if (error)
 		return error;
 
-	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
+	error = xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
 			ip2);
+	if (error)
+		return error;
+
+	/*
+	 * Inform our hook clients that we've finished an exchange operation as
+	 * follows: removed the source and target files from their directories;
+	 * added the target to the source directory; and added the source to
+	 * the target directory.  All inodes are locked, so it's ok to model a
+	 * rename this way so long as we say we deleted entries before we add
+	 * new ones.
+	 */
+	xfs_dir_update_hook(dp1, ip1, -1, name1);
+	xfs_dir_update_hook(dp2, ip2, -1, name2);
+	xfs_dir_update_hook(dp1, ip2, 1, name1);
+	xfs_dir_update_hook(dp2, ip1, 1, name2);
+	return 0;
 }
 
 /*
@@ -1294,6 +1391,24 @@ xfs_dir_rename_children(
 	if (error)
 		return error;
 
-	return xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
+	error = xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
 			target_ip);
+	if (error)
+		return error;
+
+	/*
+	 * Inform our hook clients that we've finished a rename operation as
+	 * follows: removed the source and target files from their directories;
+	 * that we've added the source to the target directory; and finally
+	 * that we've added the whiteout, if there was one.  All inodes are
+	 * locked, so it's ok to model a rename this way so long as we say we
+	 * deleted entries before we add new ones.
+	 */
+	if (target_ip)
+		xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+	xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+	xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+	if (du_wip->ip)
+		xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name);
+	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 5e8b18f3f0036..57b124abb17f4 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -293,6 +293,30 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
 	return c;
 }
 
+struct xfs_dir_update_params {
+	const struct xfs_inode	*dp;
+	const struct xfs_inode	*ip;
+	const struct xfs_name	*name;
+	int			delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+		int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+	struct xfs_hook		dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 struct xfs_parent_args;
 
 struct xfs_dir_update {
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 9a8bd6f050af9..74be3b7341b51 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -26,6 +26,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
 #include "xfs_attr.h"
 #include "xfs_reflink.h"
 #include "xfs_ag.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1a87ccf4e0474..bafa13ae3c1d7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -663,72 +663,6 @@ xfs_icreate_args_rootfile(
 		args->flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
 }
 
-#ifdef CONFIG_XFS_LIVE_HOOKS
-/*
- * Use a static key here to reduce the overhead of directory live update hooks.
- * If the compiler supports jump labels, the static branch will be replaced by
- * a nop sled when there are no hook users.  Online fsck is currently the only
- * caller, so this is a reasonable tradeoff.
- *
- * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
- * parts of the kernel allocate memory with that lock held, which means that
- * XFS callers cannot hold any locks that might be used by memory reclaim or
- * writeback when calling the static_branch_{inc,dec} functions.
- */
-DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
-
-void
-xfs_dir_hook_disable(void)
-{
-	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
-}
-
-void
-xfs_dir_hook_enable(void)
-{
-	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
-}
-
-/* Call hooks for a directory update relating to a child dirent update. */
-inline void
-xfs_dir_update_hook(
-	struct xfs_inode		*dp,
-	struct xfs_inode		*ip,
-	int				delta,
-	const struct xfs_name		*name)
-{
-	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
-		struct xfs_dir_update_params	p = {
-			.dp		= dp,
-			.ip		= ip,
-			.delta		= delta,
-			.name		= name,
-		};
-		struct xfs_mount	*mp = ip->i_mount;
-
-		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
-	}
-}
-
-/* Call the specified function during a directory update. */
-int
-xfs_dir_hook_add(
-	struct xfs_mount	*mp,
-	struct xfs_dir_hook	*hook)
-{
-	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
-}
-
-/* Stop calling the specified function during a directory update. */
-void
-xfs_dir_hook_del(
-	struct xfs_mount	*mp,
-	struct xfs_dir_hook	*hook)
-{
-	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
-}
-#endif /* CONFIG_XFS_LIVE_HOOKS */
-
 int
 xfs_icreate_dqalloc(
 	const struct xfs_icreate_args	*args,
@@ -841,12 +775,6 @@ xfs_create(
 	if (error)
 		goto out_trans_cancel;
 
-	/*
-	 * Create ip with a reference from dp, and add '.' and '..' references
-	 * if it's a directory.
-	 */
-	xfs_dir_update_hook(dp, du.ip, 1, name);
-
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * create transaction goes to disk before returning to
@@ -1060,8 +988,6 @@ xfs_link(
 	if (error)
 		goto error_return;
 
-	xfs_dir_update_hook(tdp, sip, 1, target_name);
-
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * link transaction goes to disk before returning to
@@ -2120,12 +2046,6 @@ xfs_remove(
 	if (error)
 		goto out_trans_cancel;
 
-	/*
-	 * Drop the link from dp to ip, and if ip was a directory, remove the
-	 * '.' and '..' references since we freed the directory.
-	 */
-	xfs_dir_update_hook(dp, ip, -1, name);
-
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * remove transaction goes to disk before returning to
@@ -2282,19 +2202,6 @@ xfs_cross_rename(
 	if (error)
 		goto out_trans_abort;
 
-	/*
-	 * Inform our hook clients that we've finished an exchange operation as
-	 * follows: removed the source and target files from their directories;
-	 * added the target to the source directory; and added the source to
-	 * the target directory.  All inodes are locked, so it's ok to model a
-	 * rename this way so long as we say we deleted entries before we add
-	 * new ones.
-	 */
-	xfs_dir_update_hook(dp1, ip1, -1, name1);
-	xfs_dir_update_hook(dp2, ip2, -1, name2);
-	xfs_dir_update_hook(dp1, ip2, 1, name1);
-	xfs_dir_update_hook(dp2, ip1, 1, name2);
-
 	return xfs_finish_rename(tp);
 
 out_trans_abort:
@@ -2577,21 +2484,6 @@ xfs_rename(
 		VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
 	}
 
-	/*
-	 * Inform our hook clients that we've finished a rename operation as
-	 * follows: removed the source and target files from their directories;
-	 * that we've added the source to the target directory; and finally
-	 * that we've added the whiteout, if there was one.  All inodes are
-	 * locked, so it's ok to model a rename this way so long as we say we
-	 * deleted entries before we add new ones.
-	 */
-	if (target_ip)
-		xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
-	xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
-	xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
-	if (du_wip.ip)
-		xfs_dir_update_hook(src_dp, du_wip.ip, 1, src_name);
-
 	error = xfs_finish_rename(tp);
 	nospace_error = 0;
 	goto out_unlock;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6433492e7301f..cc6a075505636 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -630,31 +630,6 @@ void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
 unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
 
-struct xfs_dir_update_params {
-	const struct xfs_inode	*dp;
-	const struct xfs_inode	*ip;
-	const struct xfs_name	*name;
-	int			delta;
-};
-
-#ifdef CONFIG_XFS_LIVE_HOOKS
-void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
-		int delta, const struct xfs_name *name);
-
-struct xfs_dir_hook {
-	struct xfs_hook		dirent_hook;
-};
-
-void xfs_dir_hook_disable(void);
-void xfs_dir_hook_enable(void);
-
-int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
-void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
-
-#else
-# define xfs_dir_update_hook(dp, ip, delta, name)	((void)0)
-#endif /* CONFIG_XFS_LIVE_HOOKS */
-
 void xfs_icreate_args_inherit(struct xfs_icreate_args *args,
 		struct xfs_inode *dp, struct mnt_idmap *idmap, umode_t mode,
 		bool init_xattrs);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a923160a7dae0..e7936858dfcf5 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -204,8 +204,6 @@ xfs_symlink(
 	if (error)
 		goto out_trans_cancel;
 
-	xfs_dir_update_hook(dp, du.ip, 1, link_name);
-
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * symlink transaction goes to disk before returning to


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/21] xfs: get rid of trivial rename helpers
  2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 21:05   ` [PATCH 20/21] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
@ 2023-12-31 21:06   ` Darrick J. Wong
  20 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:06 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Get rid of the largely pointless xfs_cross_rename and xfs_finish_rename
now that we've refactored its parent.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_inode.c |   77 +++++++++-------------------------------------------
 1 file changed, 14 insertions(+), 63 deletions(-)


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bafa13ae3c1d7..789b9603989c5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2152,63 +2152,6 @@ xfs_sort_inodes(
 	}
 }
 
-static int
-xfs_finish_rename(
-	struct xfs_trans	*tp)
-{
-	/*
-	 * If this is a synchronous mount, make sure that the rename transaction
-	 * goes to disk before returning to the user.
-	 */
-	if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
-		xfs_trans_set_sync(tp);
-
-	return xfs_trans_commit(tp);
-}
-
-/*
- * xfs_cross_rename()
- *
- * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
- */
-STATIC int
-xfs_cross_rename(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp1,
-	struct xfs_name		*name1,
-	struct xfs_inode	*ip1,
-	struct xfs_parent_args	*ip1_ppargs,
-	struct xfs_inode	*dp2,
-	struct xfs_name		*name2,
-	struct xfs_inode	*ip2,
-	struct xfs_parent_args	*ip2_ppargs,
-	int			spaceres)
-{
-	struct xfs_dir_update	du1 = {
-		.dp		= dp1,
-		.name		= name1,
-		.ip		= ip1,
-		.ppargs		= ip1_ppargs,
-	};
-	struct xfs_dir_update	du2 = {
-		.dp		= dp2,
-		.name		= name2,
-		.ip		= ip2,
-		.ppargs		= ip2_ppargs,
-	};
-	int			error;
-
-	error = xfs_dir_exchange_children(tp, &du1, &du2, spaceres);
-	if (error)
-		goto out_trans_abort;
-
-	return xfs_finish_rename(tp);
-
-out_trans_abort:
-	xfs_trans_cancel(tp);
-	return error;
-}
-
 /*
  * xfs_rename_alloc_whiteout()
  *
@@ -2402,11 +2345,11 @@ xfs_rename(
 
 	/* RENAME_EXCHANGE is unique from here on. */
 	if (flags & RENAME_EXCHANGE) {
-		error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
-				du_src.ppargs, target_dp, target_name,
-				target_ip, du_tgt.ppargs, spaceres);
-		nospace_error = 0;
-		goto out_unlock;
+		error = xfs_dir_exchange_children(tp, &du_src, &du_tgt,
+				spaceres);
+		if (error)
+			goto out_trans_cancel;
+		goto out_commit;
 	}
 
 	/*
@@ -2484,7 +2427,15 @@ xfs_rename(
 		VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
 	}
 
-	error = xfs_finish_rename(tp);
+out_commit:
+	/*
+	 * If this is a synchronous mount, make sure that the rename
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp);
 	nospace_error = 0;
 	goto out_unlock;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/32] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
@ 2023-12-31 21:06   ` Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 02/32] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
                     ` (30 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:06 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the XFS_SB_CRC_OFF macro uses the incore superblock struct
(xfs_sb) to compute the address of sb_crc within the ondisk superblock
struct (xfs_dsb).  This is a landmine if we ever change the layout of
the incore superblock (as we're about to do), so redefine the macro
to use xfs_dsb to compute the layout of xfs_dsb.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h |    9 ++++-----
 fs/xfs/libxfs/xfs_ondisk.h |    1 +
 2 files changed, 5 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b0aaa825539f6..c60af43696326 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -90,8 +90,7 @@ struct xfs_ifork;
 #define XFSLABEL_MAX			12
 
 /*
- * Superblock - in core version.  Must match the ondisk version below.
- * Must be padded to 64 bit alignment.
+ * Superblock - in core version.  Must be padded to 64 bit alignment.
  */
 typedef struct xfs_sb {
 	uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
@@ -178,10 +177,8 @@ typedef struct xfs_sb {
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
-#define XFS_SB_CRC_OFF		offsetof(struct xfs_sb, sb_crc)
-
 /*
- * Superblock - on disk version.  Must match the in core version above.
+ * Superblock - on disk version.
  * Must be padded to 64 bit alignment.
  */
 struct xfs_dsb {
@@ -265,6 +262,8 @@ struct xfs_dsb {
 	/* must be padded to 64 bit alignment */
 };
 
+#define XFS_SB_CRC_OFF		offsetof(struct xfs_dsb, sb_crc)
+
 /*
  * Misc. Flags - warning - these will be cleared by xfs_repair unless
  * a feature bit is set when the flag is used.
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index bffd39242d487..832d96f0f3c54 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -85,6 +85,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t,	12);
 	 */
 
+	XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc,		224);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen,	0);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen,	2);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval,	3);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/32] xfs: create imeta abstractions to get and set metadata inodes
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 01/32] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
@ 2023-12-31 21:06   ` Darrick J. Wong
  2023-12-31 21:07   ` [PATCH 03/32] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
                     ` (29 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:06 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create some helper routines to get and set metadata inode numbers
instead of open-coding them throughout xfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile           |    2 
 fs/xfs/libxfs/xfs_imeta.c |  411 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h |   46 +++++
 fs/xfs/libxfs/xfs_types.c |    5 -
 fs/xfs/xfs_imeta_utils.c  |  198 ++++++++++++++++++++++
 fs/xfs/xfs_imeta_utils.h  |   24 +++
 fs/xfs/xfs_mount.c        |   28 +++
 fs/xfs/xfs_trace.c        |    1 
 fs/xfs/xfs_trace.h        |   87 +++++++++-
 9 files changed, 798 insertions(+), 4 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_imeta.c
 create mode 100644 fs/xfs/libxfs/xfs_imeta.h
 create mode 100644 fs/xfs/xfs_imeta_utils.c
 create mode 100644 fs/xfs/xfs_imeta_utils.h


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6f7b0683a46cd..29cf5a5f8104a 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,6 +37,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_ialloc.o \
 				   xfs_ialloc_btree.o \
 				   xfs_iext_tree.o \
+				   xfs_imeta.o \
 				   xfs_inode_fork.o \
 				   xfs_inode_buf.o \
 				   xfs_inode_util.o \
@@ -80,6 +81,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_health.o \
 				   xfs_icache.o \
+				   xfs_imeta_utils.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
new file mode 100644
index 0000000000000..717e67b3264cf
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_imeta.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Metadata File Management
+ * ========================
+ *
+ * These functions provide an abstraction layer for looking up, creating, and
+ * deleting metadata inodes.  These pointers live in the in-core superblock,
+ * so the functions moderate access to those fields and take care of logging.
+ *
+ * For the five existing metadata inodes (real time bitmap & summary; and the
+ * user, group, and quotas) we'll continue to maintain the in-core superblock
+ * inodes for reads and only require xfs_imeta_create and xfs_imeta_unlink to
+ * persist changes.  New metadata inode types must only use the xfs_imeta_*
+ * functions.
+ *
+ * Callers wishing to create or unlink a metadata inode must pass in a
+ * xfs_imeta_end structure.  After committing or cancelling the transaction,
+ * this structure must be passed to xfs_imeta_end_update to free resources that
+ * cannot be freed during the transaction.
+ *
+ * Right now we only support callers passing in the predefined metadata inode
+ * paths; the goal is that callers will some day locate metadata inodes based
+ * on path lookups into a metadata directory structure.
+ */
+
+/* Static metadata inode paths */
+
+const struct xfs_imeta_path XFS_IMETA_RTBITMAP = {
+	.bogus = 0,
+};
+
+const struct xfs_imeta_path XFS_IMETA_RTSUMMARY = {
+	.bogus = 1,
+};
+
+const struct xfs_imeta_path XFS_IMETA_USRQUOTA = {
+	.bogus = 2,
+};
+
+const struct xfs_imeta_path XFS_IMETA_GRPQUOTA = {
+	.bogus = 3,
+};
+
+const struct xfs_imeta_path XFS_IMETA_PRJQUOTA = {
+	.bogus = 4,
+};
+
+/* Are these two paths equal? */
+STATIC bool
+xfs_imeta_path_compare(
+	const struct xfs_imeta_path	*a,
+	const struct xfs_imeta_path	*b)
+{
+	return a == b;
+}
+
+/* Is this path ok? */
+static inline bool
+xfs_imeta_path_check(
+	const struct xfs_imeta_path	*path)
+{
+	return true;
+}
+
+/* Functions for storing and retrieving superblock inode values. */
+
+/* Mapping of metadata inode paths to in-core superblock values. */
+static const struct xfs_imeta_sbmap {
+	const struct xfs_imeta_path	*path;
+	unsigned int			offset;
+} xfs_imeta_sbmaps[] = {
+	{
+		.path	= &XFS_IMETA_RTBITMAP,
+		.offset	= offsetof(struct xfs_sb, sb_rbmino),
+	},
+	{
+		.path	= &XFS_IMETA_RTSUMMARY,
+		.offset	= offsetof(struct xfs_sb, sb_rsumino),
+	},
+	{
+		.path	= &XFS_IMETA_USRQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_uquotino),
+	},
+	{
+		.path	= &XFS_IMETA_GRPQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_gquotino),
+	},
+	{
+		.path	= &XFS_IMETA_PRJQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_pquotino),
+	},
+	{ NULL, 0 },
+};
+
+/* Return a pointer to the in-core superblock inode value. */
+static inline xfs_ino_t *
+xfs_imeta_sbmap_to_inop(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_sbmap	*map)
+{
+	return (xfs_ino_t *)(((char *)&mp->m_sb) + map->offset);
+}
+
+/* Compute location of metadata inode pointer in the in-core superblock */
+static inline xfs_ino_t *
+xfs_imeta_path_to_sb_inop(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	const struct xfs_imeta_sbmap	*p;
+
+	for (p = xfs_imeta_sbmaps; p->path; p++)
+		if (xfs_imeta_path_compare(p->path, path))
+			return xfs_imeta_sbmap_to_inop(mp, p);
+
+	return NULL;
+}
+
+/* Look up a superblock metadata inode by its path. */
+STATIC int
+xfs_imeta_sb_lookup(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	xfs_ino_t			*sb_inop;
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	trace_xfs_imeta_sb_lookup(mp, sb_inop);
+	*inop = *sb_inop;
+	return 0;
+}
+
+/* Update inode pointers in the superblock. */
+static inline void
+xfs_imeta_log_sb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp);
+
+	/*
+	 * Update the inode flags in the ondisk superblock without touching
+	 * the summary counters.  We have not quiesced inode chunk allocation,
+	 * so we cannot coordinate with updates to the icount and ifree percpu
+	 * counters.
+	 */
+	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+}
+
+/*
+ * Create a new metadata inode and set a superblock pointer to this new inode.
+ * The superblock field must not already be pointing to an inode.
+ */
+STATIC int
+xfs_imeta_sb_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode)
+{
+	struct xfs_icreate_args		args = {
+		.nlink			= S_ISDIR(mode) ? 2 : 1,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	int				error;
+
+	/* Files rooted in the superblock do not have parents. */
+	xfs_icreate_args_rootfile(&args, mp, mode, false);
+
+	/* Reject if the sb already points to some inode. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	if (*sb_inop != NULLFSINO)
+		return -EEXIST;
+
+	/* Create a new inode and set the sb pointer. */
+	error = xfs_dialloc(&upd->tp, 0, mode, &ino);
+	if (error)
+		return error;
+	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+	if (error)
+		return error;
+	upd->ip_locked = true;
+
+	/*
+	 * If we ever need the ability to create rt metadata files on a
+	 * pre-metadir filesystem, we'll need to dqattach the child here.
+	 * Currently we assume that mkfs will create the files and quotacheck
+	 * will account for them.
+	 */
+
+	/* Update superblock pointer. */
+	*sb_inop = ino;
+	xfs_imeta_log_sb(upd->tp);
+
+	trace_xfs_imeta_sb_create(upd);
+	return 0;
+}
+
+/*
+ * Clear the given inode pointer from the superblock and drop the link count
+ * of the metadata inode.
+ */
+STATIC int
+xfs_imeta_sb_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	/* Reject if the sb doesn't point to the inode that was passed in. */
+	if (*sb_inop != upd->ip->i_ino)
+		return -ENOENT;
+
+	trace_xfs_imeta_sb_unlink(upd);
+
+	*sb_inop = NULLFSINO;
+	xfs_imeta_log_sb(upd->tp);
+	return xfs_droplink(upd->tp, upd->ip);
+}
+
+/* Set the given inode pointer in the superblock. */
+STATIC int
+xfs_imeta_sb_link(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+	if (*sb_inop != NULLFSINO)
+		return -EEXIST;
+
+	trace_xfs_imeta_sb_link(upd);
+
+	xfs_bumplink(upd->tp, upd->ip);
+	xfs_imeta_log_sb(upd->tp);
+
+	*sb_inop = upd->ip->i_ino;
+	return 0;
+}
+
+/* General functions for managing metadata inode pointers */
+
+/*
+ * Is this metadata inode pointer ok?  We allow the fields to be set to
+ * NULLFSINO if the metadata structure isn't present, and we don't allow
+ * obviously incorrect inode pointers.
+ */
+static inline bool
+xfs_imeta_verify(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	if (ino == NULLFSINO)
+		return true;
+	return xfs_verify_ino(mp, ino);
+}
+
+/* Look up a metadata inode by its path. */
+int
+xfs_imeta_lookup(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	xfs_ino_t			ino;
+	int				error;
+
+	ASSERT(xfs_imeta_path_check(path));
+
+	error = xfs_imeta_sb_lookup(mp, path, &ino);
+	if (error)
+		return error;
+
+	if (!xfs_imeta_verify(mp, ino))
+		return -EFSCORRUPTED;
+
+	*inop = ino;
+	return 0;
+}
+
+/*
+ * Create a metadata inode with the given @mode, and insert it into the
+ * metadata directory tree at the given @path.  The path (up to the final
+ * component) must already exist.
+ *
+ * The new metadata inode will be attached to the update structure @upd->ip,
+ * with the ILOCK held until the caller releases it.  @ipp is set to upd->ip
+ * as a convenience for callers.
+ *
+ * Callers must ensure that the root dquots are allocated, if applicable.
+ *
+ * NOTE: This function may return a new inode to the caller even if it returns
+ * a negative error code.  If an inode is passed back, the caller must finish
+ * setting up the inode before releasing it.
+ */
+int
+xfs_imeta_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode,
+	struct xfs_inode		**ipp)
+{
+	int				error;
+
+	ASSERT(xfs_imeta_path_check(upd->path));
+
+	*ipp = NULL;
+
+	error = xfs_imeta_sb_create(upd, mode);
+	*ipp = upd->ip;
+	return error;
+}
+
+/*
+ * Unlink a metadata inode @upd->ip from the metadata directory given by @path.
+ * The path must already exist.
+ */
+int
+xfs_imeta_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	ASSERT(xfs_imeta_path_check(upd->path));
+	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
+
+	return xfs_imeta_sb_unlink(upd);
+}
+
+/*
+ * Link the metadata directory given by @path to the inode @upd->ip.
+ * The path (up to the final component) must already exist, but the final
+ * component must not already exist.
+ */
+int
+xfs_imeta_link(
+	struct xfs_imeta_update		*upd)
+{
+	ASSERT(xfs_imeta_path_check(upd->path));
+
+	return xfs_imeta_sb_link(upd);
+}
+
+/* Does this inode number refer to a static metadata inode? */
+bool
+xfs_is_static_meta_ino(
+	struct xfs_mount		*mp,
+	xfs_ino_t			ino)
+{
+	const struct xfs_imeta_sbmap	*p;
+
+	if (ino == NULLFSINO)
+		return false;
+
+	for (p = xfs_imeta_sbmaps; p->path; p++)
+		if (ino == *xfs_imeta_sbmap_to_inop(mp, p))
+			return true;
+
+	return false;
+}
+
+/*
+ * Ensure that the in-core superblock has all the values that it should.
+ * Caller should pass in an empty transaction to avoid livelocking on btree
+ * cycles.
+ */
+int
+xfs_imeta_mount(
+	struct xfs_trans	*tp)
+{
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
new file mode 100644
index 0000000000000..c1833b8b1c977
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_IMETA_H__
+#define __XFS_IMETA_H__
+
+/* Key for looking up metadata inodes. */
+struct xfs_imeta_path {
+	/* Temporary: integer to keep the static imeta definitions unique */
+	int		bogus;
+};
+
+/* Cleanup widget for metadata inode creation and deletion. */
+struct xfs_imeta_update {
+	struct xfs_mount	*mp;
+	struct xfs_trans	*tp;
+
+	const struct xfs_imeta_path *path;
+
+	/* Metadata inode */
+	struct xfs_inode	*ip;
+
+	unsigned int		ip_locked:1;
+};
+
+/* Lookup keys for static metadata inodes. */
+extern const struct xfs_imeta_path XFS_IMETA_RTBITMAP;
+extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;
+extern const struct xfs_imeta_path XFS_IMETA_USRQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_GRPQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_PRJQUOTA;
+
+int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
+		xfs_ino_t *ino);
+
+int xfs_imeta_create(struct xfs_imeta_update *upd, umode_t mode,
+		struct xfs_inode **ipp);
+int xfs_imeta_unlink(struct xfs_imeta_update *upd);
+int xfs_imeta_link(struct xfs_imeta_update *upd);
+
+bool xfs_is_static_meta_ino(struct xfs_mount *mp, xfs_ino_t ino);
+int xfs_imeta_mount(struct xfs_trans *tp);
+
+#endif /* __XFS_IMETA_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index c299b16c9365f..72f7e050e600a 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -12,6 +12,7 @@
 #include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_ag.h"
+#include "xfs_imeta.h"
 
 
 /*
@@ -115,9 +116,7 @@ xfs_internal_inum(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
 {
-	return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
-		(xfs_has_quota(mp) &&
-		 xfs_is_quota_inode(&mp->m_sb, ino));
+	return xfs_is_static_meta_ino(mp, ino);
 }
 
 /*
diff --git a/fs/xfs/xfs_imeta_utils.c b/fs/xfs/xfs_imeta_utils.c
new file mode 100644
index 0000000000000..262422daa931f
--- /dev/null
+++ b/fs/xfs/xfs_imeta_utils.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_quota.h"
+#include "xfs_imeta.h"
+#include "xfs_imeta_utils.h"
+#include "xfs_trace.h"
+
+/* Initialize a metadata update structure. */
+static inline int
+xfs_imeta_init(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_imeta_update		*upd)
+{
+	memset(upd, 0, sizeof(struct xfs_imeta_update));
+	upd->mp = mp;
+	upd->path = path;
+	return 0;
+}
+
+/*
+ * Unlock and release resources after committing (or cancelling) a metadata
+ * directory tree operation.  The caller retains its reference to @upd->ip
+ * and must release it explicitly.
+ */
+static inline void
+xfs_imeta_teardown(
+	struct xfs_imeta_update		*upd,
+	int				error)
+{
+	trace_xfs_imeta_teardown(upd, error);
+
+	if (upd->ip) {
+		if (upd->ip_locked)
+			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+		upd->ip_locked = false;
+	}
+}
+
+/*
+ * Begin the process of creating a metadata file by allocating transactions
+ * and taking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_create(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_init(mp, path, upd);
+	if (error)
+		return error;
+
+	/*
+	 * If we ever need the ability to create rt metadata files on a
+	 * pre-metadir filesystem, we'll need to dqattach the parent here.
+	 * Currently we assume that mkfs will create the files and quotacheck
+	 * will account for them.
+	 */
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+			xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
+	if (error)
+		goto out_teardown;
+
+	trace_xfs_imeta_start_create(upd);
+	return 0;
+out_teardown:
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+static inline int
+xfs_imeta_start_dir_update(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_trans_res		*tr_resv,
+	unsigned int			resblks,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_init(mp, path, upd);
+	if (error)
+		return error;
+
+	upd->ip = ip;
+
+	error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0, false,
+			&upd->tp);
+	if (error)
+		goto out_teardown;
+
+	upd->ip_locked = true;
+	return 0;
+out_teardown:
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_link(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_link,
+			xfs_link_space_res(mp, MAXNAMELEN), upd);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_start_link(upd);
+	return 0;
+}
+
+/*
+ * Begin the process of unlinking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_unlink(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_remove,
+			xfs_remove_space_res(mp, MAXNAMELEN), upd);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_start_unlink(upd);
+	return 0;
+}
+
+/* Commit a metadir update and unlock/drop all resources. */
+int
+xfs_imeta_commit_update(
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	trace_xfs_imeta_update_commit(upd);
+
+	error = xfs_trans_commit(upd->tp);
+	upd->tp = NULL;
+
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/* Cancel a metadir update and unlock/drop all resources. */
+void
+xfs_imeta_cancel_update(
+	struct xfs_imeta_update		*upd,
+	int				error)
+{
+	trace_xfs_imeta_update_cancel(upd);
+
+	xfs_trans_cancel(upd->tp);
+	upd->tp = NULL;
+
+	xfs_imeta_teardown(upd, error);
+}
diff --git a/fs/xfs/xfs_imeta_utils.h b/fs/xfs/xfs_imeta_utils.h
new file mode 100644
index 0000000000000..0235f7048ff1d
--- /dev/null
+++ b/fs/xfs/xfs_imeta_utils.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_IMETA_UTILS_H__
+#define __XFS_IMETA_UTILS_H__
+
+int xfs_imeta_start_create(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_imeta_update *upd);
+
+int xfs_imeta_start_link(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_inode *ip, struct xfs_imeta_update *upd);
+
+int xfs_imeta_start_unlink(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_inode *ip, struct xfs_imeta_update *upd);
+
+int xfs_imeta_commit_update(struct xfs_imeta_update *upd);
+void xfs_imeta_cancel_update(struct xfs_imeta_update *upd, int error);
+
+#endif /* __XFS_IMETA_UTILS_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 85c7ca0b211b1..ab11fe9a4b4ab 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -34,6 +34,7 @@
 #include "xfs_health.h"
 #include "xfs_trace.h"
 #include "xfs_ag.h"
+#include "xfs_imeta.h"
 #include "scrub/stats.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -610,6 +611,29 @@ xfs_mount_setup_inode_geom(
 	xfs_ialloc_setup_geometry(mp);
 }
 
+STATIC int
+xfs_mount_setup_metadir(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	error = xfs_imeta_mount(tp);
+	if (error) {
+		xfs_warn(mp, "Failed to load metadata inodes, error %d",
+				error);
+		goto err_cancel;
+	}
+
+err_cancel:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
 /* Compute maximum possible height for per-AG btree types for this fs. */
 static inline void
 xfs_agbtree_compute_maxlevels(
@@ -846,6 +870,10 @@ xfs_mountfs(
 		mp->m_features |= XFS_FEAT_ATTR2;
 	}
 
+	error = xfs_mount_setup_metadir(mp);
+	if (error)
+		goto out_log_dealloc;
+
 	/*
 	 * Get and sanity-check the root inode.
 	 * Save the pointer to it in the mount structure.
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7ccb7b3473943..453cf7ffdea03 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -43,6 +43,7 @@
 #include "xfs_swapext.h"
 #include "xfs_xchgrange.h"
 #include "xfs_parent.h"
+#include "xfs_imeta.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 070c76f443737..bedfed2ef3e60 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -86,6 +86,7 @@ struct xfs_swapext_req;
 struct xfs_getparents;
 struct xfs_parent_name_irec;
 struct xfs_attrlist_cursor_kern;
+struct xfs_imeta_update;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -153,7 +154,7 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
 
-TRACE_EVENT(xlog_intent_recovery_failed,
+DECLARE_EVENT_CLASS(xfs_fs_error_class,
 	TP_PROTO(struct xfs_mount *mp, int error, void *function),
 	TP_ARGS(mp, error, function),
 	TP_STRUCT__entry(
@@ -170,6 +171,11 @@ TRACE_EVENT(xlog_intent_recovery_failed,
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->error, __entry->function)
 );
+#define DEFINE_FS_ERROR_EVENT(name)	\
+DEFINE_EVENT(xfs_fs_error_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, int error, void *function), \
+	TP_ARGS(mp, error, function))
+DEFINE_FS_ERROR_EVENT(xlog_intent_recovery_failed);
 
 DECLARE_EVENT_CLASS(xfs_perag_class,
 	TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip),
@@ -5054,6 +5060,85 @@ TRACE_EVENT(xfs_getparent_pointers,
 		  __entry->offset)
 );
 
+DECLARE_EVENT_CLASS(xfs_imeta_sb_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t *sb_inop),
+	TP_ARGS(mp, sb_inop),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, sb_offset)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->sb_offset = (char *)sb_inop - (char *)&mp->m_sb;
+		__entry->ino = *sb_inop;
+	),
+	TP_printk("dev %d:%d sb_offset 0x%x ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->sb_offset,
+		  __entry->ino)
+)
+
+#define DEFINE_IMETA_SB_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_sb_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t *sb_inop), \
+	TP_ARGS(mp, sb_inop))
+DEFINE_IMETA_SB_EVENT(xfs_imeta_sb_lookup);
+
+DECLARE_EVENT_CLASS(xfs_imeta_update_class,
+	TP_PROTO(const struct xfs_imeta_update *upd),
+	TP_ARGS(upd),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = upd->mp->m_super->s_dev;
+		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
+	),
+	TP_printk("dev %d:%d ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino)
+)
+
+#define DEFINE_IMETA_UPDATE_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_update_class, name, \
+	TP_PROTO(const struct xfs_imeta_update *upd), \
+	TP_ARGS(upd))
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_start_create);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_start_link);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_start_unlink);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_update_commit);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_update_cancel);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_create);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_unlink);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_link);
+
+DECLARE_EVENT_CLASS(xfs_imeta_update_error_class,
+	TP_PROTO(const struct xfs_imeta_update *upd, int error),
+	TP_ARGS(upd, error),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = upd->mp->m_super->s_dev;
+		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
+		__entry->error = error;
+	),
+	TP_printk("dev %d:%d ino 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->error)
+)
+
+#define DEFINE_IMETA_UPDATE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_update_error_class, name, \
+	TP_PROTO(const struct xfs_imeta_update *upd, int error), \
+	TP_ARGS(upd, error))
+DEFINE_IMETA_UPDATE_ERROR_EVENT(xfs_imeta_teardown);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/32] xfs: create transaction reservations for metadata inode operations
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 01/32] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
  2023-12-31 21:06   ` [PATCH 02/32] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
@ 2023-12-31 21:07   ` Darrick J. Wong
  2023-12-31 21:07   ` [PATCH 04/32] xfs: refactor the v4 group/project inode pointer switch Darrick J. Wong
                     ` (28 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:07 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create transaction reservation types and block reservation helpers to
help us calculate transaction requirements.  Right now the reservations
are the same as always; we're just separating the symbols for a future
patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.c      |    4 ++++
 fs/xfs/libxfs/xfs_imeta.h      |    4 ++++
 fs/xfs/libxfs/xfs_trans_resv.c |    5 +++++
 fs/xfs/libxfs/xfs_trans_resv.h |    3 +++
 fs/xfs/xfs_imeta_utils.c       |    8 +++++---
 5 files changed, 21 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 717e67b3264cf..497d28abaff10 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -19,6 +19,10 @@
 #include "xfs_inode.h"
 #include "xfs_quota.h"
 #include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_space.h"
 
 /*
  * Metadata File Management
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index c1833b8b1c977..7b3da865c0931 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -43,4 +43,8 @@ int xfs_imeta_link(struct xfs_imeta_update *upd);
 bool xfs_is_static_meta_ino(struct xfs_mount *mp, xfs_ino_t ino);
 int xfs_imeta_mount(struct xfs_trans *tp);
 
+unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
+unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
+unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
+
 #endif /* __XFS_IMETA_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index b390d9aa02142..d7c9af3406949 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -1249,4 +1249,9 @@ xfs_trans_resv_calc(
 	resp->tr_itruncate.tr_logcount += logcount_adj;
 	resp->tr_write.tr_logcount += logcount_adj;
 	resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+	/* metadata inode creation and unlink */
+	resp->tr_imeta_create = resp->tr_create;
+	resp->tr_imeta_link = resp->tr_link;
+	resp->tr_imeta_unlink = resp->tr_remove;
 }
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d26..6b851dfe1ac07 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -48,6 +48,9 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_qm_dqalloc;	/* allocate quota on disk */
 	struct xfs_trans_res	tr_sb;		/* modify superblock */
 	struct xfs_trans_res	tr_fsyncts;	/* update timestamps on fsync */
+	struct xfs_trans_res	tr_imeta_create; /* create metadata inode */
+	struct xfs_trans_res	tr_imeta_link;	/* link metadata inode */
+	struct xfs_trans_res	tr_imeta_unlink; /* unlink metadata inode */
 };
 
 /* shorthand way of accessing reservation structure */
diff --git a/fs/xfs/xfs_imeta_utils.c b/fs/xfs/xfs_imeta_utils.c
index 262422daa931f..65a3aea49a5fc 100644
--- a/fs/xfs/xfs_imeta_utils.c
+++ b/fs/xfs/xfs_imeta_utils.c
@@ -78,7 +78,7 @@ xfs_imeta_start_create(
 	 * will account for them.
 	 */
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_imeta_create,
 			xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
 	if (error)
 		goto out_teardown;
@@ -136,7 +136,8 @@ xfs_imeta_start_link(
 {
 	int				error;
 
-	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_link,
+	error = xfs_imeta_start_dir_update(mp, path, ip,
+			&M_RES(mp)->tr_imeta_link,
 			xfs_link_space_res(mp, MAXNAMELEN), upd);
 	if (error)
 		return error;
@@ -158,7 +159,8 @@ xfs_imeta_start_unlink(
 {
 	int				error;
 
-	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_remove,
+	error = xfs_imeta_start_dir_update(mp, path, ip,
+			&M_RES(mp)->tr_imeta_unlink,
 			xfs_remove_space_res(mp, MAXNAMELEN), upd);
 	if (error)
 		return error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/32] xfs: refactor the v4 group/project inode pointer switch
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:07   ` [PATCH 03/32] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
@ 2023-12-31 21:07   ` Darrick J. Wong
  2023-12-31 21:07   ` [PATCH 05/32] xfs: convert quota file creation to use imeta methods Darrick J. Wong
                     ` (27 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:07 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor the group and project quota inode pointer switcheroo that
happens only on v4 filesystems into a separate function prior to
enhancing the xfs_qm_qino_alloc function.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_qm.c |   90 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 36 deletions(-)


diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 07d0d0231252a..3e7e0f9cecc0e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -732,6 +732,57 @@ xfs_qm_destroy_quotainfo(
 	mp->m_quotainfo = NULL;
 }
 
+/*
+ * Switch the group and project quota in-core inode pointers if needed.
+ *
+ * On v4 superblocks that don't have separate pquotino, we share an inode
+ * between gquota and pquota. If the on-disk superblock has GQUOTA and the
+ * filesystem is now mounted with PQUOTA, just use sb_gquotino for sb_pquotino
+ * and vice-versa.
+ */
+STATIC int
+xfs_qm_qino_switch(
+	struct xfs_mount	*mp,
+	struct xfs_inode	**ipp,
+	unsigned int		flags,
+	bool			*need_alloc)
+{
+	xfs_ino_t		ino = NULLFSINO;
+	int			error;
+
+	if (xfs_has_pquotino(mp) ||
+	    !(flags & (XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)))
+		return 0;
+
+	if ((flags & XFS_QMOPT_PQUOTA) &&
+	    (mp->m_sb.sb_gquotino != NULLFSINO)) {
+		ino = mp->m_sb.sb_gquotino;
+		if (XFS_IS_CORRUPT(mp, mp->m_sb.sb_pquotino != NULLFSINO)) {
+			xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
+			return -EFSCORRUPTED;
+		}
+	} else if ((flags & XFS_QMOPT_GQUOTA) &&
+		   (mp->m_sb.sb_pquotino != NULLFSINO)) {
+		ino = mp->m_sb.sb_pquotino;
+		if (XFS_IS_CORRUPT(mp, mp->m_sb.sb_gquotino != NULLFSINO)) {
+			xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
+			return -EFSCORRUPTED;
+		}
+	}
+
+	if (ino == NULLFSINO)
+		return 0;
+
+	error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+	if (error)
+		return error;
+
+	mp->m_sb.sb_gquotino = NULLFSINO;
+	mp->m_sb.sb_pquotino = NULLFSINO;
+	*need_alloc = false;
+	return 0;
+}
+
 /*
  * Create an inode and return with a reference already taken, but unlocked
  * This is how we create quota inodes
@@ -747,43 +798,10 @@ xfs_qm_qino_alloc(
 	bool			need_alloc = true;
 
 	*ipp = NULL;
-	/*
-	 * With superblock that doesn't have separate pquotino, we
-	 * share an inode between gquota and pquota. If the on-disk
-	 * superblock has GQUOTA and the filesystem is now mounted
-	 * with PQUOTA, just use sb_gquotino for sb_pquotino and
-	 * vice-versa.
-	 */
-	if (!xfs_has_pquotino(mp) &&
-			(flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
-		xfs_ino_t ino = NULLFSINO;
 
-		if ((flags & XFS_QMOPT_PQUOTA) &&
-			     (mp->m_sb.sb_gquotino != NULLFSINO)) {
-			ino = mp->m_sb.sb_gquotino;
-			if (XFS_IS_CORRUPT(mp,
-					   mp->m_sb.sb_pquotino != NULLFSINO)) {
-				xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
-				return -EFSCORRUPTED;
-			}
-		} else if ((flags & XFS_QMOPT_GQUOTA) &&
-			     (mp->m_sb.sb_pquotino != NULLFSINO)) {
-			ino = mp->m_sb.sb_pquotino;
-			if (XFS_IS_CORRUPT(mp,
-					   mp->m_sb.sb_gquotino != NULLFSINO)) {
-				xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
-				return -EFSCORRUPTED;
-			}
-		}
-		if (ino != NULLFSINO) {
-			error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
-			if (error)
-				return error;
-			mp->m_sb.sb_gquotino = NULLFSINO;
-			mp->m_sb.sb_pquotino = NULLFSINO;
-			need_alloc = false;
-		}
-	}
+	error = xfs_qm_qino_switch(mp, ipp, flags, &need_alloc);
+	if (error)
+		return error;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
 			need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/32] xfs: convert quota file creation to use imeta methods
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:07   ` [PATCH 04/32] xfs: refactor the v4 group/project inode pointer switch Darrick J. Wong
@ 2023-12-31 21:07   ` Darrick J. Wong
  2023-12-31 21:07   ` [PATCH 06/32] xfs: iget for metadata inodes Darrick J. Wong
                     ` (26 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:07 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the quota file creation code to use xfs_imeta_create instead of
open-coding a bunch of logic.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_qm.c |   91 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 34 deletions(-)


diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e7e0f9cecc0e..2352eed966022 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -27,6 +27,8 @@
 #include "xfs_ialloc.h"
 #include "xfs_log_priv.h"
 #include "xfs_health.h"
+#include "xfs_imeta.h"
+#include "xfs_imeta_utils.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -732,6 +734,18 @@ xfs_qm_destroy_quotainfo(
 	mp->m_quotainfo = NULL;
 }
 
+static inline const struct xfs_imeta_path *
+xfs_qflags_to_imeta_path(
+	unsigned int	qflags)
+{
+	if (qflags & XFS_QMOPT_UQUOTA)
+		return &XFS_IMETA_USRQUOTA;
+	else if (qflags & XFS_QMOPT_GQUOTA)
+		return &XFS_IMETA_GRPQUOTA;
+	else
+		return &XFS_IMETA_PRJQUOTA;
+}
+
 /*
  * Switch the group and project quota in-core inode pointers if needed.
  *
@@ -739,6 +753,12 @@ xfs_qm_destroy_quotainfo(
  * between gquota and pquota. If the on-disk superblock has GQUOTA and the
  * filesystem is now mounted with PQUOTA, just use sb_gquotino for sb_pquotino
  * and vice-versa.
+ *
+ * We tolerate the direct manipulation of the in-core sb quota inode pointers
+ * here because calling xfs_imeta_ functions is only really required for
+ * filesystems with the metadata directory feature.  That feature requires a v5
+ * superblock, which always supports simultaneous group and project quotas, so
+ * we'll never get here.
  */
 STATIC int
 xfs_qm_qino_switch(
@@ -777,8 +797,13 @@ xfs_qm_qino_switch(
 	if (error)
 		return error;
 
-	mp->m_sb.sb_gquotino = NULLFSINO;
-	mp->m_sb.sb_pquotino = NULLFSINO;
+	if (flags & XFS_QMOPT_PQUOTA) {
+		mp->m_sb.sb_gquotino = NULLFSINO;
+		mp->m_sb.sb_pquotino = ino;
+	} else if (flags & XFS_QMOPT_GQUOTA) {
+		mp->m_sb.sb_gquotino = ino;
+		mp->m_sb.sb_pquotino = NULLFSINO;
+	}
 	*need_alloc = false;
 	return 0;
 }
@@ -793,7 +818,8 @@ xfs_qm_qino_alloc(
 	struct xfs_inode	**ipp,
 	unsigned int		flags)
 {
-	struct xfs_trans	*tp;
+	struct xfs_imeta_update	upd = { };
+	const struct xfs_imeta_path *path = xfs_qflags_to_imeta_path(flags);
 	int			error;
 	bool			need_alloc = true;
 
@@ -803,29 +829,14 @@ xfs_qm_qino_alloc(
 	if (error)
 		return error;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
-			need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0,
-			0, 0, &tp);
-	if (error)
-		return error;
-
 	if (need_alloc) {
-		struct xfs_icreate_args	args = {
-			.nlink		= 1,
-		};
-		xfs_ino_t	ino;
-
-		xfs_icreate_args_rootfile(&args, mp, S_IFREG,
-				xfs_has_parent(mp));
-
-		error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
-		if (!error)
-			error = xfs_icreate(tp, ino, &args, ipp);
-		if (error) {
-			xfs_trans_cancel(tp);
-			return error;
-		}
+		error = xfs_imeta_start_create(mp, path, &upd);
+	} else {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, 0, 0, 0,
+				&upd.tp);
 	}
+	if (error)
+		goto out_end;
 
 	/*
 	 * Make the changes in the superblock, and log those too.
@@ -844,23 +855,35 @@ xfs_qm_qino_alloc(
 		/* qflags will get updated fully _after_ quotacheck */
 		mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
 	}
-	if (flags & XFS_QMOPT_UQUOTA)
-		mp->m_sb.sb_uquotino = (*ipp)->i_ino;
-	else if (flags & XFS_QMOPT_GQUOTA)
-		mp->m_sb.sb_gquotino = (*ipp)->i_ino;
-	else
-		mp->m_sb.sb_pquotino = (*ipp)->i_ino;
 	spin_unlock(&mp->m_sb_lock);
-	xfs_log_sb(tp);
+	xfs_log_sb(upd.tp);
 
-	error = xfs_trans_commit(tp);
+	if (need_alloc) {
+		error = xfs_imeta_create(&upd, S_IFREG, ipp);
+		if (error)
+			goto out_cancel;
+	}
+
+	error = xfs_imeta_commit_update(&upd);
 	if (error) {
 		ASSERT(xfs_is_shutdown(mp));
 		xfs_alert(mp, "%s failed (error %d)!", __func__, error);
+		goto out_end;
 	}
-	if (need_alloc) {
-		xfs_iunlock(*ipp, XFS_ILOCK_EXCL);
+
+	if (need_alloc)
 		xfs_finish_inode_setup(*ipp);
+
+	return 0;
+
+out_cancel:
+	xfs_imeta_cancel_update(&upd, error);
+out_end:
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (*ipp) {
+		xfs_finish_inode_setup(*ipp);
+		xfs_irele(*ipp);
+		*ipp = NULL;
 	}
 	return error;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/32] xfs: iget for metadata inodes
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:07   ` [PATCH 05/32] xfs: convert quota file creation to use imeta methods Darrick J. Wong
@ 2023-12-31 21:07   ` Darrick J. Wong
  2023-12-31 21:08   ` [PATCH 07/32] xfs: define the on-disk format for the metadir feature Darrick J. Wong
                     ` (25 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:07 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a xfs_iget_meta function for metadata inodes to ensure that we
always check that the inobt thinks a metadata inode is in use.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.h |    5 +++++
 fs/xfs/xfs_icache.c       |   40 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.c        |    8 +++++++
 fs/xfs/xfs_qm.c           |   50 ++++++++++++++++++++++++++++++---------------
 fs/xfs/xfs_qm_syscalls.c  |    9 +++++++-
 fs/xfs/xfs_rtalloc.c      |   46 +++++++++++++++++++++++------------------
 6 files changed, 120 insertions(+), 38 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index 7b3da865c0931..0a4361bda1c4f 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -47,4 +47,9 @@ unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
 
+/* Must be implemented by the libxfs client */
+int xfs_imeta_iget(struct xfs_trans *tp, xfs_ino_t ino, unsigned char ftype,
+		struct xfs_inode **ipp);
+void xfs_imeta_irele(struct xfs_inode *ip);
+
 #endif /* __XFS_IMETA_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f6a9c4f40cfd0..aa8516e8ab64f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -25,6 +25,9 @@
 #include "xfs_ag.h"
 #include "xfs_log_priv.h"
 #include "xfs_health.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_imeta.h"
 
 #include <linux/iversion.h>
 
@@ -811,6 +814,43 @@ xfs_iget(
 	return error;
 }
 
+/*
+ * Get a metadata inode.  The ftype must match exactly.  Caller must supply
+ * a transaction (even if empty) to avoid livelocking if the inobt has a cycle.
+ */
+int
+xfs_imeta_iget(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	unsigned char		ftype,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip;
+	int			error;
+
+	ASSERT(ftype != XFS_DIR3_FT_UNKNOWN);
+
+	error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+	if (error == -EFSCORRUPTED)
+		goto whine;
+	if (error)
+		return error;
+
+	if (VFS_I(ip)->i_nlink == 0)
+		goto bad_rele;
+	if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
+		goto bad_rele;
+
+	*ipp = ip;
+	return 0;
+bad_rele:
+	xfs_irele(ip);
+whine:
+	xfs_err(mp, "metadata inode 0x%llx is corrupt", ino);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Grab the inode for reclaim exclusively.
  *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 789b9603989c5..f2145e2701f51 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -43,6 +43,7 @@
 #include "xfs_parent.h"
 #include "xfs_xattr.h"
 #include "xfs_inode_util.h"
+#include "xfs_imeta.h"
 
 struct kmem_cache *xfs_inode_cache;
 
@@ -2741,6 +2742,13 @@ xfs_irele(
 	iput(VFS_I(ip));
 }
 
+void
+xfs_imeta_irele(
+	struct xfs_inode	*ip)
+{
+	xfs_irele(ip);
+}
+
 /*
  * Ensure all commited transactions touching the inode are written to the log.
  */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2352eed966022..2d8e420f3ad34 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -29,6 +29,7 @@
 #include "xfs_health.h"
 #include "xfs_imeta.h"
 #include "xfs_imeta_utils.h"
+#include "xfs_da_format.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -233,15 +234,15 @@ xfs_qm_unmount_quotas(
 	 */
 	if (mp->m_quotainfo) {
 		if (mp->m_quotainfo->qi_uquotaip) {
-			xfs_irele(mp->m_quotainfo->qi_uquotaip);
+			xfs_imeta_irele(mp->m_quotainfo->qi_uquotaip);
 			mp->m_quotainfo->qi_uquotaip = NULL;
 		}
 		if (mp->m_quotainfo->qi_gquotaip) {
-			xfs_irele(mp->m_quotainfo->qi_gquotaip);
+			xfs_imeta_irele(mp->m_quotainfo->qi_gquotaip);
 			mp->m_quotainfo->qi_gquotaip = NULL;
 		}
 		if (mp->m_quotainfo->qi_pquotaip) {
-			xfs_irele(mp->m_quotainfo->qi_pquotaip);
+			xfs_imeta_irele(mp->m_quotainfo->qi_pquotaip);
 			mp->m_quotainfo->qi_pquotaip = NULL;
 		}
 	}
@@ -767,6 +768,7 @@ xfs_qm_qino_switch(
 	unsigned int		flags,
 	bool			*need_alloc)
 {
+	struct xfs_trans	*tp;
 	xfs_ino_t		ino = NULLFSINO;
 	int			error;
 
@@ -793,7 +795,11 @@ xfs_qm_qino_switch(
 	if (ino == NULLFSINO)
 		return 0;
 
-	error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+	error = xfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, ipp);
+	xfs_trans_cancel(tp);
 	if (error)
 		return error;
 
@@ -1590,6 +1596,7 @@ xfs_qm_init_quotainos(
 	struct xfs_inode	*uip = NULL;
 	struct xfs_inode	*gip = NULL;
 	struct xfs_inode	*pip = NULL;
+	struct xfs_trans	*tp = NULL;
 	int			error;
 	uint			flags = 0;
 
@@ -1599,30 +1606,37 @@ xfs_qm_init_quotainos(
 	 * Get the uquota and gquota inodes
 	 */
 	if (xfs_has_quota(mp)) {
+		error = xfs_trans_alloc_empty(mp, &tp);
+		if (error)
+			return error;
+
 		if (XFS_IS_UQUOTA_ON(mp) &&
 		    mp->m_sb.sb_uquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_uquotino > 0);
-			error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					     0, 0, &uip);
+			error = xfs_imeta_iget(tp, mp->m_sb.sb_uquotino,
+					XFS_DIR3_FT_REG_FILE, &uip);
 			if (error)
-				return error;
+				goto error_rele;
 		}
 		if (XFS_IS_GQUOTA_ON(mp) &&
 		    mp->m_sb.sb_gquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_gquotino > 0);
-			error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					     0, 0, &gip);
+			error = xfs_imeta_iget(tp, mp->m_sb.sb_gquotino,
+					XFS_DIR3_FT_REG_FILE, &gip);
 			if (error)
 				goto error_rele;
 		}
 		if (XFS_IS_PQUOTA_ON(mp) &&
 		    mp->m_sb.sb_pquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_pquotino > 0);
-			error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
-					     0, 0, &pip);
+			error = xfs_imeta_iget(tp, mp->m_sb.sb_pquotino,
+					XFS_DIR3_FT_REG_FILE, &pip);
 			if (error)
 				goto error_rele;
 		}
+
+		xfs_trans_cancel(tp);
+		tp = NULL;
 	} else {
 		flags |= XFS_QMOPT_SBVERSION;
 	}
@@ -1663,12 +1677,14 @@ xfs_qm_init_quotainos(
 	return 0;
 
 error_rele:
+	if (tp)
+		xfs_trans_cancel(tp);
 	if (uip)
-		xfs_irele(uip);
+		xfs_imeta_irele(uip);
 	if (gip)
-		xfs_irele(gip);
+		xfs_imeta_irele(gip);
 	if (pip)
-		xfs_irele(pip);
+		xfs_imeta_irele(pip);
 	return error;
 }
 
@@ -1677,15 +1693,15 @@ xfs_qm_destroy_quotainos(
 	struct xfs_quotainfo	*qi)
 {
 	if (qi->qi_uquotaip) {
-		xfs_irele(qi->qi_uquotaip);
+		xfs_imeta_irele(qi->qi_uquotaip);
 		qi->qi_uquotaip = NULL; /* paranoia */
 	}
 	if (qi->qi_gquotaip) {
-		xfs_irele(qi->qi_gquotaip);
+		xfs_imeta_irele(qi->qi_gquotaip);
 		qi->qi_gquotaip = NULL;
 	}
 	if (qi->qi_pquotaip) {
-		xfs_irele(qi->qi_pquotaip);
+		xfs_imeta_irele(qi->qi_pquotaip);
 		qi->qi_pquotaip = NULL;
 	}
 }
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 392cb39cc10c8..269d333736f5f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -18,6 +18,8 @@
 #include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_icache.h"
+#include "xfs_imeta.h"
+#include "xfs_da_format.h"
 
 int
 xfs_qm_scall_quotaoff(
@@ -62,7 +64,12 @@ xfs_qm_scall_trunc_qfile(
 	if (ino == NULLFSINO)
 		return 0;
 
-	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	error = xfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+	xfs_trans_cancel(tp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 379462ea9a14f..7156a5b55cfb6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,6 +22,8 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_log_priv.h"
 #include "xfs_health.h"
+#include "xfs_da_format.h"
+#include "xfs_imeta.h"
 
 /*
  * Read and return the summary information for a given extent size,
@@ -1335,16 +1337,12 @@ xfs_rtalloc_reinit_frextents(
  */
 static inline int
 xfs_rtmount_iread_extents(
+	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	unsigned int		lock_class)
 {
-	struct xfs_trans	*tp;
 	int			error;
 
-	error = xfs_trans_alloc_empty(ip->i_mount, &tp);
-	if (error)
-		return error;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
 
 	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
@@ -1359,7 +1357,6 @@ xfs_rtmount_iread_extents(
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
-	xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -1367,43 +1364,52 @@ xfs_rtmount_iread_extents(
  * Get the bitmap and summary inodes and the summary cache into the mount
  * structure at mount time.
  */
-int					/* error */
+int
 xfs_rtmount_inodes(
-	xfs_mount_t	*mp)		/* file system mount structure */
+	struct xfs_mount	*mp)
 {
-	int		error;		/* error return value */
-	xfs_sb_t	*sbp;
+	struct xfs_trans	*tp;
+	struct xfs_sb		*sbp = &mp->m_sb;
+	int			error;
 
-	sbp = &mp->m_sb;
-	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	error = xfs_imeta_iget(tp, mp->m_sb.sb_rbmino, XFS_DIR3_FT_REG_FILE,
+			&mp->m_rbmip);
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
 	if (error)
-		return error;
+		goto out_trans;
 	ASSERT(mp->m_rbmip != NULL);
 
-	error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP);
+	error = xfs_rtmount_iread_extents(tp, mp->m_rbmip, XFS_ILOCK_RTBITMAP);
 	if (error)
 		goto out_rele_bitmap;
 
-	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+	error = xfs_imeta_iget(tp, mp->m_sb.sb_rsumino, XFS_DIR3_FT_REG_FILE,
+			&mp->m_rsumip);
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
 	if (error)
 		goto out_rele_bitmap;
 	ASSERT(mp->m_rsumip != NULL);
 
-	error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM);
+	error = xfs_rtmount_iread_extents(tp, mp->m_rsumip, XFS_ILOCK_RTSUM);
 	if (error)
 		goto out_rele_summary;
 
 	xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
+	xfs_trans_cancel(tp);
 	return 0;
 
 out_rele_summary:
-	xfs_irele(mp->m_rsumip);
+	xfs_imeta_irele(mp->m_rsumip);
 out_rele_bitmap:
-	xfs_irele(mp->m_rbmip);
+	xfs_imeta_irele(mp->m_rbmip);
+out_trans:
+	xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -1413,9 +1419,9 @@ xfs_rtunmount_inodes(
 {
 	kmem_free(mp->m_rsum_cache);
 	if (mp->m_rbmip)
-		xfs_irele(mp->m_rbmip);
+		xfs_imeta_irele(mp->m_rbmip);
 	if (mp->m_rsumip)
-		xfs_irele(mp->m_rsumip);
+		xfs_imeta_irele(mp->m_rsumip);
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/32] xfs: define the on-disk format for the metadir feature
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:07   ` [PATCH 06/32] xfs: iget for metadata inodes Darrick J. Wong
@ 2023-12-31 21:08   ` Darrick J. Wong
  2023-12-31 21:08   ` [PATCH 08/32] xfs: update imeta transaction reservations for metadir Darrick J. Wong
                     ` (24 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:08 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define the on-disk layout and feature flags for the metadata inode
directory feature.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h     |   60 +++++++++++++++++++++++++++++++++++-----
 fs/xfs/libxfs/xfs_inode_util.c |    2 +
 fs/xfs/libxfs/xfs_sb.c         |    2 +
 fs/xfs/xfs_inode.h             |   13 +++++++++
 fs/xfs/xfs_mount.h             |    2 +
 fs/xfs/xfs_super.c             |    4 +++
 6 files changed, 75 insertions(+), 8 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c60af43696326..7596b9286988e 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -174,6 +174,16 @@ typedef struct xfs_sb {
 	xfs_lsn_t	sb_lsn;		/* last write sequence */
 	uuid_t		sb_meta_uuid;	/* metadata file system unique id */
 
+	/* Fields beyond here do not match xfs_dsb.  Be very careful! */
+
+	/*
+	 * Metadata Directory Inode.  On disk this lives in the sb_rbmino slot,
+	 * but we continue to use the in-core superblock to cache the classic
+	 * inodes (rt bitmap; rt summary; user, group, and project quotas) so
+	 * we cache the metadir inode value here too.
+	 */
+	xfs_ino_t	sb_metadirino;
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
@@ -190,7 +200,14 @@ struct xfs_dsb {
 	uuid_t		sb_uuid;	/* user-visible file system unique id */
 	__be64		sb_logstart;	/* starting block of log if internal */
 	__be64		sb_rootino;	/* root inode number */
-	__be64		sb_rbmino;	/* bitmap inode for realtime extents */
+	/*
+	 * bitmap inode for realtime extents.
+	 *
+	 * The metadata directory feature uses the sb_rbmino field to point to
+	 * the root of the metadata directory tree.  All other sb inode
+	 * pointers are no longer used.
+	 */
+	__be64		sb_rbmino;
 	__be64		sb_rsumino;	/* summary inode for rt bitmap */
 	__be32		sb_rextsize;	/* realtime extent size, blocks */
 	__be32		sb_agblocks;	/* size of an allocation group */
@@ -373,6 +390,7 @@ xfs_sb_has_ro_compat_feature(
 #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4)	/* needs xfs_repair */
 #define XFS_SB_FEAT_INCOMPAT_NREXT64	(1 << 5)	/* large extent counters */
 #define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 6)	/* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_METADIR	(1U << 31)	/* metadata dir tree */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
 		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
 		 XFS_SB_FEAT_INCOMPAT_SPINODES|	\
@@ -1109,17 +1127,43 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_REFLINK_BIT	1	/* file's blocks may be shared */
 #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
-#define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
+#define XFS_DIFLAG2_NREXT64_BIT	4	/* large extent counters */
+#define XFS_DIFLAG2_METADIR_BIT	63	/* filesystem metadata */
 
-#define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
-#define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
-#define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
-#define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_DAX		(1ULL << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK	(1ULL << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE	(1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME	(1ULL << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64	(1ULL << XFS_DIFLAG2_NREXT64_BIT)
+
+/*
+ * The inode contains filesystem metadata and can be found through the metadata
+ * directory tree.  Metadata inodes must satisfy the following constraints:
+ *
+ * - V5 filesystem (and ftype) are enabled;
+ * - The only valid modes are regular files and directories;
+ * - The access bits must be zero;
+ * - DMAPI event and state masks are zero;
+ * - The user, group, and project IDs must be zero;
+ * - The immutable, sync, noatime, nodump, nodefrag flags must be set.
+ * - The dax flag must not be set.
+ * - Directories must have nosymlinks set.
+ *
+ * These requirements are chosen defensively to minimize the ability of
+ * userspace to read or modify the contents, should a metadata file ever
+ * escape to userspace.
+ *
+ * There are further constraints on the directory tree itself:
+ *
+ * - Metadata inodes must never be resolvable through the root directory;
+ * - They must never be accessed by userspace;
+ * - Metadata directory entries must have correct ftype.
+ */
+#define XFS_DIFLAG2_METADIR	(1ULL << XFS_DIFLAG2_METADIR_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADIR)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 8f85c74603dba..2f190b875c7f8 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -225,6 +225,8 @@ xfs_inode_inherit_flags2(
 	}
 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+	if (pip->i_diflags2 & XFS_DIFLAG2_METADIR)
+		ip->i_diflags2 |= XFS_DIFLAG2_METADIR;
 
 	/* Don't let invalid cowextsize hints propagate. */
 	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d4f72d4c85f83..2d2f55099db04 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -178,6 +178,8 @@ xfs_sb_version_to_features(
 		features |= XFS_FEAT_NREXT64;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
 		features |= XFS_FEAT_PARENT;
+	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+		features |= XFS_FEAT_METADIR;
 
 	return features;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cc6a075505636..10fbb3d4d193e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -276,10 +276,23 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
 	return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
 }
 
+static inline bool xfs_is_metadir_inode(struct xfs_inode *ip)
+{
+	return ip->i_diflags2 & XFS_DIFLAG2_METADIR;
+}
+
 static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
+	/* Any file in the metadata directory tree is a metadata inode. */
+	if (xfs_has_metadir(mp))
+		return xfs_is_metadir_inode(ip);
+
+	/*
+	 * Before metadata directories, the only metadata inodes were the
+	 * three quota files, the realtime bitmap, and the realtime summary.
+	 */
 	return ip == mp->m_rbmip || ip == mp->m_rsumip ||
 		xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f80f04335acc5..953386ad4b13f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -292,6 +292,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_BIGTIME	(1ULL << 24)	/* large timestamps */
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
+#define XFS_FEAT_METADIR	(1ULL << 27)	/* metadata directory tree */
 
 /* Mount features */
 #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
@@ -355,6 +356,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
 __XFS_HAS_FEAT(bigtime, BIGTIME)
 __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(metadir, METADIR)
 
 /*
  * Mount features
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8b4e5b8579a9b..998fa006af5bb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1713,6 +1713,10 @@ xfs_fs_fill_super(
 		mp->m_features &= ~XFS_FEAT_DISCARD;
 	}
 
+	if (xfs_has_metadir(mp))
+		xfs_warn(mp,
+"EXPERIMENTAL metadata directory feature in use. Use at your own risk!");
+
 	if (xfs_has_reflink(mp)) {
 		if (mp->m_sb.sb_rblocks) {
 			xfs_alert(mp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/32] xfs: update imeta transaction reservations for metadir
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:08   ` [PATCH 07/32] xfs: define the on-disk format for the metadir feature Darrick J. Wong
@ 2023-12-31 21:08   ` Darrick J. Wong
  2023-12-31 21:08   ` [PATCH 09/32] xfs: load metadata directory root at mount time Darrick J. Wong
                     ` (23 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:08 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the new metadata inode transaction reservations to handle
metadata directories if that feature is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_trans_resv.c |  101 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index d7c9af3406949..a0545daf039c7 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -1109,6 +1109,81 @@ xfs_calc_sb_reservation(
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
 }
 
+/*
+ * Metadata inode creation needs enough space to create or mkdir a directory,
+ * plus logging the superblock.
+ */
+static unsigned int
+xfs_calc_imeta_create_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_create.tr_logres;
+	return ret;
+}
+
+/* Metadata inode creation needs enough rounds to create or mkdir a directory */
+static int
+xfs_calc_imeta_create_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_create.tr_logcount;
+}
+
+/*
+ * Metadata inode link needs enough space to add a file plus logging the
+ * superblock.
+ */
+static unsigned int
+xfs_calc_imeta_link_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_link.tr_logres;
+	return ret;
+}
+
+/* Metadata inode linking needs enough rounds to remove a file. */
+static int
+xfs_calc_imeta_link_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_link.tr_logcount;
+}
+
+/*
+ * Metadata inode unlink needs enough space to remove a file plus logging the
+ * superblock.
+ */
+static unsigned int
+xfs_calc_imeta_unlink_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_remove.tr_logres;
+	return ret;
+}
+
+/* Metadata inode unlinking needs enough rounds to remove a file. */
+static int
+xfs_calc_imeta_unlink_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_remove.tr_logcount;
+}
+
 /*
  * Namespace reservations.
  *
@@ -1251,7 +1326,27 @@ xfs_trans_resv_calc(
 	resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
 
 	/* metadata inode creation and unlink */
-	resp->tr_imeta_create = resp->tr_create;
-	resp->tr_imeta_link = resp->tr_link;
-	resp->tr_imeta_unlink = resp->tr_remove;
+	if (xfs_has_metadir(mp)) {
+		resp->tr_imeta_create.tr_logres =
+				xfs_calc_imeta_create_resv(mp, resp);
+		resp->tr_imeta_create.tr_logcount =
+				xfs_calc_imeta_create_count(mp, resp);
+		resp->tr_imeta_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+		resp->tr_imeta_link.tr_logres =
+				xfs_calc_imeta_link_resv(mp, resp);
+		resp->tr_imeta_link.tr_logcount =
+				xfs_calc_imeta_link_count(mp, resp);
+		resp->tr_imeta_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+		resp->tr_imeta_unlink.tr_logres =
+				xfs_calc_imeta_unlink_resv(mp, resp);
+		resp->tr_imeta_unlink.tr_logcount =
+				xfs_calc_imeta_unlink_count(mp, resp);
+		resp->tr_imeta_unlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+	} else {
+		resp->tr_imeta_create = resp->tr_create;
+		resp->tr_imeta_link = resp->tr_link;
+		resp->tr_imeta_unlink = resp->tr_remove;
+	}
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/32] xfs: load metadata directory root at mount time
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:08   ` [PATCH 08/32] xfs: update imeta transaction reservations for metadir Darrick J. Wong
@ 2023-12-31 21:08   ` Darrick J. Wong
  2023-12-31 21:08   ` [PATCH 10/32] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
                     ` (22 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:08 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Load the metadata directory root inode into memory at mount time and
release it at unmount time.  We also make sure that the obsolete inode
pointers in the superblock are not logged or read from the superblock.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_sb.c    |   31 +++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_types.c |    2 +-
 fs/xfs/xfs_mount.c        |   22 +++++++++++++++++++---
 fs/xfs/xfs_mount.h        |    1 +
 4 files changed, 52 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 2d2f55099db04..28f65753c0c5a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -682,6 +682,25 @@ __xfs_sb_from_disk(
 	/* Convert on-disk flags to in-memory flags? */
 	if (convert_xquota)
 		xfs_sb_quota_from_disk(to);
+
+	if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+		/*
+		 * Set metadirino here and null out the in-core fields for
+		 * the other inodes because metadir initialization will load
+		 * them later.
+		 */
+		to->sb_metadirino = be64_to_cpu(from->sb_rbmino);
+		to->sb_rbmino = NULLFSINO;
+		to->sb_rsumino = NULLFSINO;
+
+		/*
+		 * We don't have to worry about quota inode conversion here
+		 * because metadir requires a v5 filesystem.
+		 */
+		to->sb_uquotino = NULLFSINO;
+		to->sb_gquotino = NULLFSINO;
+		to->sb_pquotino = NULLFSINO;
+	}
 }
 
 void
@@ -829,6 +848,18 @@ xfs_sb_to_disk(
 	to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
 		uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+
+	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+		/*
+		 * Save metadirino here and null out the on-disk fields for
+		 * the other inodes, at least until we reuse the fields.
+		 */
+		to->sb_rbmino = cpu_to_be64(from->sb_metadirino);
+		to->sb_rsumino = cpu_to_be64(NULLFSINO);
+		to->sb_uquotino = cpu_to_be64(NULLFSINO);
+		to->sb_gquotino = cpu_to_be64(NULLFSINO);
+		to->sb_pquotino = cpu_to_be64(NULLFSINO);
+	}
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 72f7e050e600a..b1fa715e5f398 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -128,7 +128,7 @@ xfs_verify_dir_ino(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
 {
-	if (xfs_internal_inum(mp, ino))
+	if (!xfs_has_metadir(mp) && xfs_internal_inum(mp, ino))
 		return false;
 	return xfs_verify_ino(mp, ino);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ab11fe9a4b4ab..8b61b8c51cd16 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -622,6 +622,18 @@ xfs_mount_setup_metadir(
 	if (error)
 		return error;
 
+	/* Load the metadata directory root inode into memory. */
+	if (xfs_has_metadir(mp)) {
+		error = xfs_imeta_iget(tp, mp->m_sb.sb_metadirino,
+				XFS_DIR3_FT_DIR, &mp->m_metadirip);
+		if (error) {
+			xfs_warn(mp,
+ "Failed to load metadir root directory, error %d",
+					error);
+			goto err_cancel;
+		}
+	}
+
 	error = xfs_imeta_mount(tp);
 	if (error) {
 		xfs_warn(mp, "Failed to load metadata inodes, error %d",
@@ -872,7 +884,7 @@ xfs_mountfs(
 
 	error = xfs_mount_setup_metadir(mp);
 	if (error)
-		goto out_log_dealloc;
+		goto out_free_metadir;
 
 	/*
 	 * Get and sanity-check the root inode.
@@ -884,7 +896,7 @@ xfs_mountfs(
 		xfs_warn(mp,
 			"Failed to read root inode 0x%llx, error %d",
 			sbp->sb_rootino, -error);
-		goto out_log_dealloc;
+		goto out_free_metadir;
 	}
 
 	ASSERT(rip != NULL);
@@ -1027,6 +1039,9 @@ xfs_mountfs(
 	xfs_irele(rip);
 	/* Clean out dquots that might be in memory after quotacheck. */
 	xfs_qm_unmount(mp);
+ out_free_metadir:
+	if (mp->m_metadirip)
+		xfs_imeta_irele(mp->m_metadirip);
 
 	/*
 	 * Inactivate all inodes that might still be in memory after a log
@@ -1048,7 +1063,6 @@ xfs_mountfs(
 	 * quota inodes.
 	 */
 	xfs_unmount_flush_inodes(mp);
- out_log_dealloc:
 	xfs_log_mount_cancel(mp);
  out_inodegc_shrinker:
 	shrinker_free(mp->m_inodegc_shrinker);
@@ -1101,6 +1115,8 @@ xfs_unmountfs(
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	xfs_irele(mp->m_rootip);
+	if (mp->m_metadirip)
+		xfs_imeta_irele(mp->m_metadirip);
 
 	xfs_unmount_flush_inodes(mp);
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 953386ad4b13f..b45a4f4a8503e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -93,6 +93,7 @@ typedef struct xfs_mount {
 	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
 	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
 	struct xfs_inode	*m_rootip;	/* pointer to root directory */
+	struct xfs_inode	*m_metadirip;	/* ptr to metadata directory */
 	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/32] xfs: convert metadata inode lookup keys to use paths
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:08   ` [PATCH 09/32] xfs: load metadata directory root at mount time Darrick J. Wong
@ 2023-12-31 21:08   ` Darrick J. Wong
  2023-12-31 21:09   ` [PATCH 11/32] xfs: enforce metadata inode flag Darrick J. Wong
                     ` (21 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:08 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the magic metadata inode lookup keys to use actual strings
for paths.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.c |   48 ++++++++++++++++++++++++---------------------
 fs/xfs/libxfs/xfs_imeta.h |   27 +++++++++++++++++++++++--
 fs/xfs/xfs_trace.h        |   10 ++++++++-
 3 files changed, 59 insertions(+), 26 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 497d28abaff10..bab82c3dde5de 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -49,26 +49,17 @@
  */
 
 /* Static metadata inode paths */
-
-const struct xfs_imeta_path XFS_IMETA_RTBITMAP = {
-	.bogus = 0,
-};
-
-const struct xfs_imeta_path XFS_IMETA_RTSUMMARY = {
-	.bogus = 1,
-};
-
-const struct xfs_imeta_path XFS_IMETA_USRQUOTA = {
-	.bogus = 2,
-};
-
-const struct xfs_imeta_path XFS_IMETA_GRPQUOTA = {
-	.bogus = 3,
-};
-
-const struct xfs_imeta_path XFS_IMETA_PRJQUOTA = {
-	.bogus = 4,
-};
+static const unsigned char *rtbitmap_path[]	= {"realtime", "bitmap"};
+static const unsigned char *rtsummary_path[]	= {"realtime", "summary"};
+static const unsigned char *usrquota_path[]	= {"quota", "user"};
+static const unsigned char *grpquota_path[]	= {"quota", "group"};
+static const unsigned char *prjquota_path[]	= {"quota", "project"};
+
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_RTBITMAP,	rtbitmap_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_RTSUMMARY,	rtsummary_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_USRQUOTA,	usrquota_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_GRPQUOTA,	grpquota_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_PRJQUOTA,	prjquota_path);
 
 /* Are these two paths equal? */
 STATIC bool
@@ -76,7 +67,20 @@ xfs_imeta_path_compare(
 	const struct xfs_imeta_path	*a,
 	const struct xfs_imeta_path	*b)
 {
-	return a == b;
+	unsigned int			i;
+
+	if (a == b)
+		return true;
+
+	if (a->im_depth != b->im_depth)
+		return false;
+
+	for (i = 0; i < a->im_depth; i++)
+		if (a->im_path[i] != b->im_path[i] &&
+		    strcmp(a->im_path[i], b->im_path[i]))
+			return false;
+
+	return true;
 }
 
 /* Is this path ok? */
@@ -84,7 +88,7 @@ static inline bool
 xfs_imeta_path_check(
 	const struct xfs_imeta_path	*path)
 {
-	return true;
+	return path->im_depth <= XFS_IMETA_MAX_DEPTH;
 }
 
 /* Functions for storing and retrieving superblock inode values. */
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index 0a4361bda1c4f..60e0f6a6c134a 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -6,10 +6,23 @@
 #ifndef __XFS_IMETA_H__
 #define __XFS_IMETA_H__
 
+/* How deep can we nest metadata dirs? */
+#define XFS_IMETA_MAX_DEPTH	64
+
+/* Form an imeta path from a simple array of strings. */
+#define XFS_IMETA_DEFINE_PATH(name, path) \
+const struct xfs_imeta_path name = { \
+	.im_path = (path), \
+	.im_depth = ARRAY_SIZE(path), \
+}
+
 /* Key for looking up metadata inodes. */
 struct xfs_imeta_path {
-	/* Temporary: integer to keep the static imeta definitions unique */
-	int		bogus;
+	/* Array of string pointers. */
+	const unsigned char	**im_path;
+
+	/* Number of strings in path. */
+	uint8_t			im_depth;
 };
 
 /* Cleanup widget for metadata inode creation and deletion. */
@@ -25,6 +38,16 @@ struct xfs_imeta_update {
 	unsigned int		ip_locked:1;
 };
 
+/* Grab the last path component, mostly for tracing. */
+static inline const unsigned char *
+xfs_imeta_lastpath(
+	const struct xfs_imeta_update	*upd)
+{
+	if (upd->path && upd->path->im_path && upd->path->im_depth > 0)
+		return upd->path->im_path[upd->path->im_depth - 1];
+	return "?";
+}
+
 /* Lookup keys for static metadata inodes. */
 extern const struct xfs_imeta_path XFS_IMETA_RTBITMAP;
 extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bedfed2ef3e60..12f68507d7a74 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -5091,13 +5091,16 @@ DECLARE_EVENT_CLASS(xfs_imeta_update_class,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
+		__string(fname, xfs_imeta_lastpath(upd))
 	),
 	TP_fast_assign(
 		__entry->dev = upd->mp->m_super->s_dev;
 		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
+		__assign_str(fname, xfs_imeta_lastpath(upd));
 	),
-	TP_printk("dev %d:%d ino 0x%llx",
+	TP_printk("dev %d:%d fname '%s' ino 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __get_str(fname),
 		  __entry->ino)
 )
 
@@ -5121,14 +5124,17 @@ DECLARE_EVENT_CLASS(xfs_imeta_update_error_class,
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(int, error)
+		__string(fname, xfs_imeta_lastpath(upd))
 	),
 	TP_fast_assign(
 		__entry->dev = upd->mp->m_super->s_dev;
 		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
 		__entry->error = error;
+		__assign_str(fname, xfs_imeta_lastpath(upd));
 	),
-	TP_printk("dev %d:%d ino 0x%llx error %d",
+	TP_printk("dev %d:%d fname '%s' ino 0x%llx error %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __get_str(fname),
 		  __entry->ino,
 		  __entry->error)
 )


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/32] xfs: enforce metadata inode flag
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:08   ` [PATCH 10/32] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
@ 2023-12-31 21:09   ` Darrick J. Wong
  2023-12-31 21:09   ` [PATCH 12/32] xfs: allow deletion of metadata directory files Darrick J. Wong
                     ` (20 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:09 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add checks for the metadata inode flag so that we don't ever leak
metadata inodes out to userspace, and we don't ever try to read a
regular inode as metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |   73 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_buf.h |    3 ++
 fs/xfs/scrub/common.c         |   10 ++++--
 fs/xfs/scrub/inode.c          |   25 +++++++++++++-
 fs/xfs/scrub/inode_repair.c   |   10 ++++++
 fs/xfs/xfs_icache.c           |    2 +
 fs/xfs/xfs_inode.c            |   13 +++++++
 7 files changed, 132 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d79002343d0b6..18d9e71a3bb30 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -460,6 +460,73 @@ xfs_dinode_verify_nrext64(
 	return NULL;
 }
 
+/*
+ * Validate all the picky requirements we have for a file that claims to be
+ * filesystem metadata.
+ */
+xfs_failaddr_t
+xfs_dinode_verify_metadir(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip,
+	uint16_t		mode,
+	uint16_t		flags,
+	uint64_t		flags2)
+{
+	if (!xfs_has_metadir(mp))
+		return __this_address;
+
+	/* V5 filesystem only */
+	if (dip->di_version < 3)
+		return __this_address;
+
+	/* V3 inode fields that are always zero */
+	if (dip->di_onlink)
+		return __this_address;
+	if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad)
+		return __this_address;
+	if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter)
+		return __this_address;
+
+	/* Metadata files can only be directories or regular files */
+	if (!S_ISDIR(mode) && !S_ISREG(mode))
+		return __this_address;
+
+	/* They must have zero access permissions */
+	if (mode & 0777)
+		return __this_address;
+
+	/* DMAPI event and state masks are zero */
+	if (dip->di_dmevmask || dip->di_dmstate)
+		return __this_address;
+
+	/* User, group, and project IDs must be zero */
+	if (dip->di_uid || dip->di_gid ||
+	    dip->di_projid_lo || dip->di_projid_hi)
+		return __this_address;
+
+	/* Immutable, sync, noatime, nodump, and nodefrag flags must be set */
+	if (!(flags & XFS_DIFLAG_IMMUTABLE))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_SYNC))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NOATIME))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NODUMP))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NODEFRAG))
+		return __this_address;
+
+	/* Directories must have nosymlinks flags set */
+	if (S_ISDIR(mode) && !(flags & XFS_DIFLAG_NOSYMLINKS))
+		return __this_address;
+
+	/* dax flags2 must not be set */
+	if (flags2 & XFS_DIFLAG2_DAX)
+		return __this_address;
+
+	return NULL;
+}
+
 xfs_failaddr_t
 xfs_dinode_verify(
 	struct xfs_mount	*mp,
@@ -624,6 +691,12 @@ xfs_dinode_verify(
 	    !xfs_has_bigtime(mp))
 		return __this_address;
 
+	if (flags2 & XFS_DIFLAG2_METADIR) {
+		fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2);
+		if (fa)
+			return fa;
+	}
+
 	return NULL;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af4..8d43d2641c732 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -28,6 +28,9 @@ int	xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
 
 xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
 			   struct xfs_dinode *dip);
+xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp,
+		struct xfs_dinode *dip, uint16_t mode, uint16_t flags,
+		uint64_t flags2);
 xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
 		uint32_t extsize, uint16_t mode, uint16_t flags);
 xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 74be3b7341b51..fbc28bd98e957 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -942,9 +942,15 @@ xchk_iget_for_scrubbing(
 	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
 		return xchk_install_live_inode(sc, ip_in);
 
-	/* Reject internal metadata files and obviously bad inode numbers. */
-	if (xfs_internal_inum(mp, sc->sm->sm_ino))
+	/*
+	 * On pre-metadir filesystems, reject internal metadata files.  For
+	 * metadir filesystems, scrubbing any file in the metadata directory
+	 * tree by handle is allowed, because that is the only way to validate
+	 * the metadata directory structure.
+	 */
+	if (!xfs_has_metadir(mp) && xfs_internal_inum(mp, sc->sm->sm_ino))
 		return -ENOENT;
+	/* Reject obviously bad inode numbers. */
 	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
 		return -ENOENT;
 
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index d32716fb2fecf..7357bf5156ba3 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -60,6 +60,22 @@ xchk_install_handle_iscrub(
 	if (error)
 		return error;
 
+	/*
+	 * Don't allow scrubbing by handle of any inodes in the metadata
+	 * directory tree.  We don't know if any of the scans launched by
+	 * this scrubber will end up indirectly trying to lock this file.
+	 *
+	 * Scrubbers of inode-rooted metadata files (i.e. the non-dir files)
+	 * will attach all the resources needed to scrub the inode and call
+	 * xchk_inode directly.  Hence we do not let userspace call us
+	 * directly.
+	 */
+	if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) {
+		xchk_irele(sc, ip);
+		sc->ip = NULL;
+		return -ENOENT;
+	}
+
 	return xchk_prepare_iscrub(sc);
 }
 
@@ -94,9 +110,14 @@ xchk_setup_inode(
 		return xchk_prepare_iscrub(sc);
 	}
 
-	/* Reject internal metadata files and obviously bad inode numbers. */
-	if (xfs_internal_inum(mp, sc->sm->sm_ino))
+	/*
+	 * On pre-metadir filesystems, reject internal metadata files.  For
+	 * metadir filesystems, scrubbing directory inodes in the metadata
+	 * directory tree by handle is allowed.
+	 */
+	if (!xfs_has_metadir(mp) && xfs_internal_inum(mp, sc->sm->sm_ino))
 		return -ENOENT;
+	/* Reject obviously bad inode numbers. */
 	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
 		return -ENOENT;
 
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 187f782d51f22..e2dde5834051c 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -517,6 +517,16 @@ xrep_dinode_flags(
 		dip->di_nrext64_pad = 0;
 	else if (dip->di_version >= 3)
 		dip->di_v3_pad = 0;
+
+	if (flags2 & XFS_DIFLAG2_METADIR) {
+		xfs_failaddr_t	fa;
+
+		fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags,
+				flags2);
+		if (fa)
+			flags2 &= ~XFS_DIFLAG2_METADIR;
+	}
+
 	dip->di_flags = cpu_to_be16(flags);
 	dip->di_flags2 = cpu_to_be64(flags2);
 }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index aa8516e8ab64f..64c52e8621fdb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -841,6 +841,8 @@ xfs_imeta_iget(
 		goto bad_rele;
 	if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
 		goto bad_rele;
+	if (xfs_has_metadir(mp) && !xfs_is_metadir_inode(ip))
+		goto bad_rele;
 
 	*ipp = ip;
 	return 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f2145e2701f51..4699a9382f757 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -577,8 +577,19 @@ xfs_lookup(
 	if (error)
 		goto out_free_name;
 
+	/*
+	 * Fail if a directory entry in the regular directory tree points to
+	 * a metadata file.
+	 */
+	if (XFS_IS_CORRUPT(dp->i_mount, xfs_is_metadir_inode(*ipp))) {
+		error = -EFSCORRUPTED;
+		goto out_irele;
+	}
+
 	return 0;
 
+out_irele:
+	xfs_irele(*ipp);
 out_free_name:
 	if (ci_name)
 		kmem_free(ci_name->name);
@@ -2746,6 +2757,8 @@ void
 xfs_imeta_irele(
 	struct xfs_inode	*ip)
 {
+	ASSERT(!xfs_has_metadir(ip->i_mount) || xfs_is_metadir_inode(ip));
+
 	xfs_irele(ip);
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/32] xfs: allow deletion of metadata directory files
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:09   ` [PATCH 11/32] xfs: enforce metadata inode flag Darrick J. Wong
@ 2023-12-31 21:09   ` Darrick J. Wong
  2023-12-31 21:09   ` [PATCH 13/32] xfs: read and write metadata inode directory Darrick J. Wong
                     ` (19 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:09 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it possible to free metadata files once we've unlinked them from
the directory structure.  We don't do this in the kernel, at least not
yet, but don't leave a logic bomb for later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.c |   49 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_inode.c        |    2 +-
 fs/xfs/xfs_inode.h        |    3 +++
 3 files changed, 52 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index bab82c3dde5de..38f79827b9354 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -23,6 +23,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_trans_space.h"
+#include "xfs_ag.h"
 
 /*
  * Metadata File Management
@@ -360,6 +361,38 @@ xfs_imeta_create(
 	return error;
 }
 
+/* Free a file from the metadata directory tree. */
+STATIC int
+xfs_imeta_ifree(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+	struct xfs_icluster	xic = { 0 };
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(ip->i_df.if_nextents == 0);
+	ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
+	ASSERT(ip->i_nblocks == 0);
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	error = xfs_dir_ifree(tp, pag, ip, &xic);
+	if (error)
+		goto out;
+
+	/* Metadata files do not support ownership changes or DMAPI. */
+
+	if (xic.deleted)
+		error = xfs_ifree_cluster(tp, pag, ip, &xic);
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
 /*
  * Unlink a metadata inode @upd->ip from the metadata directory given by @path.
  * The path must already exist.
@@ -368,10 +401,24 @@ int
 xfs_imeta_unlink(
 	struct xfs_imeta_update		*upd)
 {
+	int				error;
+
 	ASSERT(xfs_imeta_path_check(upd->path));
 	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
 
-	return xfs_imeta_sb_unlink(upd);
+	error = xfs_imeta_sb_unlink(upd);
+	if (error)
+		return error;
+
+	/*
+	 * Metadata files require explicit resource cleanup.  In other words,
+	 * the inactivation system will not touch these files, so we must free
+	 * the ondisk inode by ourselves if warranted.
+	 */
+	if (VFS_I(upd->ip)->i_nlink > 0)
+		return 0;
+
+	return xfs_imeta_ifree(upd->tp, upd->ip);
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4699a9382f757..7f5ad390b6c7c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1807,7 +1807,7 @@ xfs_ifree_mark_inode_stale(
  * inodes that are in memory - they all must be marked stale and attached to
  * the cluster buffer.
  */
-static int
+int
 xfs_ifree_cluster(
 	struct xfs_trans	*tp,
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10fbb3d4d193e..e967fa10721f9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -550,6 +550,9 @@ uint		xfs_ilock_data_map_shared(struct xfs_inode *);
 uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 
 int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
+int		xfs_ifree_cluster(struct xfs_trans *tp, struct xfs_perag *pag,
+				struct xfs_inode *free_ip,
+				struct xfs_icluster *xic);
 int		xfs_itruncate_extents_flags(struct xfs_trans **,
 				struct xfs_inode *, int, xfs_fsize_t, int);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/32] xfs: read and write metadata inode directory
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:09   ` [PATCH 12/32] xfs: allow deletion of metadata directory files Darrick J. Wong
@ 2023-12-31 21:09   ` Darrick J. Wong
  2023-12-31 21:09   ` [PATCH 14/32] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
                     ` (18 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:09 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the bits we need to look up metadata inode numbers from the
metadata inode directory and save them back.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.c      |  563 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h      |   17 +
 fs/xfs/libxfs/xfs_inode_util.c |    3 
 fs/xfs/libxfs/xfs_trans_resv.c |    8 -
 fs/xfs/xfs_imeta_utils.c       |   77 +++++
 fs/xfs/xfs_trace.h             |   50 +++-
 6 files changed, 704 insertions(+), 14 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 38f79827b9354..5c5a801386a29 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -24,6 +24,8 @@
 #include "xfs_da_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
 
 /*
  * Metadata File Management
@@ -44,9 +46,16 @@
  * this structure must be passed to xfs_imeta_end_update to free resources that
  * cannot be freed during the transaction.
  *
- * Right now we only support callers passing in the predefined metadata inode
- * paths; the goal is that callers will some day locate metadata inodes based
- * on path lookups into a metadata directory structure.
+ * When the metadata directory tree (metadir) feature is enabled, we can create
+ * a complex directory tree in which to store metadata inodes.  Inodes within
+ * the metadata directory tree should have the "metadata" inode flag set to
+ * prevent them from being exposed to the outside world.
+ *
+ * Callers are not expected to take the IOLOCK of metadata directories.  They
+ * are expected to take the ILOCK of any inode in the metadata directory tree
+ * (just like the regular to synchronize access to that inode.  It is not
+ * necessary to take the MMAPLOCK since metadata inodes should never be exposed
+ * to user space.
  */
 
 /* Static metadata inode paths */
@@ -62,6 +71,11 @@ XFS_IMETA_DEFINE_PATH(XFS_IMETA_USRQUOTA,	usrquota_path);
 XFS_IMETA_DEFINE_PATH(XFS_IMETA_GRPQUOTA,	grpquota_path);
 XFS_IMETA_DEFINE_PATH(XFS_IMETA_PRJQUOTA,	prjquota_path);
 
+const struct xfs_imeta_path XFS_IMETA_METADIR = {
+	.im_depth = 0,
+	.im_ftype = XFS_DIR3_FT_DIR,
+};
+
 /* Are these two paths equal? */
 STATIC bool
 xfs_imeta_path_compare(
@@ -119,6 +133,10 @@ static const struct xfs_imeta_sbmap {
 		.path	= &XFS_IMETA_PRJQUOTA,
 		.offset	= offsetof(struct xfs_sb, sb_pquotino),
 	},
+	{
+		.path	= &XFS_IMETA_METADIR,
+		.offset	= offsetof(struct xfs_sb, sb_metadirino),
+	},
 	{ NULL, 0 },
 };
 
@@ -288,6 +306,521 @@ xfs_imeta_sb_link(
 	return 0;
 }
 
+/* Functions for storing and retrieving metadata directory inode values. */
+
+static inline void
+xfs_imeta_set_xname(
+	struct xfs_name			*xname,
+	const struct xfs_imeta_path	*path,
+	unsigned int			path_idx,
+	unsigned char			ftype)
+{
+	xname->name = (const unsigned char *)path->im_path[path_idx];
+	xname->len = strlen(path->im_path[path_idx]);
+	xname->type = ftype;
+}
+
+/*
+ * Look up the inode number and filetype for an exact name in a directory.
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_imeta_dir_lookup(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_name		*xname,
+	xfs_ino_t		*ino)
+{
+	struct xfs_da_args	args = {
+		.trans		= tp,
+		.dp		= dp,
+		.geo		= dp->i_mount->m_dir_geo,
+		.name		= xname->name,
+		.namelen	= xname->len,
+		.hashval	= xfs_dir2_hashname(dp->i_mount, xname),
+		.whichfork	= XFS_DATA_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+		.owner		= dp->i_ino,
+	};
+	bool			isblock, isleaf;
+	int			error;
+
+	if (xfs_is_shutdown(dp->i_mount))
+		return -EIO;
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_lookup(&args);
+		goto out_unlock;
+	}
+
+	/* dir2 functions require that the data fork is loaded */
+	error = xfs_iread_extents(tp, dp, XFS_DATA_FORK);
+	if (error)
+		goto out_unlock;
+
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		goto out_unlock;
+
+	if (isblock) {
+		error = xfs_dir2_block_lookup(&args);
+		goto out_unlock;
+	}
+
+	error = xfs_dir2_isleaf(&args, &isleaf);
+	if (error)
+		goto out_unlock;
+
+	if (isleaf) {
+		error = xfs_dir2_leaf_lookup(&args);
+		goto out_unlock;
+	}
+
+	error = xfs_dir2_node_lookup(&args);
+
+out_unlock:
+	if (error == -EEXIST)
+		error = 0;
+	if (error)
+		return error;
+
+	*ino = args.inumber;
+	xname->type = args.filetype;
+	return 0;
+}
+
+/*
+ * Given a parent directory @dp and a metadata inode path component @xname,
+ * Look up the inode number in the directory, returning it in @ino.
+ * @xname.type must match the directory entry's ftype.
+ *
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_imeta_dir_lookup_component(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_name		*xname,
+	xfs_ino_t		*ino)
+{
+	int			type_wanted = xname->type;
+	int			error;
+
+	if (!S_ISDIR(VFS_I(dp)->i_mode))
+		return -EFSCORRUPTED;
+
+	error = xfs_imeta_dir_lookup(tp, dp, xname, ino);
+	if (error)
+		return error;
+	if (!xfs_verify_ino(dp->i_mount, *ino))
+		return -EFSCORRUPTED;
+	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted)
+		return -EFSCORRUPTED;
+
+	trace_xfs_imeta_dir_lookup(dp, xname, *ino);
+	return 0;
+}
+
+/*
+ * Traverse a metadata directory tree path, returning the inode corresponding
+ * to the parent of the last path component.  If any of the path components do
+ * not exist, return -ENOENT.  Caller must supply a transaction to avoid
+ * livelocks on btree cycles.
+ *
+ * @dp is returned without any locks held.
+ */
+int
+xfs_imeta_dir_parent(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		**dpp)
+{
+	struct xfs_name			xname;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_inode		*dp = NULL;
+	xfs_ino_t			ino;
+	unsigned int			i;
+	int				error;
+
+	/* Caller wanted the root, we're done! */
+	if (path->im_depth == 0)
+		goto out;
+
+	/* No metadata directory means no parent. */
+	if (mp->m_metadirip == NULL)
+		return -ENOENT;
+
+	/* Grab a new reference to the metadir root dir. */
+	error = xfs_imeta_iget(tp, mp->m_metadirip->i_ino, XFS_DIR3_FT_DIR,
+			&dp);
+	if (error)
+		return error;
+
+	for (i = 0; i < path->im_depth - 1; i++) {
+		struct xfs_inode	*ip = NULL;
+
+		xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+		/* Look up the name in the current directory. */
+		xfs_imeta_set_xname(&xname, path, i, XFS_DIR3_FT_DIR);
+		error = xfs_imeta_dir_lookup_component(tp, dp, &xname, &ino);
+		if (error)
+			goto out_rele;
+
+		/*
+		 * Grab the child inode while we still have the parent
+		 * directory locked.
+		 */
+		error = xfs_imeta_iget(tp, ino, XFS_DIR3_FT_DIR, &ip);
+		if (error)
+			goto out_rele;
+
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		xfs_imeta_irele(dp);
+		dp = ip;
+	}
+
+out:
+	*dpp = dp;
+	return 0;
+
+out_rele:
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	xfs_imeta_irele(dp);
+	return error;
+}
+
+/*
+ * Look up a metadata inode from the metadata directory.  If the last path
+ * component doesn't exist, return NULLFSINO.  If any other part of the path
+ * does not exist, return -ENOENT so we can distinguish the two.
+ */
+STATIC int
+xfs_imeta_dir_lookup_int(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	struct xfs_name			xname;
+	struct xfs_inode		*dp = NULL;
+	xfs_ino_t			ino;
+	int				error;
+
+	/* metadir ino is recorded in superblock */
+	if (xfs_imeta_path_compare(path, &XFS_IMETA_METADIR))
+		return xfs_imeta_sb_lookup(tp->t_mountp, path, inop);
+
+	ASSERT(path->im_depth > 0);
+
+	/* Find the parent of the last path component. */
+	error = xfs_imeta_dir_parent(tp, path, &dp);
+	if (error)
+		return error;
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, path, path->im_depth - 1, path->im_ftype);
+	error = xfs_imeta_dir_lookup_component(tp, dp, &xname, &ino);
+	switch (error) {
+	case 0:
+		*inop = ino;
+		break;
+	case -ENOENT:
+		*inop = NULLFSINO;
+		error = 0;
+		break;
+	}
+
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	xfs_imeta_irele(dp);
+	return error;
+}
+
+/*
+ * Load all the metadata inode pointers that are cached in the in-core
+ * superblock but live somewhere in the metadata directory tree.
+ */
+STATIC int
+xfs_imeta_dir_mount(
+	struct xfs_trans		*tp)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	const struct xfs_imeta_sbmap	*p;
+	xfs_ino_t			*sb_inop;
+	int				err2;
+	int				error = 0;
+
+	for (p = xfs_imeta_sbmaps; p->path && p->path->im_depth > 0; p++) {
+		if (p->path == &XFS_IMETA_METADIR)
+			continue;
+		sb_inop = xfs_imeta_sbmap_to_inop(mp, p);
+		err2 = xfs_imeta_dir_lookup_int(tp, p->path, sb_inop);
+		if (err2 == -ENOENT) {
+			*sb_inop = NULLFSINO;
+			continue;
+		}
+		if (!error && err2)
+			error = err2;
+	}
+
+	return error;
+}
+
+/* Set up an inode to be recognized as a metadata directory inode. */
+void
+xfs_imeta_set_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	VFS_I(ip)->i_mode &= ~0777;
+	VFS_I(ip)->i_uid = GLOBAL_ROOT_UID;
+	VFS_I(ip)->i_gid = GLOBAL_ROOT_GID;
+	ip->i_projid = 0;
+	ip->i_diflags |= (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_SYNC |
+			  XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP |
+			  XFS_DIFLAG_NODEFRAG);
+	if (S_ISDIR(VFS_I(ip)->i_mode))
+		ip->i_diflags |= XFS_DIFLAG_NOSYMLINKS;
+	ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+	ip->i_diflags2 |= XFS_DIFLAG2_METADIR;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+
+/* Clear the metadata directory inode flag. */
+void
+xfs_imeta_clear_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+
+	ip->i_diflags2 &= ~XFS_DIFLAG2_METADIR;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+/*
+ * Create a new metadata inode accessible via the given metadata directory path.
+ * Callers must ensure that the directory entry does not already exist; a new
+ * one will be created.
+ */
+STATIC int
+xfs_imeta_dir_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode)
+{
+	struct xfs_icreate_args		args = {
+		.pip			= upd->dp,
+		.nlink			= S_ISDIR(mode) ? 2 : 1,
+	};
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+
+	/* metadir ino is recorded in superblock; only mkfs gets to do this */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		error = xfs_imeta_sb_create(upd, mode);
+		if (error)
+			return error;
+
+		/* Set the metadata iflag, initialize directory. */
+		xfs_imeta_set_iflag(upd->tp, upd->ip);
+		return xfs_dir_init(upd->tp, upd->ip, upd->ip);
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	xfs_icreate_args_rootfile(&args, mp, mode, xfs_has_parent(mp));
+
+	/* Check that the name does not already exist in the directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			XFS_DIR3_FT_UNKNOWN);
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dialloc(&upd->tp, upd->dp->i_ino, mode, &ino);
+	if (error)
+		return error;
+	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+	if (error)
+		return error;
+	du.ip = upd->ip;
+	xfs_imeta_set_iflag(upd->tp, upd->ip);
+	upd->ip_locked = true;
+
+	/*
+	 * Join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dialloc rolls the transaction.
+	 */
+	xfs_trans_ijoin(upd->tp, upd->dp, 0);
+
+	/* Create the entry. */
+	if (S_ISDIR(args.mode))
+		resblks = xfs_mkdir_space_res(mp, xname.len);
+	else
+		resblks = xfs_create_space_res(mp, xname.len);
+	xname.type = xfs_mode_to_ftype(args.mode);
+
+	trace_xfs_imeta_dir_try_create(upd);
+
+	error = xfs_dir_create_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	/* Metadir files are not accounted to quota. */
+
+	trace_xfs_imeta_dir_create(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = ino;
+	return 0;
+}
+
+/*
+ * Remove the given entry from the metadata directory and drop the link count
+ * of the metadata inode.
+ */
+STATIC int
+xfs_imeta_dir_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ip			= upd->ip,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	/* Metadata directory root cannot be unlinked. */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case 0:
+		if (ino != upd->ip->i_ino)
+			error = -ENOENT;
+		break;
+	case -ENOENT:
+		error = -EFSCORRUPTED;
+		break;
+	}
+	if (error)
+		return error;
+
+	resblks = xfs_remove_space_res(mp, xname.len);
+	error = xfs_dir_remove_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_dir_unlink(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = NULLFSINO;
+	return 0;
+}
+
+/* Set the given path in the metadata directory to point to an inode. */
+STATIC int
+xfs_imeta_dir_link(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ip			= upd->ip,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	/* Metadata directory root cannot be linked. */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	resblks = xfs_link_space_res(mp, xname.len);
+	error = xfs_dir_add_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_dir_link(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = upd->ip->i_ino;
+	return 0;
+}
+
 /* General functions for managing metadata inode pointers */
 
 /*
@@ -318,7 +851,13 @@ xfs_imeta_lookup(
 
 	ASSERT(xfs_imeta_path_check(path));
 
-	error = xfs_imeta_sb_lookup(mp, path, &ino);
+	if (xfs_has_metadir(mp)) {
+		error = xfs_imeta_dir_lookup_int(tp, path, &ino);
+		if (error == -ENOENT)
+			return -EFSCORRUPTED;
+	} else {
+		error = xfs_imeta_sb_lookup(mp, path, &ino);
+	}
 	if (error)
 		return error;
 
@@ -350,13 +889,17 @@ xfs_imeta_create(
 	umode_t				mode,
 	struct xfs_inode		**ipp)
 {
+	struct xfs_mount		*mp = upd->mp;
 	int				error;
 
 	ASSERT(xfs_imeta_path_check(upd->path));
 
 	*ipp = NULL;
 
-	error = xfs_imeta_sb_create(upd, mode);
+	if (xfs_has_metadir(mp))
+		error = xfs_imeta_dir_create(upd, mode);
+	else
+		error = xfs_imeta_sb_create(upd, mode);
 	*ipp = upd->ip;
 	return error;
 }
@@ -406,7 +949,10 @@ xfs_imeta_unlink(
 	ASSERT(xfs_imeta_path_check(upd->path));
 	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
 
-	error = xfs_imeta_sb_unlink(upd);
+	if (xfs_has_metadir(upd->mp))
+		error = xfs_imeta_dir_unlink(upd);
+	else
+		error = xfs_imeta_sb_unlink(upd);
 	if (error)
 		return error;
 
@@ -432,6 +978,8 @@ xfs_imeta_link(
 {
 	ASSERT(xfs_imeta_path_check(upd->path));
 
+	if (xfs_has_metadir(upd->mp))
+		return xfs_imeta_dir_link(upd);
 	return xfs_imeta_sb_link(upd);
 }
 
@@ -462,5 +1010,8 @@ int
 xfs_imeta_mount(
 	struct xfs_trans	*tp)
 {
+	if (xfs_has_metadir(tp->t_mountp))
+		return xfs_imeta_dir_mount(tp);
+
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index 60e0f6a6c134a..b8e360bbdfbe1 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -13,6 +13,7 @@
 #define XFS_IMETA_DEFINE_PATH(name, path) \
 const struct xfs_imeta_path name = { \
 	.im_path = (path), \
+	.im_ftype = XFS_DIR3_FT_REG_FILE, \
 	.im_depth = ARRAY_SIZE(path), \
 }
 
@@ -23,6 +24,9 @@ struct xfs_imeta_path {
 
 	/* Number of strings in path. */
 	uint8_t			im_depth;
+
+	/* Expected file type. */
+	uint8_t			im_ftype;
 };
 
 /* Cleanup widget for metadata inode creation and deletion. */
@@ -32,9 +36,16 @@ struct xfs_imeta_update {
 
 	const struct xfs_imeta_path *path;
 
+	/* Parent pointer update context */
+	struct xfs_parent_args	*ppargs;
+
+	/* Parent directory */
+	struct xfs_inode	*dp;
+
 	/* Metadata inode */
 	struct xfs_inode	*ip;
 
+	unsigned int		dp_locked:1;
 	unsigned int		ip_locked:1;
 };
 
@@ -54,9 +65,15 @@ extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;
 extern const struct xfs_imeta_path XFS_IMETA_USRQUOTA;
 extern const struct xfs_imeta_path XFS_IMETA_GRPQUOTA;
 extern const struct xfs_imeta_path XFS_IMETA_PRJQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_METADIR;
 
 int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
 		xfs_ino_t *ino);
+int xfs_imeta_dir_parent(struct xfs_trans *tp,
+		const struct xfs_imeta_path *path, struct xfs_inode **dpp);
+
+void xfs_imeta_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_imeta_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 
 int xfs_imeta_create(struct xfs_imeta_update *upd, umode_t mode,
 		struct xfs_inode **ipp);
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 2f190b875c7f8..7833530102d1c 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -23,6 +23,7 @@
 #include "xfs_ag.h"
 #include "xfs_iunlink_item.h"
 #include "xfs_inode_item.h"
+#include "xfs_imeta.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -649,6 +650,8 @@ xfs_droplink(
 	if (inode->i_nlink)
 		return 0;
 
+	if (xfs_is_metadir_inode(ip))
+		xfs_imeta_clear_iflag(tp, ip);
 	return xfs_iunlink(tp, ip);
 }
 
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index a0545daf039c7..0840d5000bf97 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -1121,7 +1121,10 @@ xfs_calc_imeta_create_resv(
 	unsigned int		ret;
 
 	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-	ret += resp->tr_create.tr_logres;
+	if (xfs_has_metadir(mp))
+		ret += max(resp->tr_create.tr_logres, resp->tr_mkdir.tr_logres);
+	else
+		ret += resp->tr_create.tr_logres;
 	return ret;
 }
 
@@ -1131,6 +1134,9 @@ xfs_calc_imeta_create_count(
 	struct xfs_mount	*mp,
 	struct xfs_trans_resv	*resp)
 {
+	if (xfs_has_metadir(mp))
+		return max(resp->tr_create.tr_logcount,
+			   resp->tr_mkdir.tr_logcount);
 	return resp->tr_create.tr_logcount;
 }
 
diff --git a/fs/xfs/xfs_imeta_utils.c b/fs/xfs/xfs_imeta_utils.c
index 65a3aea49a5fc..a8ff46a3d502e 100644
--- a/fs/xfs/xfs_imeta_utils.c
+++ b/fs/xfs/xfs_imeta_utils.c
@@ -22,6 +22,7 @@
 #include "xfs_imeta.h"
 #include "xfs_imeta_utils.h"
 #include "xfs_trace.h"
+#include "xfs_parent.h"
 
 /* Initialize a metadata update structure. */
 static inline int
@@ -30,10 +31,33 @@ xfs_imeta_init(
 	const struct xfs_imeta_path	*path,
 	struct xfs_imeta_update		*upd)
 {
+	struct xfs_trans		*tp;
+	int				error;
+
 	memset(upd, 0, sizeof(struct xfs_imeta_update));
 	upd->mp = mp;
 	upd->path = path;
-	return 0;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	/*
+	 * Find the parent of the last path component.  If the parent path does
+	 * not exist, we consider this corruption because paths are supposed
+	 * to exist.  For example, if the path is /quota/user, we require that
+	 * /quota already exists.
+	 */
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+	error = xfs_imeta_dir_parent(tp, upd->path, &upd->dp);
+	xfs_trans_cancel(tp);
+	if (error == -ENOENT)
+		return -EFSCORRUPTED;
+	if (error)
+		return error;
+
+	return xfs_parent_start(mp, &upd->ppargs);
 }
 
 /*
@@ -48,11 +72,25 @@ xfs_imeta_teardown(
 {
 	trace_xfs_imeta_teardown(upd, error);
 
+	if (upd->ppargs) {
+		xfs_parent_finish(upd->mp, upd->ppargs);
+		upd->ppargs = NULL;
+	}
+
 	if (upd->ip) {
 		if (upd->ip_locked)
 			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
 		upd->ip_locked = false;
 	}
+
+	if (upd->dp) {
+		if (upd->dp_locked)
+			xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+		upd->dp_locked = false;
+
+		xfs_imeta_irele(upd->dp);
+		upd->dp = NULL;
+	}
 }
 
 /*
@@ -83,6 +121,15 @@ xfs_imeta_start_create(
 	if (error)
 		goto out_teardown;
 
+	/*
+	 * Lock the parent directory if there is one.  We can't ijoin it to
+	 * the transaction until after the child file has been created.
+	 */
+	if (upd->dp) {
+		xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+		upd->dp_locked = true;
+	}
+
 	trace_xfs_imeta_start_create(upd);
 	return 0;
 out_teardown:
@@ -111,10 +158,30 @@ xfs_imeta_start_dir_update(
 
 	upd->ip = ip;
 
-	error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0, false,
-			&upd->tp);
-	if (error)
-		goto out_teardown;
+	if (upd->dp) {
+		int			nospace_error = 0;
+
+		error = xfs_trans_alloc_dir(upd->dp, tr_resv, upd->ip,
+				&resblks, &upd->tp, &nospace_error);
+		if (error)
+			goto out_teardown;
+		if (!resblks) {
+			/* We don't allow reservationless updates. */
+			xfs_trans_cancel(upd->tp);
+			upd->tp = NULL;
+			xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+			error = nospace_error;
+			goto out_teardown;
+		}
+
+		upd->dp_locked = true;
+	} else {
+		error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0,
+				false, &upd->tp);
+		if (error)
+			goto out_teardown;
+	}
 
 	upd->ip_locked = true;
 	return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 12f68507d7a74..17f364e1b9613 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -5090,16 +5090,19 @@ DECLARE_EVENT_CLASS(xfs_imeta_update_class,
 	TP_ARGS(upd),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
 		__field(xfs_ino_t, ino)
 		__string(fname, xfs_imeta_lastpath(upd))
 	),
 	TP_fast_assign(
 		__entry->dev = upd->mp->m_super->s_dev;
+		__entry->dp_ino = upd->dp ? upd->dp->i_ino : NULLFSINO;
 		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
 		__assign_str(fname, xfs_imeta_lastpath(upd));
 	),
-	TP_printk("dev %d:%d fname '%s' ino 0x%llx",
+	TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
 		  __get_str(fname),
 		  __entry->ino)
 )
@@ -5116,24 +5119,31 @@ DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_update_cancel);
 DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_create);
 DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_unlink);
 DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_sb_link);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_dir_try_create);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_dir_create);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_dir_unlink);
+DEFINE_IMETA_UPDATE_EVENT(xfs_imeta_dir_link);
 
 DECLARE_EVENT_CLASS(xfs_imeta_update_error_class,
 	TP_PROTO(const struct xfs_imeta_update *upd, int error),
 	TP_ARGS(upd, error),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
 		__field(xfs_ino_t, ino)
 		__field(int, error)
 		__string(fname, xfs_imeta_lastpath(upd))
 	),
 	TP_fast_assign(
 		__entry->dev = upd->mp->m_super->s_dev;
+		__entry->dp_ino = upd->dp ? upd->dp->i_ino : NULLFSINO;
 		__entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
 		__entry->error = error;
 		__assign_str(fname, xfs_imeta_lastpath(upd));
 	),
-	TP_printk("dev %d:%d fname '%s' ino 0x%llx error %d",
+	TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx error %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
 		  __get_str(fname),
 		  __entry->ino,
 		  __entry->error)
@@ -5145,6 +5155,42 @@ DEFINE_EVENT(xfs_imeta_update_error_class, name, \
 	TP_ARGS(upd, error))
 DEFINE_IMETA_UPDATE_ERROR_EVENT(xfs_imeta_teardown);
 
+DECLARE_EVENT_CLASS(xfs_imeta_dir_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name,
+		 xfs_ino_t ino),
+	TP_ARGS(dp, name, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
+		__field(xfs_ino_t, ino)
+		__field(int, ftype)
+		__field(int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(dp)->i_sb->s_dev;
+		__entry->dp_ino = dp->i_ino;
+		__entry->ino = ino,
+		__entry->ftype = name->type;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dir 0x%llx type %s name '%.*s' ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+		  __entry->namelen,
+		  __get_str(name),
+		  __entry->ino)
+)
+
+#define DEFINE_IMETA_DIR_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_dir_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, \
+		 xfs_ino_t ino), \
+	TP_ARGS(dp, name, ino))
+DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_lookup);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/32] xfs: ensure metadata directory paths exist before creating files
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:09   ` [PATCH 13/32] xfs: read and write metadata inode directory Darrick J. Wong
@ 2023-12-31 21:09   ` Darrick J. Wong
  2023-12-31 21:10   ` [PATCH 15/32] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
                     ` (17 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:09 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Since xfs_imeta_create can create new metadata files arbitrarily deep in
the metadata directory tree, we must supply a function that can ensure
that all directories in a path exist, and call it before the quota
functions create the quota inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_imeta_utils.c |   71 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_imeta_utils.h |    3 ++
 fs/xfs/xfs_qm.c          |   17 +++++++++++
 3 files changed, 91 insertions(+)


diff --git a/fs/xfs/xfs_imeta_utils.c b/fs/xfs/xfs_imeta_utils.c
index a8ff46a3d502e..9fbaa4323e3b2 100644
--- a/fs/xfs/xfs_imeta_utils.c
+++ b/fs/xfs/xfs_imeta_utils.c
@@ -265,3 +265,74 @@ xfs_imeta_cancel_update(
 
 	xfs_imeta_teardown(upd, error);
 }
+
+/* Create a metadata for the last component of the path. */
+STATIC int
+xfs_imeta_mkdir(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_update		upd;
+	struct xfs_inode		*ip = NULL;
+	int				error;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	/* Allocate a transaction to create the last directory. */
+	error = xfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		return error;
+
+	/* Create the subdirectory and take our reference. */
+	error = xfs_imeta_create(&upd, S_IFDIR, &ip);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_imeta_commit_update(&upd);
+
+	/*
+	 * We don't pass the directory we just created to the caller, so finish
+	 * setting up the inode, then release the dir and the dquots.
+	 */
+	goto out_irele;
+
+out_cancel:
+	xfs_imeta_cancel_update(&upd, error);
+out_irele:
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		xfs_irele(ip);
+	}
+	return error;
+}
+
+/*
+ * Make sure that every metadata directory path component exists and is a
+ * directory.
+ */
+int
+xfs_imeta_ensure_dirpath(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_path		temp_path = {
+		.im_path		= path->im_path,
+		.im_depth		= 1,
+		.im_ftype		= XFS_DIR3_FT_DIR,
+	};
+	unsigned int			i;
+	int				error = 0;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	for (i = 0; i < path->im_depth - 1; i++, temp_path.im_depth++) {
+		error = xfs_imeta_mkdir(mp, &temp_path);
+		if (error && error != -EEXIST)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/xfs_imeta_utils.h b/fs/xfs/xfs_imeta_utils.h
index 0235f7048ff1d..f1bd6699997bb 100644
--- a/fs/xfs/xfs_imeta_utils.h
+++ b/fs/xfs/xfs_imeta_utils.h
@@ -18,6 +18,9 @@ int xfs_imeta_start_unlink(struct xfs_mount *mp,
 		const struct xfs_imeta_path *path,
 		struct xfs_inode *ip, struct xfs_imeta_update *upd);
 
+int xfs_imeta_ensure_dirpath(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path);
+
 int xfs_imeta_commit_update(struct xfs_imeta_update *upd);
 void xfs_imeta_cancel_update(struct xfs_imeta_update *upd, int error);
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2d8e420f3ad34..1d28f0982840c 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,6 +836,23 @@ xfs_qm_qino_alloc(
 		return error;
 
 	if (need_alloc) {
+		/*
+		 * Ensure the quota directory exists, being careful to disable
+		 * quotas while we do this.  We'll have to quotacheck anyway,
+		 * so the temporary undercount of the directory tree shouldn't
+		 * affect the quota count.
+		 */
+		if (xfs_has_metadir(mp)) {
+			unsigned int	old_qflags;
+
+			old_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
+			mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+			error = xfs_imeta_ensure_dirpath(mp, path);
+			mp->m_qflags |= old_qflags;
+			if (error)
+				return error;
+		}
+
 		error = xfs_imeta_start_create(mp, path, &upd);
 	} else {
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, 0, 0, 0,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/32] xfs: disable the agi rotor for metadata inodes
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 21:09   ` [PATCH 14/32] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
@ 2023-12-31 21:10   ` Darrick J. Wong
  2023-12-31 21:10   ` [PATCH 16/32] xfs: hide metadata inodes from everyone because they are special Darrick J. Wong
                     ` (16 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:10 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Ideally, we'd put all the metadata inodes in one place if we could, so
that the metadata all stay reasonably close together instead of
spreading out over the disk.  Furthermore, if the log is internal we'd
probably prefer to keep the metadata near the log.  Therefore, disable
AGI rotoring for metadata inode allocations.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ialloc.c |   56 ++++++++++++++++++++++++++++++--------------
 fs/xfs/libxfs/xfs_ialloc.h |    2 +-
 fs/xfs/libxfs/xfs_imeta.c  |    4 ++-
 fs/xfs/scrub/tempfile.c    |    2 +-
 fs/xfs/xfs_inode.c         |    4 ++-
 fs/xfs/xfs_symlink.c       |    2 +-
 6 files changed, 45 insertions(+), 25 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d58d700ed8a8b..96d22f9698a7a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1799,6 +1799,37 @@ xfs_dialloc_try_ag(
 	return error;
 }
 
+/*
+ * Pick an AG for the new inode.
+ *
+ * Directories, symlinks, and regular files frequently allocate at least one
+ * block, so factor that potential expansion when we examine whether an AG has
+ * enough space for file creation.  Try to keep metadata files all in the same
+ * AG.
+ */
+static inline xfs_agnumber_t
+xfs_dialloc_pick_ag(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
+	umode_t			mode)
+{
+	xfs_agnumber_t		start_agno;
+
+	if (!dp)
+		return 0;
+	if (xfs_is_metadir_inode(dp))
+		return 0;
+
+	if (S_ISDIR(mode))
+		return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
+
+	start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
+	if (start_agno >= mp->m_maxagi)
+		start_agno = 0;
+
+	return start_agno;
+}
+
 /*
  * Allocate an on-disk inode.
  *
@@ -1810,34 +1841,23 @@ xfs_dialloc_try_ag(
 int
 xfs_dialloc(
 	struct xfs_trans	**tpp,
-	xfs_ino_t		parent,
+	struct xfs_inode	*dp,
 	umode_t			mode,
 	xfs_ino_t		*new_ino)
 {
 	struct xfs_mount	*mp = (*tpp)->t_mountp;
-	xfs_agnumber_t		agno;
-	int			error = 0;
-	xfs_agnumber_t		start_agno;
 	struct xfs_perag	*pag;
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	xfs_ino_t		ino = NULLFSINO;
+	xfs_ino_t		parent = dp ? dp->i_ino : 0;
+	xfs_agnumber_t		agno;
+	xfs_agnumber_t		start_agno;
 	bool			ok_alloc = true;
 	bool			low_space = false;
 	int			flags;
-	xfs_ino_t		ino = NULLFSINO;
+	int			error = 0;
 
-	/*
-	 * Directories, symlinks, and regular files frequently allocate at least
-	 * one block, so factor that potential expansion when we examine whether
-	 * an AG has enough space for file creation.
-	 */
-	if (S_ISDIR(mode))
-		start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
-				mp->m_maxagi;
-	else {
-		start_agno = XFS_INO_TO_AGNO(mp, parent);
-		if (start_agno >= mp->m_maxagi)
-			start_agno = 0;
-	}
+	start_agno = xfs_dialloc_pick_ag(mp, dp, mode);
 
 	/*
 	 * If we have already hit the ceiling of inode blocks then clear
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index f1412183bb44b..9bfe2d8d84b1d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -37,7 +37,7 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
  * Allocate an inode on disk.  Mode is used to tell whether the new inode will
  * need space, and whether it is a directory.
  */
-int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+int xfs_dialloc(struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode,
 		xfs_ino_t *new_ino);
 
 int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 5c5a801386a29..e3651c7bba2b3 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -230,7 +230,7 @@ xfs_imeta_sb_create(
 		return -EEXIST;
 
 	/* Create a new inode and set the sb pointer. */
-	error = xfs_dialloc(&upd->tp, 0, mode, &ino);
+	error = xfs_dialloc(&upd->tp, NULL, mode, &ino);
 	if (error)
 		return error;
 	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
@@ -662,7 +662,7 @@ xfs_imeta_dir_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dialloc(&upd->tp, upd->dp->i_ino, mode, &ino);
+	error = xfs_dialloc(&upd->tp, upd->dp, mode, &ino);
 	if (error)
 		return error;
 	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 6d207559df989..db3a9a47e1f82 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -87,7 +87,7 @@ xrep_tempfile_create(
 		goto out_release_dquots;
 
 	/* Allocate inode, set up directory. */
-	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+	error = xfs_dialloc(&tp, dp, mode, &ino);
 	if (error)
 		goto out_trans_cancel;
 	error = xfs_icreate(tp, ino, &args, &sc->tempip);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f5ad390b6c7c..66aa8bbcb1045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -768,7 +768,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
+	error = xfs_dialloc(&tp, dp, args->mode, &ino);
 	if (!error)
 		error = xfs_icreate(tp, ino, args, &du.ip);
 	if (error)
@@ -879,7 +879,7 @@ xfs_create_tmpfile(
 	if (error)
 		goto out_release_dquots;
 
-	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
+	error = xfs_dialloc(&tp, dp, args->mode, &ino);
 	if (!error)
 		error = xfs_icreate(tp, ino, args, &ip);
 	if (error)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index e7936858dfcf5..973be5e8b14be 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -169,7 +169,7 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
+	error = xfs_dialloc(&tp, dp, S_IFLNK, &ino);
 	if (!error)
 		error = xfs_icreate(tp, ino, &args, &du.ip);
 	if (error)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/32] xfs: hide metadata inodes from everyone because they are special
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 21:10   ` [PATCH 15/32] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
@ 2023-12-31 21:10   ` Darrick J. Wong
  2023-12-31 21:10   ` [PATCH 17/32] xfs: advertise metadata directory feature Darrick J. Wong
                     ` (15 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:10 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Metadata inodes are private files and therefore cannot be exposed to
userspace.  This means no bulkstat, no open-by-handle, no linking them
into the directory tree, and no feeding them to LSMs.  As such, we mark
them S_PRIVATE, which stops all that.

While we're at it, put them in a separate lockdep class so that it won't
get confused by "recursive" i_rwsem locking such as what happens when we
write to a rt file and need to allocate from the rt bitmap file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/tempfile.c |    8 ++++++++
 fs/xfs/xfs_iops.c       |   34 ++++++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index db3a9a47e1f82..583bbd5a02bc9 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -867,6 +867,14 @@ xrep_is_tempfile(
 	const struct xfs_inode	*ip)
 {
 	const struct inode	*inode = &ip->i_vnode;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/*
+	 * Files in the metadata directory tree also have S_PRIVATE set and
+	 * IOP_XATTR unset, so we must distinguish them separately.
+	 */
+	if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADIR))
+		return false;
 
 	if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
 		return true;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 90296b3c838aa..461e77dd54e38 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -45,6 +45,15 @@
 static struct lock_class_key xfs_nondir_ilock_class;
 static struct lock_class_key xfs_dir_ilock_class;
 
+/*
+ * Metadata directories and files are not exposed to userspace, which means
+ * that they never access any of the VFS IO locks and never experience page
+ * faults.  Give them separate locking classes so that lockdep will not
+ * complain about conflicts that cannot happen.
+ */
+static struct lock_class_key xfs_metadata_file_ilock_class;
+static struct lock_class_key xfs_metadata_dir_ilock_class;
+
 static int
 xfs_initxattrs(
 	struct inode		*inode,
@@ -1291,6 +1300,7 @@ xfs_setup_inode(
 {
 	struct inode		*inode = &ip->i_vnode;
 	gfp_t			gfp_mask;
+	bool			is_meta = xfs_is_metadata_inode(ip);
 
 	inode->i_ino = ip->i_ino;
 	inode->i_state |= I_NEW;
@@ -1302,6 +1312,16 @@ xfs_setup_inode(
 	i_size_write(inode, ip->i_disk_size);
 	xfs_diflags_to_iflags(ip, true);
 
+	/*
+	 * Mark our metadata files as private so that LSMs and the ACL code
+	 * don't try to add their own metadata or reason about these files,
+	 * and users cannot ever obtain file handles to them.
+	 */
+	if (is_meta) {
+		inode->i_flags |= S_PRIVATE;
+		inode->i_opflags &= ~IOP_XATTR;
+	}
+
 	if (S_ISDIR(inode->i_mode)) {
 		/*
 		 * We set the i_rwsem class here to avoid potential races with
@@ -1311,9 +1331,19 @@ xfs_setup_inode(
 		 */
 		lockdep_set_class(&inode->i_rwsem,
 				  &inode->i_sb->s_type->i_mutex_dir_key);
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+		if (is_meta)
+			lockdep_set_class(&ip->i_lock.mr_lock,
+					  &xfs_metadata_dir_ilock_class);
+		else
+			lockdep_set_class(&ip->i_lock.mr_lock,
+					  &xfs_dir_ilock_class);
 	} else {
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+		if (is_meta)
+			lockdep_set_class(&ip->i_lock.mr_lock,
+					  &xfs_metadata_file_ilock_class);
+		else
+			lockdep_set_class(&ip->i_lock.mr_lock,
+					  &xfs_nondir_ilock_class);
 	}
 
 	/*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/32] xfs: advertise metadata directory feature
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 21:10   ` [PATCH 16/32] xfs: hide metadata inodes from everyone because they are special Darrick J. Wong
@ 2023-12-31 21:10   ` Darrick J. Wong
  2023-12-31 21:10   ` [PATCH 18/32] xfs: allow bulkstat to return metadata directories Darrick J. Wong
                     ` (14 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:10 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Advertise the existence of the metadata directory feature; this will be
used by scrub to decide if it needs to scan the metadir too.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |    1 +
 fs/xfs/libxfs/xfs_sb.c |    2 ++
 2 files changed, 3 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 77fbca573e164..2e31fc2240e4b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_BIGTIME	(1 << 21) /* 64-bit nsec timestamps */
 #define XFS_FSOP_GEOM_FLAGS_INOBTCNT	(1 << 22) /* inobt btree counter */
 #define XFS_FSOP_GEOM_FLAGS_NREXT64	(1 << 23) /* large extent counters */
+#define XFS_FSOP_GEOM_FLAGS_METADIR	(1U << 29) /* metadata directories */
 #define XFS_FSOP_GEOM_FLAGS_PARENT	(1U << 30) /* parent pointers */
 
 /* atomic file extent swap available to userspace */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 28f65753c0c5a..7a20b9b4bccb3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1298,6 +1298,8 @@ xfs_fs_geometry(
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
 	if (xfs_atomic_swap_supported(mp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_ATOMIC_SWAP;
+	if (xfs_has_metadir(mp))
+		geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
 	geo->rtsectsize = sbp->sb_blocksize;
 	geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/32] xfs: allow bulkstat to return metadata directories
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 21:10   ` [PATCH 17/32] xfs: advertise metadata directory feature Darrick J. Wong
@ 2023-12-31 21:10   ` Darrick J. Wong
  2023-12-31 21:11   ` [PATCH 19/32] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
                     ` (13 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:10 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the V5 bulkstat ioctl to return information about metadata
directory files so that xfs_scrub can find and scrub them, since they
are otherwise ordinary directories.

(Metadata files of course require per-file scrub code and hence do not
need exposure.)

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |   10 +++++++++-
 fs/xfs/xfs_ioctl.c     |    7 +++++++
 fs/xfs/xfs_itable.c    |   34 ++++++++++++++++++++++++++++++----
 fs/xfs/xfs_itable.h    |    3 +++
 4 files changed, 49 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2e31fc2240e4b..ab961a0118157 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -491,9 +491,17 @@ struct xfs_bulk_ireq {
  */
 #define XFS_BULK_IREQ_NREXT64	(1U << 2)
 
+/*
+ * Allow bulkstat to return information about metadata directories.  This
+ * enables xfs_scrub to find them for scanning, as they are otherwise ordinary
+ * directories.
+ */
+#define XFS_BULK_IREQ_METADIR	(1U << 31)
+
 #define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
 				 XFS_BULK_IREQ_SPECIAL | \
-				 XFS_BULK_IREQ_NREXT64)
+				 XFS_BULK_IREQ_NREXT64 | \
+				 XFS_BULK_IREQ_METADIR)
 
 /* Operate on the root directory inode. */
 #define XFS_BULK_IREQ_SPECIAL_ROOT	(1)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 184916bc6528a..a887c1d5d69fb 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -824,6 +824,10 @@ xfs_bulk_ireq_setup(
 	if (hdr->flags & XFS_BULK_IREQ_NREXT64)
 		breq->flags |= XFS_IBULK_NREXT64;
 
+	/* Caller wants to see metadata directories in bulkstat output. */
+	if (hdr->flags & XFS_BULK_IREQ_METADIR)
+		breq->flags |= XFS_IBULK_METADIR;
+
 	return 0;
 }
 
@@ -914,6 +918,9 @@ xfs_ioc_inumbers(
 	if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
 		return -EFAULT;
 
+	if (hdr.flags & XFS_BULK_IREQ_METADIR)
+		return -EINVAL;
+
 	error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
 	if (error == -ECANCELED)
 		goto out_teardown;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 4610660f267e6..3339cf5be3a86 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -36,6 +36,15 @@ struct xfs_bstat_chunk {
 	struct xfs_bulkstat	*buf;
 };
 
+static inline bool
+want_metadir_file(
+	struct xfs_inode	*ip,
+	struct xfs_ibulk	*breq)
+{
+	return  xfs_is_metadir_inode(ip) &&
+		(breq->flags & XFS_IBULK_METADIR);
+}
+
 /*
  * Fill out the bulkstat info for a single inode and report it somewhere.
  *
@@ -69,9 +78,6 @@ xfs_bulkstat_one_int(
 	vfsuid_t		vfsuid;
 	vfsgid_t		vfsgid;
 
-	if (xfs_internal_inum(mp, ino))
-		goto out_advance;
-
 	error = xfs_iget(mp, tp, ino,
 			 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
 			 XFS_ILOCK_SHARED, &ip);
@@ -97,8 +103,28 @@ xfs_bulkstat_one_int(
 	vfsuid = i_uid_into_vfsuid(idmap, inode);
 	vfsgid = i_gid_into_vfsgid(idmap, inode);
 
+	/*
+	 * If caller wants files from the metadata directories, push out the
+	 * bare minimum information for enabling scrub.
+	 */
+	if (want_metadir_file(ip, bc->breq)) {
+		memset(buf, 0, sizeof(*buf));
+		buf->bs_ino = ino;
+		buf->bs_gen = inode->i_generation;
+		buf->bs_mode = inode->i_mode & S_IFMT;
+		xfs_bulkstat_health(ip, buf);
+		buf->bs_version = XFS_BULKSTAT_VERSION_V5;
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		xfs_irele(ip);
+
+		error = bc->formatter(bc->breq, buf);
+		if (!error || error == -ECANCELED)
+			goto out_advance;
+		goto out;
+	}
+
 	/* If this is a private inode, don't leak its details to userspace. */
-	if (IS_PRIVATE(inode)) {
+	if (IS_PRIVATE(inode) || xfs_internal_inum(mp, ino)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		xfs_irele(ip);
 		error = -EINVAL;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1659f13f17a89..f10e8f8f23351 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -22,6 +22,9 @@ struct xfs_ibulk {
 /* Fill out the bs_extents64 field if set. */
 #define XFS_IBULK_NREXT64	(1U << 1)
 
+/* Signal that we can return metadata directories. */
+#define XFS_IBULK_METADIR	(1U << 2)
+
 /*
  * Advance the user buffer pointer by one record of the given size.  If the
  * buffer is now full, return the appropriate error code.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/32] xfs: enable creation of dynamically allocated metadir path structures
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 21:10   ` [PATCH 18/32] xfs: allow bulkstat to return metadata directories Darrick J. Wong
@ 2023-12-31 21:11   ` Darrick J. Wong
  2023-12-31 21:11   ` [PATCH 20/32] xfs: don't count metadata directory files to quota Darrick J. Wong
                     ` (12 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:11 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a few helper functions so that it's possible to allocate
xfs_imeta_path objects dynamically, along with dynamically allocated
path components.  Eventually we're going to want to support paths of the
form "/realtime/$rtgroup.rmap", and this is necessary for that.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_imeta.c |   46 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h |   15 +++++++++++++++
 2 files changed, 61 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index e3651c7bba2b3..10249a1c36c73 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -1015,3 +1015,49 @@ xfs_imeta_mount(
 
 	return 0;
 }
+
+/* Create a path to a file within the metadata directory tree. */
+int
+xfs_imeta_create_file_path(
+	struct xfs_mount	*mp,
+	unsigned int		nr_components,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*p;
+	unsigned char		**components;
+
+	p = kzalloc(sizeof(struct xfs_imeta_path), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	components = kvcalloc(nr_components, sizeof(unsigned char *),
+			GFP_KERNEL);
+	if (!components) {
+		kfree(p);
+		return -ENOMEM;
+	}
+
+	p->im_depth = nr_components;
+	p->im_path = (const unsigned char **)components;
+	p->im_ftype = XFS_DIR3_FT_REG_FILE;
+	*pathp = p;
+	return 0;
+}
+
+/* Free a metadata directory tree path. */
+void
+xfs_imeta_free_path(
+	const struct xfs_imeta_path	*path)
+{
+	unsigned int			i;
+
+	if (path->im_flags & XFS_IMETA_PATH_STATIC)
+		return;
+
+	for (i = 0; i < path->im_depth; i++) {
+		if ((path->im_dynamicmask & (1ULL << i)) && path->im_path[i])
+			kfree(path->im_path[i]);
+	}
+	kfree(path->im_path);
+	kfree(path);
+}
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index b8e360bbdfbe1..3b5953efc013c 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -15,6 +15,8 @@ const struct xfs_imeta_path name = { \
 	.im_path = (path), \
 	.im_ftype = XFS_DIR3_FT_REG_FILE, \
 	.im_depth = ARRAY_SIZE(path), \
+	.im_flags = XFS_IMETA_PATH_STATIC, \
+	.im_dynamicmask = 0, \
 }
 
 /* Key for looking up metadata inodes. */
@@ -22,6 +24,12 @@ struct xfs_imeta_path {
 	/* Array of string pointers. */
 	const unsigned char	**im_path;
 
+	/* Each bit corresponds to an element of im_path needing to be freed */
+	unsigned long long	im_dynamicmask;
+
+	/* XFS_IMETA_* path flags */
+	uint16_t		im_flags;
+
 	/* Number of strings in path. */
 	uint8_t			im_depth;
 
@@ -29,6 +37,9 @@ struct xfs_imeta_path {
 	uint8_t			im_ftype;
 };
 
+/* Path is statically allocated. */
+#define XFS_IMETA_PATH_STATIC	(1U << 0)
+
 /* Cleanup widget for metadata inode creation and deletion. */
 struct xfs_imeta_update {
 	struct xfs_mount	*mp;
@@ -72,6 +83,10 @@ int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
 int xfs_imeta_dir_parent(struct xfs_trans *tp,
 		const struct xfs_imeta_path *path, struct xfs_inode **dpp);
 
+int xfs_imeta_create_file_path(struct xfs_mount *mp,
+		unsigned int nr_components, struct xfs_imeta_path **pathp);
+void xfs_imeta_free_path(const struct xfs_imeta_path *path);
+
 void xfs_imeta_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 void xfs_imeta_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/32] xfs: don't count metadata directory files to quota
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 21:11   ` [PATCH 19/32] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
@ 2023-12-31 21:11   ` Darrick J. Wong
  2023-12-31 21:11   ` [PATCH 21/32] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
                     ` (11 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:11 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Files in the metadata directory tree are internal to the filesystem.
Don't count the inodes or the blocks they use in the root dquot because
users do not need to know about their resource usage.  This will also
quiet down complaints about dquot usage not matching du output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_dquot.c       |    1 +
 fs/xfs/xfs_qm.c          |   11 +++++++++++
 fs/xfs/xfs_quota.h       |    5 +++++
 fs/xfs/xfs_trans_dquot.c |    6 ++++++
 4 files changed, 23 insertions(+)


diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bc1893a4b6738..55b2bc1d7d5db 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -979,6 +979,7 @@ xfs_qm_dqget_inode(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(xfs_inode_dquot(ip, type) == NULL);
+	ASSERT(!xfs_is_metadir_inode(ip));
 
 	id = xfs_qm_id_for_quotatype(ip, type);
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 1d28f0982840c..a6b5193190c4c 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -305,6 +305,8 @@ xfs_qm_need_dqattach(
 		return false;
 	if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return false;
+	if (xfs_is_metadir_inode(ip))
+		return false;
 	return true;
 }
 
@@ -327,6 +329,7 @@ xfs_qm_dqattach_locked(
 		return 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!xfs_is_metadir_inode(ip));
 
 	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
 		error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -1253,6 +1256,10 @@ xfs_qm_dqusage_adjust(
 		}
 	}
 
+	/* Metadata directory files are not accounted to user-visible quotas. */
+	if (xfs_is_metadir_inode(ip))
+		goto error0;
+
 	ASSERT(ip->i_delayed_blks == 0);
 
 	if (XFS_IS_REALTIME_INODE(ip)) {
@@ -1775,6 +1782,8 @@ xfs_qm_vop_dqalloc(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return 0;
 
+	ASSERT(!xfs_is_metadir_inode(ip));
+
 	lockflags = XFS_ILOCK_EXCL;
 	xfs_ilock(ip, lockflags);
 
@@ -1904,6 +1913,7 @@ xfs_qm_vop_chown(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
+	ASSERT(!xfs_is_metadir_inode(ip));
 
 	/* old dquot */
 	prevdq = *IO_olddq;
@@ -1991,6 +2001,7 @@ xfs_qm_vop_create_dqattach(
 		return;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!xfs_is_metadir_inode(ip));
 
 	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fe63489d91b2f..55320c9ff1367 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -29,6 +29,11 @@ struct xfs_buf;
 	 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
 	 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
 
+#define XFS_IS_DQDETACHED(mp, ip) \
+	((ip)->i_udquot == NULL || \
+	 (ip)->i_gdquot == NULL || \
+	 (ip)->i_pdquot == NULL)
+
 #define XFS_QM_NEED_QUOTACHECK(mp) \
 	((XFS_IS_UQUOTA_ON(mp) && \
 		(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 833a65be05705..6983e35b7c2b7 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -156,6 +156,8 @@ xfs_trans_mod_ino_dquot(
 	unsigned int			field,
 	int64_t				delta)
 {
+	ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(mp, ip));
+
 	xfs_trans_mod_dquot(tp, dqp, field, delta);
 
 	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
@@ -236,6 +238,8 @@ xfs_trans_mod_dquot_byino(
 	    xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return;
 
+	ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(mp, ip));
+
 	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
 		xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
 	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
@@ -951,6 +955,8 @@ xfs_trans_reserve_quota_nblks(
 
 	if (!XFS_IS_QUOTA_ON(mp))
 		return 0;
+	if (xfs_is_metadir_inode(ip))
+		return 0;
 
 	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/32] xfs: adjust xfs_bmap_add_attrfork for metadir
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 21:11   ` [PATCH 20/32] xfs: don't count metadata directory files to quota Darrick J. Wong
@ 2023-12-31 21:11   ` Darrick J. Wong
  2023-12-31 21:11   ` [PATCH 22/32] xfs: record health problems with the metadata directory Darrick J. Wong
                     ` (10 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:11 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Online repair might use the xfs_bmap_add_attrfork to repair a file in
the metadata directory tree if (say) the metadata file lacks the correct
parent pointers.  In that case, it is not correct to check that the file
is dqattached -- metadata files must be not have /any/ dquot attached at
all.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_attr.c |    5 ++++-
 fs/xfs/libxfs/xfs_bmap.c |    5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 9cefaeca8f854..9cf3e31b98f89 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -946,7 +946,10 @@ xfs_attr_add_fork(
 	unsigned int		blks;		/* space reservation */
 	int			error;		/* error return value */
 
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	if (xfs_is_metadir_inode(ip))
+		ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	else
+		ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
 	blks = XFS_ADDAFORK_SPACE_RES(mp);
 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index dd0229963ad97..6d4e4861b4f5c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1024,7 +1024,10 @@ xfs_bmap_add_attrfork(
 	int			error;		/* error return value */
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	if (xfs_is_metadir_inode(ip))
+		ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	else
+		ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 	ASSERT(!xfs_inode_has_attr_fork(ip));
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/32] xfs: record health problems with the metadata directory
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 21:11   ` [PATCH 21/32] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
@ 2023-12-31 21:11   ` Darrick J. Wong
  2023-12-31 21:12   ` [PATCH 23/32] xfs: do not count metadata directory files when doing online quotacheck Darrick J. Wong
                     ` (9 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:11 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make a report to the health monitoring subsystem any time we encounter
something in the metadata directory tree that looks like corruption.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h     |    1 +
 fs/xfs/libxfs/xfs_health.h |    4 +++-
 fs/xfs/libxfs/xfs_imeta.c  |   24 +++++++++++++++++++-----
 fs/xfs/xfs_health.c        |    1 +
 fs/xfs/xfs_icache.c        |    1 +
 fs/xfs/xfs_imeta_utils.c   |    5 ++++-
 fs/xfs/xfs_inode.c         |    1 +
 7 files changed, 30 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ab961a0118157..952e4fc93c4cf 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -197,6 +197,7 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_RT_SUMMARY	(1 << 5)  /* realtime summary */
 #define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
 #define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
+#define XFS_FSOP_GEOM_SICK_METADIR	(1 << 8)  /* metadata directory */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index bca1990f71da8..d9b9968607f12 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -60,6 +60,7 @@ struct xfs_da_args;
 #define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
 #define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
 #define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
+#define XFS_SICK_FS_METADIR	(1 << 6)  /* metadata directory tree */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -103,7 +104,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_GQUOTA | \
 				 XFS_SICK_FS_PQUOTA | \
 				 XFS_SICK_FS_QUOTACHECK | \
-				 XFS_SICK_FS_NLINKS)
+				 XFS_SICK_FS_NLINKS | \
+				 XFS_SICK_FS_METADIR)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 10249a1c36c73..57be23f89d864 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -26,6 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
+#include "xfs_health.h"
 
 /*
  * Metadata File Management
@@ -406,16 +407,22 @@ xfs_imeta_dir_lookup_component(
 	int			type_wanted = xname->type;
 	int			error;
 
-	if (!S_ISDIR(VFS_I(dp)->i_mode))
+	if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_imeta_dir_lookup(tp, dp, xname, ino);
 	if (error)
 		return error;
-	if (!xfs_verify_ino(dp->i_mount, *ino))
+	if (!xfs_verify_ino(dp->i_mount, *ino)) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
-	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted)
+	}
+	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	trace_xfs_imeta_dir_lookup(dp, xname, *ino);
 	return 0;
@@ -729,6 +736,7 @@ xfs_imeta_dir_unlink(
 	/* Metadata directory root cannot be unlinked. */
 	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
 		ASSERT(0);
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
 	}
 
@@ -744,6 +752,7 @@ xfs_imeta_dir_unlink(
 			error = -ENOENT;
 		break;
 	case -ENOENT:
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		error = -EFSCORRUPTED;
 		break;
 	}
@@ -788,6 +797,7 @@ xfs_imeta_dir_link(
 	/* Metadata directory root cannot be linked. */
 	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
 		ASSERT(0);
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
 	}
 
@@ -853,16 +863,20 @@ xfs_imeta_lookup(
 
 	if (xfs_has_metadir(mp)) {
 		error = xfs_imeta_dir_lookup_int(tp, path, &ino);
-		if (error == -ENOENT)
+		if (error == -ENOENT) {
+			xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 			return -EFSCORRUPTED;
+		}
 	} else {
 		error = xfs_imeta_sb_lookup(mp, path, &ino);
 	}
 	if (error)
 		return error;
 
-	if (!xfs_imeta_verify(mp, ino))
+	if (!xfs_imeta_verify(mp, ino)) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	*inop = ino;
 	return 0;
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 15ebfe331f277..09094b271e54e 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -360,6 +360,7 @@ static const struct ioctl_sick_map fs_map[] = {
 	{ XFS_SICK_FS_PQUOTA,	XFS_FSOP_GEOM_SICK_PQUOTA },
 	{ XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
 	{ XFS_SICK_FS_NLINKS,	XFS_FSOP_GEOM_SICK_NLINKS },
+	{ XFS_SICK_FS_METADIR,	XFS_FSOP_GEOM_SICK_METADIR },
 	{ 0, 0 },
 };
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 64c52e8621fdb..27cef2922d206 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -850,6 +850,7 @@ xfs_imeta_iget(
 	xfs_irele(ip);
 whine:
 	xfs_err(mp, "metadata inode 0x%llx is corrupt", ino);
+	xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 	return -EFSCORRUPTED;
 }
 
diff --git a/fs/xfs/xfs_imeta_utils.c b/fs/xfs/xfs_imeta_utils.c
index 9fbaa4323e3b2..e871d5b52e7be 100644
--- a/fs/xfs/xfs_imeta_utils.c
+++ b/fs/xfs/xfs_imeta_utils.c
@@ -23,6 +23,7 @@
 #include "xfs_imeta_utils.h"
 #include "xfs_trace.h"
 #include "xfs_parent.h"
+#include "xfs_health.h"
 
 /* Initialize a metadata update structure. */
 static inline int
@@ -52,8 +53,10 @@ xfs_imeta_init(
 		return error;
 	error = xfs_imeta_dir_parent(tp, upd->path, &upd->dp);
 	xfs_trans_cancel(tp);
-	if (error == -ENOENT)
+	if (error == -ENOENT) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 66aa8bbcb1045..322fa538aec4b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -582,6 +582,7 @@ xfs_lookup(
 	 * a metadata file.
 	 */
 	if (XFS_IS_CORRUPT(dp->i_mount, xfs_is_metadir_inode(*ipp))) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		error = -EFSCORRUPTED;
 		goto out_irele;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/32] xfs: do not count metadata directory files when doing online quotacheck
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 21:11   ` [PATCH 22/32] xfs: record health problems with the metadata directory Darrick J. Wong
@ 2023-12-31 21:12   ` Darrick J. Wong
  2023-12-31 21:12   ` [PATCH 24/32] xfs: scrub shouldn't tag metadata files with attr forks if metadir and parent pointers are on Darrick J. Wong
                     ` (8 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:12 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Previously, we stated that files in the metadata directory tree are not
counted in the dquot information.  Fix the online quotacheck code to
reflect this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/quotacheck.c |    7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index a2f728d76fab2..812a0f91458e9 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -398,10 +398,13 @@ xqcheck_collect_inode(
 	bool			isreg = S_ISREG(VFS_I(ip)->i_mode);
 	int			error = 0;
 
-	if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+	if (xfs_is_metadir_inode(ip) ||
+	    xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
 		/*
 		 * Quota files are never counted towards quota, so we do not
-		 * need to take the lock.
+		 * need to take the lock.  Files do not switch between the
+		 * metadata and regular directory trees without a reallocation,
+		 * so we do not need to ILOCK them either.
 		 */
 		xchk_iscan_mark_visited(&xqc->iscan, ip);
 		return 0;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/32] xfs: scrub shouldn't tag metadata files with attr forks if metadir and parent pointers are on
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 21:12   ` [PATCH 23/32] xfs: do not count metadata directory files when doing online quotacheck Darrick J. Wong
@ 2023-12-31 21:12   ` Darrick J. Wong
  2023-12-31 21:12   ` [PATCH 25/32] xfs: scrub metadata directories Darrick J. Wong
                     ` (7 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:12 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If metadata directory trees and parent pointers are both enabled, it's
ok for metadata files to have extended attribute forks.  Don't flag
these in online checking, and don't remove it during online repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/common.c |    8 ++++++--
 fs/xfs/scrub/repair.c |    8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index fbc28bd98e957..5a6681c6647c3 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1248,8 +1248,12 @@ xchk_metadata_inode_forks(
 		return 0;
 	}
 
-	/* They also should never have extended attributes. */
-	if (xfs_inode_hasattr(sc->ip)) {
+	/*
+	 * Metadata files can only have extended attributes if parent pointers
+	 * and the metadata directory tree are enabled.
+	 */
+	if (xfs_inode_hasattr(sc->ip) &&
+	    !(xfs_has_parent(sc->mp) && xfs_has_metadir(sc->mp))) {
 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
 		return 0;
 	}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index b15eee680510c..12cbc14b24810 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1100,8 +1100,12 @@ xrep_metadata_inode_forks(
 			return error;
 	}
 
-	/* Clear the attr forks since metadata shouldn't have that. */
-	if (xfs_inode_hasattr(sc->ip)) {
+	/*
+	 * Clear the attr forks since metadata shouldn't have one unless
+	 * parent pointers and the metadata directory tree are enabled.
+	 */
+	if (xfs_inode_hasattr(sc->ip) &&
+	    !(xfs_has_parent(sc->mp) && xfs_has_metadir(sc->mp))) {
 		if (!dirty) {
 			dirty = true;
 			xfs_trans_ijoin(sc->tp, sc->ip, 0);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/32] xfs: scrub metadata directories
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 21:12   ` [PATCH 24/32] xfs: scrub shouldn't tag metadata files with attr forks if metadir and parent pointers are on Darrick J. Wong
@ 2023-12-31 21:12   ` Darrick J. Wong
  2023-12-31 21:13   ` [PATCH 26/32] xfs: teach nlink scrubber to deal with metadata directory roots Darrick J. Wong
                     ` (6 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:12 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach online scrub about the metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/dir.c        |    8 ++++++++
 fs/xfs/scrub/dir_repair.c |    6 ++++++
 fs/xfs/scrub/dirtree.c    |   22 ++++++++++++++++++----
 fs/xfs/scrub/dirtree.h    |    2 ++
 fs/xfs/scrub/findparent.c |   37 +++++++++++++++++++++++++++++++------
 fs/xfs/scrub/parent.c     |   30 +++++++++++++++++++++++-------
 fs/xfs/scrub/trace.h      |    1 +
 7 files changed, 89 insertions(+), 17 deletions(-)


diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cfaddde6a34d6..b49ca2d5a160b 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -102,6 +102,14 @@ xchk_dir_check_ftype(
 
 	if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+	/*
+	 * Metadata and regular inodes cannot cross trees.  This property
+	 * cannot change without a full inode free and realloc cycle, so it's
+	 * safe to check this without holding locks.
+	 */
+	if (xfs_is_metadir_inode(ip) ^ xfs_is_metadir_inode(sc->ip))
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 }
 
 /*
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index c66838fc144d5..304eb042df7c9 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -440,6 +440,12 @@ xrep_dir_salvage_entry(
 	if (error)
 		return 0;
 
+	/* Don't mix metadata and regular directory trees. */
+	if (xfs_is_metadir_inode(ip) ^ xfs_is_metadir_inode(rd->sc->ip)) {
+		xchk_irele(sc, ip);
+		return 0;
+	}
+
 	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
 	xchk_irele(sc, ip);
 
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
index 53fd89e48cfc6..085561b7a1dcc 100644
--- a/fs/xfs/scrub/dirtree.c
+++ b/fs/xfs/scrub/dirtree.c
@@ -350,7 +350,8 @@ xchk_dirpath_set_outcome(
 STATIC int
 xchk_dirpath_step_up(
 	struct xchk_dirtree	*dl,
-	struct xchk_dirpath	*path)
+	struct xchk_dirpath	*path,
+	bool			is_metadir)
 {
 	struct xfs_scrub	*sc = dl->sc;
 	struct xfs_inode	*dp;
@@ -423,6 +424,14 @@ xchk_dirpath_step_up(
 		goto out_scanlock;
 	}
 
+	/* Parent must be in the same directory tree. */
+	if (is_metadir != xfs_is_metadir_inode(dp)) {
+		trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr,
+				path->nr_steps, &dl->pptr);
+		error = -EFSCORRUPTED;
+		goto out_scanlock;
+	}
+
 	/*
 	 * If the extended attributes look as though they has been zapped by
 	 * the inode record repair code, we cannot scan for parent pointers.
@@ -492,6 +501,7 @@ xchk_dirpath_walk_upwards(
 	struct xchk_dirpath	*path)
 {
 	struct xfs_scrub	*sc = dl->sc;
+	bool			is_metadir;
 	int			error;
 
 	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
@@ -521,6 +531,7 @@ xchk_dirpath_walk_upwards(
 	 * ILOCK state is no longer tracked in the scrub context.  Hence we
 	 * must drop @sc->ip's ILOCK during the walk.
 	 */
+	is_metadir = xfs_is_metadir_inode(sc->ip);
 	mutex_unlock(&dl->lock);
 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
 
@@ -530,7 +541,7 @@ xchk_dirpath_walk_upwards(
 	 * If we see any kind of error here (including corruptions), the parent
 	 * pointer of @sc->ip is corrupt.  Stop the whole scan.
 	 */
-	error = xchk_dirpath_step_up(dl, path);
+	error = xchk_dirpath_step_up(dl, path, is_metadir);
 	if (error) {
 		xchk_ilock(sc, XFS_ILOCK_EXCL);
 		mutex_lock(&dl->lock);
@@ -543,7 +554,7 @@ xchk_dirpath_walk_upwards(
 	 * *somewhere* in the path, but we don't need to stop scanning.
 	 */
 	while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
-		error = xchk_dirpath_step_up(dl, path);
+		error = xchk_dirpath_step_up(dl, path, is_metadir);
 
 	/* Retake the locks we had, mark paths, etc. */
 	xchk_ilock(sc, XFS_ILOCK_EXCL);
@@ -870,7 +881,10 @@ xchk_dirtree(
 	 * scan, because the hook doesn't detach until after sc->ip gets
 	 * released during teardown.
 	 */
-	dl->root_ino = sc->mp->m_rootip->i_ino;
+	if (xfs_is_metadir_inode(sc->ip))
+		dl->root_ino = sc->mp->m_metadirip->i_ino;
+	else
+		dl->root_ino = sc->mp->m_rootip->i_ino;
 	dl->scan_ino = sc->ip->i_ino;
 
 	trace_xchk_dirtree_start(sc->ip, sc->sm, 0);
diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h
index a5dca42906e0e..86c438c20fe7d 100644
--- a/fs/xfs/scrub/dirtree.h
+++ b/fs/xfs/scrub/dirtree.h
@@ -156,6 +156,8 @@ xchk_dirtree_parentless(const struct xchk_dirtree *dl)
 
 	if (sc->ip == sc->mp->m_rootip)
 		return true;
+	if (sc->ip == sc->mp->m_metadirip)
+		return true;
 	if (VFS_I(sc->ip)->i_nlink == 0)
 		return true;
 	return false;
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
index ceb76b26c6cd1..55c83cb3bfd4e 100644
--- a/fs/xfs/scrub/findparent.c
+++ b/fs/xfs/scrub/findparent.c
@@ -172,6 +172,10 @@ xrep_findparent_walk_directory(
 	 */
 	lock_mode = xfs_ilock_data_map_shared(dp);
 
+	/* Don't mix metadata and regular directory trees. */
+	if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip))
+		goto out_unlock;
+
 	/*
 	 * If this directory is known to be sick, we cannot scan it reliably
 	 * and must abort.
@@ -360,15 +364,30 @@ xrep_findparent_confirm(
 	};
 	int			error;
 
-	/*
-	 * The root directory always points to itself.  Unlinked dirs can point
-	 * anywhere, so we point them at the root dir too.
-	 */
-	if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+	/* The root directory always points to itself. */
+	if (sc->ip == sc->mp->m_rootip) {
 		*parent_ino = sc->mp->m_sb.sb_rootino;
 		return 0;
 	}
 
+	/* The metadata root directory always points to itself. */
+	if (sc->ip == sc->mp->m_metadirip) {
+		*parent_ino = sc->mp->m_sb.sb_metadirino;
+		return 0;
+	}
+
+	/*
+	 * Unlinked dirs can point anywhere, so we point them at the root dir
+	 * of whichever tree is appropriate.
+	 */
+	if (VFS_I(sc->ip)->i_nlink == 0) {
+		if (xfs_is_metadir_inode(sc->ip))
+			*parent_ino = sc->mp->m_sb.sb_metadirino;
+		else
+			*parent_ino = sc->mp->m_sb.sb_rootino;
+		return 0;
+	}
+
 	/* Reject garbage parent inode numbers and self-referential parents. */
 	if (*parent_ino == NULLFSINO)
 	       return 0;
@@ -410,8 +429,14 @@ xrep_findparent_self_reference(
 	if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
 		return sc->mp->m_sb.sb_rootino;
 
-	if (VFS_I(sc->ip)->i_nlink == 0)
+	if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino)
+		return sc->mp->m_sb.sb_metadirino;
+
+	if (VFS_I(sc->ip)->i_nlink == 0) {
+		if (xfs_is_metadir_inode(sc->ip))
+			return sc->mp->m_sb.sb_metadirino;
 		return sc->mp->m_sb.sb_rootino;
+	}
 
 	return NULLFSINO;
 }
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 6af212540c631..8c7ce2b9dc324 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -132,6 +132,14 @@ xchk_parent_validate(
 		return 0;
 	}
 
+	/* Is this the metadata root dir?  Then '..' must point to itself. */
+	if (sc->ip == mp->m_metadirip) {
+		if (sc->ip->i_ino != mp->m_sb.sb_metadirino ||
+		    sc->ip->i_ino != parent_ino)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return 0;
+	}
+
 	/* '..' must not point to ourselves. */
 	if (sc->ip->i_ino == parent_ino) {
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
@@ -185,6 +193,12 @@ xchk_parent_validate(
 		goto out_unlock;
 	}
 
+	/* Metadata and regular inodes cannot cross trees. */
+	if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) {
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		goto out_unlock;
+	}
+
 	/* Look for a directory entry in the parent pointing to the child. */
 	error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -287,7 +301,7 @@ xchk_parent_pptr_and_dotdot(
 	}
 
 	/* Is this the root dir?  Then '..' must point to itself. */
-	if (sc->ip == sc->mp->m_rootip) {
+	if (sc->ip == sc->mp->m_rootip || sc->ip == sc->mp->m_metadirip) {
 		if (sc->ip->i_ino != pp->parent_ino)
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
 		return 0;
@@ -721,7 +735,8 @@ xchk_parent_count_pptrs(
 	}
 
 	if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
-		if (sc->ip == sc->mp->m_rootip)
+		if (sc->ip == sc->mp->m_rootip ||
+		    sc->ip == sc->mp->m_metadirip)
 			pp->pptrs_found++;
 
 		if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
@@ -911,15 +926,16 @@ xchk_pptr_looks_zapped(
 	 * of a parent pointer scan is always the empty set.  It's safe to scan
 	 * them even if the attr fork was zapped.
 	 */
-	if (ip == mp->m_rootip)
+	if (ip == mp->m_rootip || ip == mp->m_metadirip)
 		return false;
 
 	/*
-	 * Metadata inodes are all rooted in the superblock and do not have
-	 * any parents.  Hence the attr fork will not be initialized, but
-	 * there are no parent pointers that might have been zapped.
+	 * Prior to metadata directories, all metadata inodes are rooted in the
+	 * superblock and do not have any parents.  Hence the attr fork will
+	 * not be initialized, but there are no parent pointers that might have
+	 * been zapped.
 	 */
-	if (xfs_is_metadata_inode(ip))
+	if (!xfs_has_metadir(mp) && xfs_is_metadata_inode(ip))
 		return false;
 
 	/*
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index cc2af405cd3a7..83ad76a18b2c0 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1763,6 +1763,7 @@ DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
 DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
 DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
 DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree);
 
 TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
 TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/32] xfs: teach nlink scrubber to deal with metadata directory roots
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 21:12   ` [PATCH 25/32] xfs: scrub metadata directories Darrick J. Wong
@ 2023-12-31 21:13   ` Darrick J. Wong
  2023-12-31 21:13   ` [PATCH 27/32] xfs: don't check secondary super inode pointers when metadir enabled Darrick J. Wong
                     ` (5 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:13 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enhance the inode link count online fsck code alter their behavior when
they detect metadata directory tree roots, just like they do for the
regular root directory.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/nlinks.c        |   12 +++++++-----
 fs/xfs/scrub/nlinks_repair.c |    2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 4e62e287e1590..46785e0712377 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -275,7 +275,7 @@ xchk_nlinks_collect_dirent(
 	 * determine the backref count.
 	 */
 	if (dotdot) {
-		if (dp == sc->mp->m_rootip)
+		if (dp == sc->mp->m_rootip || dp == sc->mp->m_metadirip)
 			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
 		else if (!xfs_has_parent(sc->mp))
 			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
@@ -516,9 +516,11 @@ xchk_nlinks_collect(
 	int			error;
 
 	/* Count the rt and quota files that are rooted in the superblock. */
-	error = xchk_nlinks_collect_metafiles(xnc);
-	if (error)
-		return error;
+	if (!xfs_has_metadir(sc->mp)) {
+		error = xchk_nlinks_collect_metafiles(xnc);
+		if (error)
+			return error;
+	}
 
 	/*
 	 * Set up for a potentially lengthy filesystem scan by reducing our
@@ -716,7 +718,7 @@ xchk_nlinks_compare_inode(
 		}
 	}
 
-	if (ip == sc->mp->m_rootip) {
+	if (ip == sc->mp->m_rootip || ip == sc->mp->m_metadirip) {
 		/*
 		 * For the root of a directory tree, both the '.' and '..'
 		 * entries should point to the root directory.  The dotdot
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index fb299b23d5f1d..b2f1cec7d5778 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -64,7 +64,7 @@ xrep_nlinks_is_orphaned(
 
 	if (obs->parents != 0)
 		return false;
-	if (ip == mp->m_rootip || ip == sc->orphanage)
+	if (ip == mp->m_rootip || ip == sc->orphanage || ip == mp->m_metadirip)
 		return false;
 	return actual_nlink != 0;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/32] xfs: don't check secondary super inode pointers when metadir enabled
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 21:13   ` [PATCH 26/32] xfs: teach nlink scrubber to deal with metadata directory roots Darrick J. Wong
@ 2023-12-31 21:13   ` Darrick J. Wong
  2023-12-31 21:13   ` [PATCH 28/32] xfs: move repair temporary files to the metadata directory tree Darrick J. Wong
                     ` (4 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:13 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When metadata directories are enabled, the rt and quota inodes are no
longer pointed to by the superblock, so it doesn't make sense to check
these.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/agheader.c |   29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 7fd89889d1d81..b98d014f65521 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -144,11 +144,16 @@ xchk_superblock(
 	if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
 		xchk_block_set_preen(sc, bp);
 
-	if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
-		xchk_block_set_preen(sc, bp);
+	if (xfs_has_metadir(sc->mp)) {
+		if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_metadirino))
+			xchk_block_set_preen(sc, bp);
+	} else {
+		if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+			xchk_block_set_preen(sc, bp);
 
-	if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
-		xchk_block_set_preen(sc, bp);
+		if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+			xchk_block_set_preen(sc, bp);
+	}
 
 	if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
 		xchk_block_set_corrupt(sc, bp);
@@ -225,11 +230,13 @@ xchk_superblock(
 	 * sb_icount, sb_ifree, sb_fdblocks, sb_frexents
 	 */
 
-	if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
-		xchk_block_set_preen(sc, bp);
+	if (!xfs_has_metadir(sc->mp)) {
+		if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+			xchk_block_set_preen(sc, bp);
 
-	if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
-		xchk_block_set_preen(sc, bp);
+		if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+			xchk_block_set_preen(sc, bp);
+	}
 
 	/*
 	 * Skip the quota flags since repair will force quotacheck.
@@ -338,8 +345,10 @@ xchk_superblock(
 		if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
 			xchk_block_set_corrupt(sc, bp);
 
-		if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
-			xchk_block_set_preen(sc, bp);
+		if (!xfs_has_metadir(sc->mp)) {
+			if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+				xchk_block_set_preen(sc, bp);
+		}
 
 		/* Don't care about sb_lsn */
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/32] xfs: move repair temporary files to the metadata directory tree
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 21:13   ` [PATCH 27/32] xfs: don't check secondary super inode pointers when metadir enabled Darrick J. Wong
@ 2023-12-31 21:13   ` Darrick J. Wong
  2023-12-31 21:13   ` [PATCH 29/32] xfs: check metadata directory file path connectivity Darrick J. Wong
                     ` (3 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:13 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Due to resource acquisition rules, we have to create the ondisk
temporary files used to stage a filesystem repair before we can acquire
a reference to the inode that we actually want to repair.  Therefore,
we do not know at tempfile creation time whether the tempfile will
belong to the regular directory tree or the metadata directory tree.

This distinction becomes important when the swapext code tries to figure
out the quota accounting of the two files whose mappings are being
swapped.  The swapext code assumes that accounting updates are required
for a file if dqattach attaches dquots.  Metadir files are never
accounted in quota, which means that swapext must not update the quota
accounting when swapping in a repaired directory/xattr/rtbitmap structure.

Prior to the swapext call, therefore, both files must be marked as
METADIR for dqattach so that dqattach will ignore them.  Add support for
a repair tempfile to be switched to the metadir tree and switched back
before being released so that ifree will just free the file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/common.c   |    5 ++
 fs/xfs/scrub/tempfile.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/tempfile.h |    3 +
 3 files changed, 105 insertions(+)


diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 5a6681c6647c3..c6501101c925b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -38,6 +38,7 @@
 #include "scrub/trace.h"
 #include "scrub/repair.h"
 #include "scrub/health.h"
+#include "scrub/tempfile.h"
 
 /* Common code for the metadata scrubbers. */
 
@@ -1093,6 +1094,10 @@ xchk_setup_inode_contents(
 	if (error)
 		return error;
 
+	error = xrep_tempfile_adjust_directory_tree(sc);
+	if (error)
+		return error;
+
 	/* Lock the inode so the VFS cannot touch this file. */
 	xchk_ilock(sc, XFS_IOLOCK_EXCL);
 
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 583bbd5a02bc9..1dd7c4c5cff0f 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -22,6 +22,7 @@
 #include "xfs_swapext.h"
 #include "xfs_defer.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_imeta.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
@@ -173,6 +174,101 @@ xrep_tempfile_create(
 	return error;
 }
 
+/*
+ * Temporary files have to be created before we even know which inode we're
+ * going to scrub, so we assume that they will be part of the regular directory
+ * tree.  If it turns out that we're actually scrubbing a file from the
+ * metadata directory tree, we have to subtract the temp file from the root
+ * dquots and detach the dquots.
+ */
+int
+xrep_tempfile_adjust_directory_tree(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	if (!sc->tempip)
+		return 0;
+
+	ASSERT(sc->tp == NULL);
+	ASSERT(!xfs_is_metadir_inode(sc->tempip));
+
+	if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
+		return 0;
+
+	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		goto out_iolock;
+
+	xrep_tempfile_ilock(sc);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+	/* Metadir files are not accounted in quota, so drop icount */
+	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
+	xfs_imeta_set_iflag(sc->tp, sc->tempip);
+
+	error = xrep_trans_commit(sc);
+	if (error)
+		goto out_ilock;
+
+	xfs_qm_dqdetach(sc->tempip);
+out_ilock:
+	xrep_tempfile_iunlock(sc);
+out_iolock:
+	xrep_tempfile_iounlock(sc);
+	return error;
+}
+
+/*
+ * Remove this temporary file from the metadata directory tree so that it can
+ * be inactivated the normal way.
+ */
+STATIC int
+xrep_tempfile_remove_metadir(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
+		return 0;
+
+	ASSERT(sc->tp == NULL);
+
+	xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		goto out_iolock;
+
+	xrep_tempfile_ilock(sc);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+	xfs_imeta_clear_iflag(sc->tp, sc->tempip);
+
+	/* Non-metadir files are accounted in quota, so bump bcount/icount */
+	error = xfs_qm_dqattach_locked(sc->tempip, false);
+	if (error)
+		goto out_cancel;
+
+	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
+	xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
+			sc->tempip->i_nblocks);
+	error = xrep_trans_commit(sc);
+	goto out_ilock;
+
+out_cancel:
+	xchk_trans_cancel(sc);
+out_ilock:
+	xrep_tempfile_iunlock(sc);
+out_iolock:
+	xrep_tempfile_iounlock(sc);
+	return error;
+}
+
 /* Take IOLOCK_EXCL on the temporary file, maybe. */
 bool
 xrep_tempfile_iolock_nowait(
@@ -281,6 +377,7 @@ xrep_tempfile_rele(
 		sc->temp_ilock_flags = 0;
 	}
 
+	xrep_tempfile_remove_metadir(sc);
 	xchk_irele(sc, sc->tempip);
 	sc->tempip = NULL;
 }
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
index e51399f595fe9..71c1b54599c30 100644
--- a/fs/xfs/scrub/tempfile.h
+++ b/fs/xfs/scrub/tempfile.h
@@ -10,6 +10,8 @@
 int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
 void xrep_tempfile_rele(struct xfs_scrub *sc);
 
+int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc);
+
 bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
 int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
 void xrep_tempfile_iounlock(struct xfs_scrub *sc);
@@ -42,6 +44,7 @@ static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
 	xchk_ilock(sc, XFS_IOLOCK_EXCL);
 }
 # define xrep_is_tempfile(ip)		(false)
+# define xrep_tempfile_adjust_directory_tree(sc)	(0)
 # define xrep_tempfile_rele(sc)
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/32] xfs: check metadata directory file path connectivity
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-31 21:13   ` [PATCH 28/32] xfs: move repair temporary files to the metadata directory tree Darrick J. Wong
@ 2023-12-31 21:13   ` Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 30/32] xfs: repair " Darrick J. Wong
                     ` (2 subsequent siblings)
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:13 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new scrubber type that checks that well known metadata
directory paths are connected to the metadata inode that the incore
structures think is in use.  IOWs, check that "/quota/user" in the
metadata directory tree actually points to
mp->m_quotainfo->qi_uquotaip->i_ino.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile            |    1 
 fs/xfs/libxfs/xfs_fs.h     |   17 +++
 fs/xfs/libxfs/xfs_health.h |    4 +
 fs/xfs/scrub/common.h      |    1 
 fs/xfs/scrub/health.c      |    1 
 fs/xfs/scrub/metapath.c    |  250 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c       |    9 ++
 fs/xfs/scrub/scrub.h       |    2 
 fs/xfs/scrub/stats.c       |    1 
 fs/xfs/scrub/trace.c       |    1 
 fs/xfs/scrub/trace.h       |   45 ++++++++
 fs/xfs/xfs_health.c        |    1 
 12 files changed, 330 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/scrub/metapath.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 29cf5a5f8104a..5362a0fb56d77 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -173,6 +173,7 @@ xfs-y				+= $(addprefix scrub/, \
 				   inode.o \
 				   iscan.o \
 				   listxattr.o \
+				   metapath.o \
 				   nlinks.o \
 				   parent.o \
 				   readdir.o \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 952e4fc93c4cf..a0efcbde5ae60 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -198,6 +198,7 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
 #define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
 #define XFS_FSOP_GEOM_SICK_METADIR	(1 << 8)  /* metadata directory */
+#define XFS_FSOP_GEOM_SICK_METAPATH	(1 << 9)  /* metadir tree path */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
@@ -731,9 +732,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_NLINKS	26	/* inode link counts */
 #define XFS_SCRUB_TYPE_HEALTHY	27	/* everything checked out ok */
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
+#define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	29
+#define XFS_SCRUB_TYPE_NR	30
 
 /*
  * This special type code only applies to the vectored scrub implementation.
@@ -788,6 +790,19 @@ struct xfs_scrub_metadata {
 				 XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
 #define XFS_SCRUB_FLAGS_ALL	(XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
 
+/*
+ * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for
+ * path checking.
+ */
+#define XFS_SCRUB_METAPATH_RTBITMAP	0
+#define XFS_SCRUB_METAPATH_RTSUMMARY	1
+#define XFS_SCRUB_METAPATH_USRQUOTA	2
+#define XFS_SCRUB_METAPATH_GRPQUOTA	3
+#define XFS_SCRUB_METAPATH_PRJQUOTA	4
+
+/* Number of metapath sm_ino values */
+#define XFS_SCRUB_METAPATH_NR		5
+
 /*
  * ioctl limits
  */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index d9b9968607f12..1816c67351ac8 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -61,6 +61,7 @@ struct xfs_da_args;
 #define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
 #define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
 #define XFS_SICK_FS_METADIR	(1 << 6)  /* metadata directory tree */
+#define XFS_SICK_FS_METAPATH	(1 << 7)  /* metadata directory tree path */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -105,7 +106,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_PQUOTA | \
 				 XFS_SICK_FS_QUOTACHECK | \
 				 XFS_SICK_FS_NLINKS | \
-				 XFS_SICK_FS_METADIR)
+				 XFS_SICK_FS_METADIR | \
+				 XFS_SICK_FS_METAPATH)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index a90c82c18e3c9..11a893e4b228e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -68,6 +68,7 @@ int xchk_setup_xattr(struct xfs_scrub *sc);
 int xchk_setup_symlink(struct xfs_scrub *sc);
 int xchk_setup_parent(struct xfs_scrub *sc);
 int xchk_setup_dirtree(struct xfs_scrub *sc);
+int xchk_setup_metapath(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 int xchk_setup_rtbitmap(struct xfs_scrub *sc);
 int xchk_setup_rtsummary(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 12f3e9fca727f..4aae9a594cce5 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -111,6 +111,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_QUOTACHECK]	= { XHG_FS,  XFS_SICK_FS_QUOTACHECK },
 	[XFS_SCRUB_TYPE_NLINKS]		= { XHG_FS,  XFS_SICK_FS_NLINKS },
 	[XFS_SCRUB_TYPE_DIRTREE]	= { XHG_INO, XFS_SICK_INO_DIRTREE },
+	[XFS_SCRUB_TYPE_METAPATH]	= { XHG_FS,  XFS_SICK_FS_METAPATH },
 };
 
 /* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
new file mode 100644
index 0000000000000..fac957de282c3
--- /dev/null
+++ b/fs/xfs/scrub/metapath.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_imeta.h"
+#include "xfs_imeta_utils.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dir2.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+
+/*
+ * Metadata Directory Tree Paths
+ * =============================
+ *
+ * A filesystem with metadir enabled expects to find metadata structures
+ * attached to files that are accessible by walking a path down the metadata
+ * directory tree.  Given the metadir path and the incore inode storing the
+ * metadata, this scrubber ensures that the ondisk metadir path points to the
+ * ondisk inode represented by the incore inode.
+ */
+
+struct xchk_metapath {
+	struct xfs_scrub		*sc;
+
+	/* Name for lookup */
+	struct xfs_name			xname;
+
+	/* Path for this metadata file */
+	const struct xfs_imeta_path	*path;
+
+	/* Directory parent of the metadata file. */
+	struct xfs_inode		*dp;
+
+	/* Locks held on dp */
+	unsigned int			dp_ilock_flags;
+};
+
+/* Release resources tracked in the buffer. */
+STATIC void
+xchk_metapath_cleanup(
+	void			*buf)
+{
+	struct xchk_metapath	*mpath = buf;
+	struct xfs_scrub	*sc = mpath->sc;
+
+	if (mpath->dp) {
+		if (mpath->dp_ilock_flags)
+			xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
+		xchk_irele(sc, mpath->dp);
+	}
+	if (mpath->path)
+		xfs_imeta_free_path(mpath->path);
+}
+
+int
+xchk_setup_metapath(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_metapath	*mpath;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	if (!xfs_has_metadir(mp))
+		return -ENOENT;
+	if (sc->sm->sm_gen || sc->sm->sm_agno)
+		return -EINVAL;
+
+	mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
+	if (!mpath)
+		return -ENOMEM;
+	mpath->sc = sc;
+	sc->buf = mpath;
+	sc->buf_cleanup = xchk_metapath_cleanup;
+
+	/* Select the metadir path and the metadata file. */
+	switch (sc->sm->sm_ino) {
+	case XFS_SCRUB_METAPATH_RTBITMAP:
+		mpath->path = &XFS_IMETA_RTBITMAP;
+		ip = mp->m_rbmip;
+		break;
+	case XFS_SCRUB_METAPATH_RTSUMMARY:
+		mpath->path = &XFS_IMETA_RTSUMMARY;
+		ip = mp->m_rsumip;
+		break;
+	case XFS_SCRUB_METAPATH_USRQUOTA:
+		mpath->path = &XFS_IMETA_USRQUOTA;
+		if (XFS_IS_UQUOTA_ON(mp))
+			ip = xfs_quota_inode(mp, XFS_DQTYPE_USER);
+		break;
+	case XFS_SCRUB_METAPATH_GRPQUOTA:
+		mpath->path = &XFS_IMETA_GRPQUOTA;
+		if (XFS_IS_GQUOTA_ON(mp))
+			ip = xfs_quota_inode(mp, XFS_DQTYPE_GROUP);
+		break;
+	case XFS_SCRUB_METAPATH_PRJQUOTA:
+		mpath->path = &XFS_IMETA_PRJQUOTA;
+		if (XFS_IS_PQUOTA_ON(mp))
+			ip = xfs_quota_inode(mp, XFS_DQTYPE_PROJ);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (mpath->path->im_depth < 1) {
+		/* Not supposed to have any zero-length paths */
+		ASSERT(mpath->path->im_depth >= 1);
+		return -EFSCORRUPTED;
+	}
+
+	if (!ip)
+		return -ENOENT;
+
+	error = xchk_install_live_inode(sc, ip);
+	if (error)
+		return error;
+
+	mpath->xname.name = mpath->path->im_path[mpath->path->im_depth - 1];
+	mpath->xname.len = strlen(mpath->xname.name);
+	mpath->xname.type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode);
+	return 0;
+}
+
+/*
+ * Try to attach the parent metadata directory to the scrub context.  Returns
+ * true if a parent is attached.
+ */
+STATIC bool
+xchk_metapath_try_attach_parent(
+	struct xchk_metapath	*mpath)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+
+	if (mpath->dp)
+		return true;
+
+	/*
+	 * Grab the parent we just ensured.  If the parent itself is corrupt
+	 * enough that xfs_iget fails, someone else will have to fix it for us.
+	 * That someone are the inode and scrubbers, invoked on the parent dir
+	 * via handle.
+	 */
+	xfs_imeta_dir_parent(sc->tp, mpath->path, &mpath->dp);
+
+	trace_xchk_metapath_try_attach_parent(sc, mpath->path, mpath->dp,
+			NULLFSINO);
+	return mpath->dp != NULL;
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and child.  We do not know
+ * that the metadata directory is not corrupt, so we lock the parent and try
+ * to lock the child.  Returns 0 if successful, or -EINTR to abort the scrub.
+ */
+STATIC int
+xchk_metapath_ilock_both(
+	struct xchk_metapath	*mpath)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+	int			error = 0;
+
+	while (true) {
+		xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+		if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) {
+			mpath->dp_ilock_flags |= XFS_ILOCK_EXCL;
+			return 0;
+		}
+		xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		delay(1);
+	}
+
+	ASSERT(0);
+	return -EINTR;
+}
+
+/* Unlock parent and child inodes. */
+static inline void
+xchk_metapath_iunlock(
+	struct xchk_metapath	*mpath)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL;
+	xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+}
+
+int
+xchk_metapath(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_metapath	*mpath = sc->buf;
+	xfs_ino_t		ino = NULLFSINO;
+	int			error;
+
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	/* Grab the parent; if we can't, it's corrupt. */
+	if (!xchk_metapath_try_attach_parent(mpath)) {
+		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+		goto out_cancel;
+	}
+
+	error = xchk_metapath_ilock_both(mpath);
+	if (error)
+		goto out_cancel;
+
+	/* Make sure the parent dir has a dirent pointing to this file. */
+	error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+	trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+	if (error == -ENOENT) {
+		/* No directory entry at all */
+		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+		error = 0;
+		goto out_ilock;
+	}
+	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+		goto out_ilock;
+	if (ino != sc->ip->i_ino) {
+		/* Pointing to wrong inode */
+		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+	}
+
+out_ilock:
+	xchk_metapath_iunlock(mpath);
+out_cancel:
+	xchk_trans_cancel(sc);
+	return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7b70dfb30287b..07d3ed91259ce 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -447,6 +447,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.has	= xfs_has_parent,
 		.repair	= xrep_dirtree,
 	},
+	[XFS_SCRUB_TYPE_METAPATH] = {	/* metadata directory tree path */
+		.type	= ST_GENERIC,
+		.setup	= xchk_setup_metapath,
+		.scrub	= xchk_metapath,
+		.has	= xfs_has_metadir,
+		.repair	= xrep_notsupported,
+	},
 };
 
 static int
@@ -494,6 +501,8 @@ xchk_validate_inputs(
 		if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
 			goto out;
 		break;
+	case ST_GENERIC:
+		break;
 	default:
 		goto out;
 	}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index ecd7aff7a48bf..b48199d8940b8 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -66,6 +66,7 @@ enum xchk_type {
 	ST_PERAG,	/* per-AG metadata */
 	ST_FS,		/* per-FS metadata */
 	ST_INODE,	/* per-inode metadata */
+	ST_GENERIC,	/* determined by the scrubber */
 };
 
 struct xchk_meta_ops {
@@ -250,6 +251,7 @@ int xchk_xattr(struct xfs_scrub *sc);
 int xchk_symlink(struct xfs_scrub *sc);
 int xchk_parent(struct xfs_scrub *sc);
 int xchk_dirtree(struct xfs_scrub *sc);
+int xchk_metapath(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 int xchk_rtbitmap(struct xfs_scrub *sc);
 int xchk_rtsummary(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index fd92df6389ac8..2e576c601b7dc 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -80,6 +80,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_QUOTACHECK]	= "quotacheck",
 	[XFS_SCRUB_TYPE_NLINKS]		= "nlinks",
 	[XFS_SCRUB_TYPE_DIRTREE]	= "dirtree",
+	[XFS_SCRUB_TYPE_METAPATH]	= "metapath",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 994a910eead80..06f7b89920e94 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -21,6 +21,7 @@
 #include "xfs_dir2.h"
 #include "xfs_rmap.h"
 #include "xfs_parent.h"
+#include "xfs_imeta.h"
 #include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 83ad76a18b2c0..f7a0f4de23a55 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -32,6 +32,7 @@ struct xfs_parent_name_irec;
 enum xchk_dirpath_outcome;
 struct xchk_dirtree;
 struct xchk_dirtree_outcomes;
+struct xfs_imeta_path;
 
 /*
  * ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -81,6 +82,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -112,7 +114,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
 	{ XFS_SCRUB_TYPE_NLINKS,	"nlinks" }, \
 	{ XFS_SCRUB_TYPE_HEALTHY,	"healthy" }, \
 	{ XFS_SCRUB_TYPE_DIRTREE,	"dirtree" }, \
-	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }
+	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }, \
+	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \
@@ -1926,6 +1929,46 @@ TRACE_EVENT(xchk_dirtree_live_update,
 		  __get_str(name))
 );
 
+DECLARE_EVENT_CLASS(xchk_metapath_class,
+	TP_PROTO(struct xfs_scrub *sc, const struct xfs_imeta_path *path,
+		 struct xfs_inode *dp, xfs_ino_t ino),
+	TP_ARGS(sc, path, dp, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, scrub_ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, strlen(path->im_path[path->im_depth - 1]))
+	),
+	TP_fast_assign(
+		const unsigned char *p;
+
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO;
+		__entry->parent_ino = dp ? dp->i_ino : NULLFSINO;
+		__entry->ino = ino;
+
+		p = path->im_path[path->im_depth - 1];
+		__entry->namelen = strlen(p);
+		memcpy(__get_str(name), p, __entry->namelen);
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%.*s' ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->scrub_ino,
+		  __entry->parent_ino,
+		  __entry->namelen,
+		  __get_str(name),
+		  __entry->ino)
+);
+#define DEFINE_XCHK_METAPATH_EVENT(name) \
+DEFINE_EVENT(xchk_metapath_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, const struct xfs_imeta_path *path, \
+		 struct xfs_inode *dp, xfs_ino_t ino), \
+	TP_ARGS(sc, path, dp, ino))
+DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_try_attach_parent);
+DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup);
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 09094b271e54e..b7aa33a4c9e06 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -361,6 +361,7 @@ static const struct ioctl_sick_map fs_map[] = {
 	{ XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
 	{ XFS_SICK_FS_NLINKS,	XFS_FSOP_GEOM_SICK_NLINKS },
 	{ XFS_SICK_FS_METADIR,	XFS_FSOP_GEOM_SICK_METADIR },
+	{ XFS_SICK_FS_METAPATH,	XFS_FSOP_GEOM_SICK_METAPATH },
 	{ 0, 0 },
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/32] xfs: repair metadata directory file path connectivity
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-31 21:13   ` [PATCH 29/32] xfs: check metadata directory file path connectivity Darrick J. Wong
@ 2023-12-31 21:14   ` Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 31/32] xfs: fix up repair functions for metadata directory roots Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 32/32] xfs: enable metadata directory feature Darrick J. Wong
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:14 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Fix disconnected or incorrect metadata directory paths.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/metapath.c |  367 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h   |    3 
 fs/xfs/scrub/scrub.c    |    2 
 fs/xfs/scrub/trace.h    |    5 +
 4 files changed, 376 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index fac957de282c3..5a669a1a8ad17 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -17,10 +17,15 @@
 #include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_attr.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/readdir.h"
+#include "scrub/repair.h"
 
 /*
  * Metadata Directory Tree Paths
@@ -39,6 +44,9 @@ struct xchk_metapath {
 	/* Name for lookup */
 	struct xfs_name			xname;
 
+	/* Directory update for repairs */
+	struct xfs_dir_update		du;
+
 	/* Path for this metadata file */
 	const struct xfs_imeta_path	*path;
 
@@ -47,6 +55,18 @@ struct xchk_metapath {
 
 	/* Locks held on dp */
 	unsigned int			dp_ilock_flags;
+
+	/* Transaction block reservations */
+	unsigned int			link_resblks;
+	unsigned int			unlink_resblks;
+
+	/* Parent pointer updates */
+	struct xfs_parent_args		link_ppargs;
+	struct xfs_parent_args		unlink_ppargs;
+
+	/* Scratchpads for removing links */
+	struct xfs_parent_name_irec	pptr;
+	struct xfs_parent_scratch	scratch;
 };
 
 /* Release resources tracked in the buffer. */
@@ -248,3 +268,350 @@ xchk_metapath(
 	xchk_trans_cancel(sc);
 	return error;
 }
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Create the dirent represented by the final component of the path. */
+STATIC int
+xrep_metapath_link(
+	struct xchk_metapath	*mpath)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+
+	mpath->du.dp = mpath->dp;
+	mpath->du.name = &mpath->xname;
+	mpath->du.ip = sc->ip;
+
+	if (xfs_has_parent(sc->mp))
+		mpath->du.ppargs = &mpath->link_ppargs;
+	else
+		mpath->du.ppargs = NULL;
+
+	trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino);
+
+	return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du);
+}
+
+/* Remove the dirent at the final component of the path. */
+STATIC int
+xrep_metapath_unlink(
+	struct xchk_metapath	*mpath,
+	xfs_ino_t		ino,
+	struct xfs_inode	*ip)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+	struct xfs_mount	*mp = sc->mp;
+	int			error;
+
+	trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino);
+
+	if (!ip) {
+		/* The child inode isn't allocated.  Junk the dirent. */
+		xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE);
+		return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname,
+				ino, mpath->unlink_resblks);
+	}
+
+	mpath->du.dp = mpath->dp;
+	mpath->du.name = &mpath->xname;
+	mpath->du.ip = ip;
+	mpath->du.ppargs = NULL;
+
+	/* Figure out if we're removing a parent pointer too. */
+	if (xfs_has_parent(mp)) {
+		mpath->pptr.p_ino = ip->i_ino;
+		mpath->pptr.p_gen = VFS_I(ip)->i_generation;
+		mpath->pptr.p_namelen = mpath->xname.len;
+		memcpy(mpath->pptr.p_name, mpath->xname.name, mpath->xname.len);
+		xfs_parent_irec_hashname(mp, &mpath->pptr);
+
+		error = xfs_parent_lookup(sc->tp, ip, &mpath->pptr,
+				&mpath->scratch);
+		switch (error) {
+		case -ENOATTR:
+			break;
+		case 0:
+			mpath->du.ppargs = &mpath->unlink_ppargs;
+			break;
+		default:
+			return error;
+		}
+	}
+
+	return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du);
+}
+
+/*
+ * Try to create a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @sc->ip.  Returns:
+ *
+ * -EEXIST and an @alleged_child if the dirent that points to the wrong inode;
+ * 0 if there is now a dirent pointing to @sc->ip; or
+ * A negative errno on error.
+ */
+STATIC int
+xrep_metapath_try_link(
+	struct xchk_metapath	*mpath,
+	xfs_ino_t		*alleged_child)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+	xfs_ino_t		ino;
+	int			error;
+
+	/* Allocate transaction, lock inodes, join to transaction. */
+	error = xchk_trans_alloc(sc, mpath->link_resblks);
+	if (error)
+		return error;
+
+	error = xchk_metapath_ilock_both(mpath);
+	if (error) {
+		xchk_trans_cancel(sc);
+		return error;
+	}
+	xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+	trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+	if (error == -ENOENT) {
+		/*
+		 * There is no dirent in the directory.  Create an entry
+		 * pointing to @sc->ip.
+		 */
+		error = xrep_metapath_link(mpath);
+		if (error)
+			goto out_cancel;
+
+		error = xrep_trans_commit(sc);
+		xchk_metapath_iunlock(mpath);
+		return error;
+	}
+	if (error)
+		goto out_cancel;
+
+	if (ino == sc->ip->i_ino) {
+		/* The dirent already points to @sc->ip; we're done. */
+		error = 0;
+		goto out_cancel;
+	}
+
+	/*
+	 * The dirent points elsewhere; pass that back so that the caller
+	 * can try to remove the dirent.
+	 */
+	*alleged_child = ino;
+	error = -EEXIST;
+
+out_cancel:
+	xchk_trans_cancel(sc);
+	xchk_metapath_iunlock(mpath);
+	return error;
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and a bad child, if one is
+ * supplied.  We do not know that the metadata directory is not corrupt, so we
+ * lock the parent and try to lock the child.  Returns 0 if successful, or
+ * -EINTR to abort the repair.  The lock state of @dp is not recorded in @mpath.
+ */
+STATIC int
+xchk_metapath_ilock_parent_and_child(
+	struct xchk_metapath	*mpath,
+	struct xfs_inode	*ip)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+	int			error = 0;
+
+	while (true) {
+		xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+		if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+			return 0;
+		xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		delay(1);
+	}
+
+	ASSERT(0);
+	return -EINTR;
+}
+
+/*
+ * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @alleged_child.  Returns:
+ *
+ * 0 if there is no longer a dirent;
+ * -EEXIST if the dirent points to @sc->ip;
+ * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or
+ * A negative errno for any other error.
+ */
+STATIC int
+xrep_metapath_try_unlink(
+	struct xchk_metapath	*mpath,
+	xfs_ino_t		*alleged_child)
+{
+	struct xfs_scrub	*sc = mpath->sc;
+	struct xfs_inode	*ip = NULL;
+	xfs_ino_t		ino;
+	int			error;
+
+	ASSERT(*alleged_child != sc->ip->i_ino);
+
+	trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp,
+			*alleged_child);
+
+	/*
+	 * Allocate transaction, grab the alleged child inode, lock inodes,
+	 * join to transaction.
+	 */
+	error = xchk_trans_alloc(sc, mpath->unlink_resblks);
+	if (error)
+		return error;
+
+	error = xchk_iget(sc, *alleged_child, &ip);
+	if (error == -EINVAL || error == -ENOENT) {
+		/* inode number is bogus, junk the dirent */
+		error = 0;
+	}
+	if (error) {
+		xchk_trans_cancel(sc);
+		return error;
+	}
+
+	error = xchk_metapath_ilock_parent_and_child(mpath, ip);
+	if (error) {
+		xchk_trans_cancel(sc);
+		return error;
+	}
+	xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+	if (ip)
+		xfs_trans_ijoin(sc->tp, ip, 0);
+
+	error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+	trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+	if (error == -ENOENT) {
+		/*
+		 * There is no dirent in the directory anymore.  We're ready to
+		 * try the link operation again.
+		 */
+		error = 0;
+		goto out_cancel;
+	}
+	if (error)
+		goto out_cancel;
+
+	if (ino == sc->ip->i_ino) {
+		/* The dirent already points to @sc->ip; we're done. */
+		error = -EEXIST;
+		goto out_cancel;
+	}
+
+	/*
+	 * The dirent does not point to the alleged child.  Update the caller
+	 * and signal that we want to be called again.
+	 */
+	if (ino != *alleged_child) {
+		*alleged_child = ino;
+		error = -EAGAIN;
+		goto out_cancel;
+	}
+
+	/* Remove the link to the child. */
+	error = xrep_metapath_unlink(mpath, ino, ip);
+	if (error)
+		goto out_cancel;
+
+	error = xrep_trans_commit(sc);
+	goto out_unlock;
+
+out_cancel:
+	xchk_trans_cancel(sc);
+out_unlock:
+	xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+	if (ip) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xchk_irele(sc, ip);
+	}
+	return error;
+}
+
+/*
+ * Make sure the metadata directory path points to the child being examined.
+ *
+ * Repair needs to be able to create a directory structure, create its own
+ * transactions, and take ILOCKs.  This function /must/ be called after all
+ * other repairs have completed.
+ */
+int
+xrep_metapath(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_metapath	*mpath = sc->buf;
+	struct xfs_mount	*mp = sc->mp;
+	int			error = 0;
+
+	/*
+	 * Make sure the directory path exists all the way down to where the
+	 * parent pointer should be.
+	 */
+	error = xfs_imeta_ensure_dirpath(sc->mp, mpath->path);
+	if (error)
+		return error;
+
+	/* Make sure the parent is attached now. */
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+	if (!xchk_metapath_try_attach_parent(mpath)) {
+		xchk_trans_cancel(sc);
+		return -EFSCORRUPTED;
+	}
+	xchk_trans_cancel(sc);
+
+	/*
+	 * Make sure the child file actually has an attr fork to receive a new
+	 * parent pointer if the fs has parent pointers.
+	 */
+	if (xfs_has_parent(mp)) {
+		error = xfs_attr_add_fork(sc->ip,
+				sizeof(struct xfs_attr_sf_hdr), 1);
+		if (error)
+			return error;
+	}
+
+	/* Compute block reservation required to unlink and link a file. */
+	mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN);
+	mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN);
+
+	/* Set up parent pointer tracking. */
+	xfs_parent_args_init(mp, &mpath->link_ppargs);
+	xfs_parent_args_init(mp, &mpath->unlink_ppargs);
+
+	do {
+		xfs_ino_t	alleged_child;
+
+		/* Re-establish the link, or tell us which inode to remove. */
+		error = xrep_metapath_try_link(mpath, &alleged_child);
+		if (!error)
+			return 0;
+		if (error != -EEXIST)
+			return error;
+
+		/*
+		 * Remove an incorrect link to an alleged child, or tell us
+		 * which inode to remove.
+		 */
+		do {
+			error = xrep_metapath_try_unlink(mpath, &alleged_child);
+		} while (error == -EAGAIN);
+		if (error == -EEXIST) {
+			/* Link established; we're done. */
+			error = 0;
+			break;
+		}
+	} while (!error);
+
+	return error;
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 22bce1def9393..a0780ccdd9ab6 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -134,6 +134,7 @@ int xrep_directory(struct xfs_scrub *sc);
 int xrep_parent(struct xfs_scrub *sc);
 int xrep_symlink(struct xfs_scrub *sc);
 int xrep_dirtree(struct xfs_scrub *sc);
+int xrep_metapath(struct xfs_scrub *sc);
 
 #ifdef CONFIG_XFS_RT
 int xrep_rtbitmap(struct xfs_scrub *sc);
@@ -208,6 +209,7 @@ xrep_setup_nothing(
 #define xrep_setup_parent		xrep_setup_nothing
 #define xrep_setup_nlinks		xrep_setup_nothing
 #define xrep_setup_dirtree		xrep_setup_nothing
+#define xrep_setup_metapath		xrep_setup_nothing
 
 #define xrep_setup_inode(sc, imap)	((void)0)
 
@@ -243,6 +245,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
 #define xrep_parent			xrep_notsupported
 #define xrep_symlink			xrep_notsupported
 #define xrep_dirtree			xrep_notsupported
+#define xrep_metapath			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 07d3ed91259ce..1c3b140b02419 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -452,7 +452,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_metapath,
 		.scrub	= xchk_metapath,
 		.has	= xfs_has_metadir,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_metapath,
 	},
 };
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index f7a0f4de23a55..038a4f8dda5a0 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -3740,6 +3740,11 @@ DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path);
 DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption);
 DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate);
 
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/32] xfs: fix up repair functions for metadata directory roots
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-31 21:14   ` [PATCH 30/32] xfs: repair " Darrick J. Wong
@ 2023-12-31 21:14   ` Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 32/32] xfs: enable metadata directory feature Darrick J. Wong
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:14 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Amend the directory and parent pointer repair code to handle the root of
the metadata directory tree correctly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/dir_repair.c    |    2 +-
 fs/xfs/scrub/inode_repair.c  |    8 ++++----
 fs/xfs/scrub/orphanage.c     |    2 ++
 fs/xfs/scrub/parent_repair.c |    2 +-
 4 files changed, 8 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 304eb042df7c9..a6e2219941be7 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -1324,7 +1324,7 @@ xrep_dir_scan_dirtree(
 	int			error;
 
 	/* Roots of directory trees are their own parents. */
-	if (sc->ip == sc->mp->m_rootip)
+	if (sc->ip == sc->mp->m_rootip || sc->ip == sc->mp->m_metadirip)
 		xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
 
 	/*
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index e2dde5834051c..2c4209054f41b 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1718,14 +1718,14 @@ xrep_inode_pptr(
 		return 0;
 
 	/* The root directory doesn't have a parent pointer. */
-	if (ip == mp->m_rootip)
+	if (ip == mp->m_rootip || ip == mp->m_metadirip)
 		return 0;
 
 	/*
-	 * Metadata inodes are rooted in the superblock and do not have any
-	 * parents.
+	 * Prior to metadata directories, all metadata inodes are rooted in the
+	 * superblock and do not have any parents.
 	 */
-	if (xfs_is_metadata_inode(ip))
+	if (!xfs_has_metadir(mp) && xfs_is_metadata_inode(ip))
 		return 0;
 
 	/* Inode already has an attr fork; no further work possible here. */
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index f1711d6031613..1c8f854aaf552 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -297,6 +297,8 @@ xrep_orphanage_can_adopt(
 		return false;
 	if (xfs_internal_inum(sc->mp, sc->ip->i_ino))
 		return false;
+	if (xfs_is_metadata_inode(sc->ip))
+		return false;
 	return true;
 }
 
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 04ea0b05ed088..65158dff840bb 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -1345,7 +1345,7 @@ xrep_parent_rebuild_pptrs(
 	 * so that we can decide if we're moving this file to the orphanage.
 	 * For this purpose, root directories are their own parents.
 	 */
-	if (sc->ip == sc->mp->m_rootip) {
+	if (sc->ip == sc->mp->m_rootip || sc->ip == sc->mp->m_metadirip) {
 		xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino);
 	} else {
 		error = xrep_parent_lookup_pptrs(sc, &parent_ino);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/32] xfs: enable metadata directory feature
  2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-31 21:14   ` [PATCH 31/32] xfs: fix up repair functions for metadata directory roots Darrick J. Wong
@ 2023-12-31 21:14   ` Darrick J. Wong
  31 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:14 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable the metadata directory feature.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 7596b9286988e..0636ca97622dd 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -398,7 +398,8 @@ xfs_sb_has_ro_compat_feature(
 		 XFS_SB_FEAT_INCOMPAT_BIGTIME| \
 		 XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
 		 XFS_SB_FEAT_INCOMPAT_NREXT64| \
-		 XFS_SB_FEAT_INCOMPAT_PARENT)
+		 XFS_SB_FEAT_INCOMPAT_PARENT | \
+		 XFS_SB_FEAT_INCOMPAT_METADIR)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/4] xfs: refactor realtime scrubbing context management
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
@ 2023-12-31 21:14   ` Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 2/4] xfs: use separate lock classes for realtime metadata inode ILOCKs Darrick J. Wong
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:14 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a pair of helpers to deal with setting up the necessary incore
context to check metadata records against the realtime metadata.  Right
now this is limited to locking the realtime bitmap and summary inodes,
but as we add rmap and reflink to the realtime device this will grow to
include btree cursors.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap.c      |    2 +
 fs/xfs/scrub/common.c    |   69 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/common.h    |   18 ++++++++++++
 fs/xfs/scrub/rtbitmap.c  |    7 +----
 fs/xfs/scrub/rtsummary.c |   12 ++------
 fs/xfs/scrub/scrub.c     |    1 +
 fs/xfs/scrub/scrub.h     |    9 ++++++
 7 files changed, 104 insertions(+), 14 deletions(-)


diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 24a15bf784f11..b90f0d8540ba5 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -314,8 +314,10 @@ xchk_bmap_rt_iextent_xref(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
+	xchk_rt_init(info->sc, &info->sc->sr, XCHK_RTLOCK_BITMAP_SHARED);
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
+	xchk_rt_unlock(info->sc, &info->sc->sr);
 }
 
 /* Cross-reference a single datadev extent record. */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index c6501101c925b..4ad58192f2e3d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -679,6 +679,75 @@ xchk_ag_init(
 	return 0;
 }
 
+/*
+ * For scrubbing a realtime file, grab all the in-core resources we'll need to
+ * check the realtime metadata, which means taking the ILOCK of the realtime
+ * metadata inodes.  Callers must not join these inodes to the transaction
+ * with non-zero lockflags or concurrency problems will result.  The
+ * @rtlock_flags argument takes XCHK_RTLOCK_* flags because scrub has somewhat
+ * unusual locking requirements.
+ */
+void
+xchk_rt_init(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr,
+	unsigned int		rtlock_flags)
+{
+	ASSERT(!(rtlock_flags & ~XCHK_RTLOCK_ALL));
+	ASSERT(hweight32(rtlock_flags & (XCHK_RTLOCK_BITMAP |
+					 XCHK_RTLOCK_BITMAP_SHARED)) < 2);
+	ASSERT(hweight32(rtlock_flags & (XCHK_RTLOCK_SUMMARY |
+					 XCHK_RTLOCK_SUMMARY_SHARED)) < 2);
+
+	if (rtlock_flags & XCHK_RTLOCK_BITMAP)
+		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	else if (rtlock_flags & XCHK_RTLOCK_BITMAP_SHARED)
+		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+
+	if (rtlock_flags & XCHK_RTLOCK_SUMMARY)
+		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	else if (rtlock_flags & XCHK_RTLOCK_SUMMARY_SHARED)
+		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+
+	sr->rtlock_flags = rtlock_flags;
+}
+
+/*
+ * Unlock the realtime metadata inodes.  This must be done /after/ committing
+ * (or cancelling) the scrub transaction.
+ */
+void
+xchk_rt_unlock(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr)
+{
+	if (!sr->rtlock_flags)
+		return;
+
+	if (sr->rtlock_flags & XCHK_RTLOCK_SUMMARY)
+		xfs_iunlock(sc->mp->m_rsumip, XFS_ILOCK_EXCL);
+	else if (sr->rtlock_flags & XCHK_RTLOCK_SUMMARY_SHARED)
+		xfs_iunlock(sc->mp->m_rsumip, XFS_ILOCK_SHARED);
+
+	if (sr->rtlock_flags & XCHK_RTLOCK_BITMAP)
+		xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_EXCL);
+	else if (sr->rtlock_flags & XCHK_RTLOCK_BITMAP_SHARED)
+		xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED);
+
+	sr->rtlock_flags = 0;
+}
+
+/* Drop only the shared rt bitmap lock. */
+void
+xchk_rt_unlock_rtbitmap(
+	struct xfs_scrub	*sc)
+{
+	ASSERT(sc->sr.rtlock_flags & XCHK_RTLOCK_BITMAP_SHARED);
+
+	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	sc->sr.rtlock_flags &= ~XCHK_RTLOCK_BITMAP_SHARED;
+}
+
 /* Per-scrubber setup functions */
 
 void
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 11a893e4b228e..007d293a06d52 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -129,6 +129,24 @@ xchk_ag_init_existing(
 	return error == -ENOENT ? -EFSCORRUPTED : error;
 }
 
+/* Lock the rt bitmap in exclusive mode */
+#define XCHK_RTLOCK_BITMAP		(1U << 31)
+/* Lock the rt bitmap in shared mode */
+#define XCHK_RTLOCK_BITMAP_SHARED	(1U << 30)
+/* Lock the rt summary in exclusive mode */
+#define XCHK_RTLOCK_SUMMARY		(1U << 29)
+/* Lock the rt summary in shared mode */
+#define XCHK_RTLOCK_SUMMARY_SHARED	(1U << 28)
+
+#define XCHK_RTLOCK_ALL		(XCHK_RTLOCK_BITMAP | \
+				 XCHK_RTLOCK_BITMAP_SHARED | \
+				 XCHK_RTLOCK_SUMMARY | \
+				 XCHK_RTLOCK_SUMMARY_SHARED)
+
+void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr,
+		unsigned int xchk_rtlock_flags);
+void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
+void xchk_rt_unlock_rtbitmap(struct xfs_scrub *sc);
 int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		struct xchk_ag *sa);
 void xchk_ag_btcur_free(struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca99776527..c018376256a86 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -52,7 +52,7 @@ xchk_setup_rtbitmap(
 	if (error)
 		return error;
 
-	xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	xchk_rt_init(sc, &sc->sr, XCHK_RTLOCK_BITMAP);
 
 	/*
 	 * Now that we've locked the rtbitmap, we can't race with growfsrt
@@ -216,13 +216,10 @@ xchk_xref_is_used_rt_space(
 
 	startext = xfs_rtb_to_rtx(sc->mp, rtbno);
 	endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
-	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
 	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
 			endext - startext + 1, &is_free);
 	if (!xchk_should_check_xref(sc, &error, NULL))
-		goto out_unlock;
+		return;
 	if (is_free)
 		xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
-out_unlock:
-	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
 }
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 5d1622203c8a9..829036b50b0c1 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -79,14 +79,8 @@ xchk_setup_rtsummary(
 	if (error)
 		return error;
 
-	/*
-	 * Locking order requires us to take the rtbitmap first.  We must be
-	 * careful to unlock it ourselves when we are done with the rtbitmap
-	 * file since the scrub infrastructure won't do that for us.  Only
-	 * then we can lock the rtsummary inode.
-	 */
-	xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
-	xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	xchk_rt_init(sc, &sc->sr,
+			XCHK_RTLOCK_SUMMARY | XCHK_RTLOCK_BITMAP_SHARED);
 
 	/*
 	 * Now that we've locked the rtbitmap and rtsummary, we can't race with
@@ -365,6 +359,6 @@ xchk_rtsummary(
 	 * that order, so we're still protected against allocation activities
 	 * even if we continue on to the repair function.
 	 */
-	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xchk_rt_unlock_rtbitmap(sc);
 	return error;
 }
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 1c3b140b02419..5d551433b3233 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -230,6 +230,7 @@ xchk_teardown(
 			xfs_trans_cancel(sc->tp);
 		sc->tp = NULL;
 	}
+	xchk_rt_unlock(sc, &sc->sr);
 	if (sc->ip) {
 		if (sc->ilock_flags)
 			xchk_iunlock(sc, sc->ilock_flags);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index b48199d8940b8..1671b6aa48081 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -111,6 +111,12 @@ struct xchk_ag {
 	struct xfs_btree_cur	*refc_cur;
 };
 
+/* Inode lock state for the RT volume. */
+struct xchk_rt {
+	/* XCHK_RTLOCK_* lock state */
+	unsigned int		rtlock_flags;
+};
+
 struct xfs_scrub {
 	/* General scrub state. */
 	struct xfs_mount		*mp;
@@ -172,6 +178,9 @@ struct xfs_scrub {
 
 	/* State tracking for single-AG operations. */
 	struct xchk_ag			sa;
+
+	/* State tracking for realtime operations. */
+	struct xchk_rt			sr;
 };
 
 /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/4] xfs: use separate lock classes for realtime metadata inode ILOCKs
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 1/4] xfs: refactor realtime scrubbing context management Darrick J. Wong
@ 2023-12-31 21:15   ` Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 3/4] xfs: refactor realtime inode locking Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 4/4] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:15 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Realtime metadata files are not quite regular files because userspace
can't access the realtime bitmap directly, and because we take the ILOCK
of the rt bitmap file while holding the ILOCK of a realtime file.  The
double nature of inodes confuses lockdep, so up until now we've created
lockdep subclasses to help lockdep keep things straight.

We've gotten away with using lockdep subclasses because there's only two
rt metadata files, but with the coming addition of realtime rmap and
refcounting, we'd need two more subclasses, which is a lot of class bits
to burn on a side feature.

Therefore, switch to manually setting the lockdep class of the rt
metadata ILOCKs.  In the next patch we'll remove the rt-related ILOCK
subclasses.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rtalloc.c |   36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 7156a5b55cfb6..a033d3361b561 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,6 +25,16 @@
 #include "xfs_da_format.h"
 #include "xfs_imeta.h"
 
+/*
+ * Realtime metadata files are not quite regular files because userspace can't
+ * access the realtime bitmap directly, and because we take the ILOCK of the rt
+ * bitmap file while holding the ILOCK of a regular realtime file.  This double
+ * locking confuses lockdep, so create different lockdep classes here to help
+ * it keep things straight.
+ */
+static struct lock_class_key xfs_rbmip_key;
+static struct lock_class_key xfs_rsumip_key;
+
 /*
  * Read and return the summary information for a given extent size,
  * bitmap block combination.
@@ -1330,6 +1340,28 @@ xfs_rtalloc_reinit_frextents(
 	return 0;
 }
 
+static inline int
+__xfs_rt_iget(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	struct lock_class_key	*lockdep_key,
+	const char		*lockdep_key_name,
+	struct xfs_inode	**ipp)
+{
+	int			error;
+
+	error = xfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, ipp);
+	if (error)
+		return error;
+
+	lockdep_set_class_and_name(&(*ipp)->i_lock.mr_lock, lockdep_key,
+			lockdep_key_name);
+	return 0;
+}
+
+#define xfs_rt_iget(tp, ino, lockdep_key, ipp) \
+	__xfs_rt_iget((tp), (ino), (lockdep_key), #lockdep_key, (ipp))
+
 /*
  * Read in the bmbt of an rt metadata inode so that we never have to load them
  * at runtime.  This enables the use of shared ILOCKs for rtbitmap scans.  Use
@@ -1376,7 +1408,7 @@ xfs_rtmount_inodes(
 	if (error)
 		return error;
 
-	error = xfs_imeta_iget(tp, mp->m_sb.sb_rbmino, XFS_DIR3_FT_REG_FILE,
+	error = xfs_rt_iget(tp, mp->m_sb.sb_rbmino, &xfs_rbmip_key,
 			&mp->m_rbmip);
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
@@ -1388,7 +1420,7 @@ xfs_rtmount_inodes(
 	if (error)
 		goto out_rele_bitmap;
 
-	error = xfs_imeta_iget(tp, mp->m_sb.sb_rsumino, XFS_DIR3_FT_REG_FILE,
+	error = xfs_rt_iget(tp, mp->m_sb.sb_rsumino, &xfs_rsumip_key,
 			&mp->m_rsumip);
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/4] xfs: refactor realtime inode locking
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
  2023-12-31 21:14   ` [PATCH 1/4] xfs: refactor realtime scrubbing context management Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 2/4] xfs: use separate lock classes for realtime metadata inode ILOCKs Darrick J. Wong
@ 2023-12-31 21:15   ` Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 4/4] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:15 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create helper functions to deal with locking realtime metadata inodes.
This enables us to maintain correct locking order once we start adding
the realtime rmap and refcount btree inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c     |    7 +----
 fs/xfs/libxfs/xfs_rtbitmap.c |   57 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtbitmap.h |   17 +++++++++++++
 fs/xfs/scrub/common.c        |    1 +
 fs/xfs/scrub/fscounters.c    |    4 +--
 fs/xfs/xfs_bmap_util.c       |    5 +---
 fs/xfs/xfs_fsmap.c           |    4 +--
 fs/xfs/xfs_rtalloc.c         |   15 ++++-------
 8 files changed, 87 insertions(+), 23 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6d4e4861b4f5c..c10ed52433979 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5376,12 +5376,9 @@ __xfs_bunmapi(
 
 	if (isrt) {
 		/*
-		 * Synchronize by locking the bitmap inode.
+		 * Synchronize by locking the realtime bitmap.
 		 */
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
-		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
-		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_rtbitmap_lock(tp, mp);
 	}
 
 	extno = 0;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f43f31dca69d7..eafdda22edcdf 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1194,3 +1194,60 @@ xfs_rtsummary_wordcount(
 	blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
 	return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
 }
+
+/*
+ * Lock both realtime free space metadata inodes for a freespace update.  If a
+ * transaction is given, the inodes will be joined to the transaction and the
+ * ILOCKs will be released on transaction commit.
+ */
+void
+xfs_rtbitmap_lock(
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp)
+{
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	if (tp)
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	if (tp)
+		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+}
+
+/* Unlock both realtime free space metadata inodes after a freespace update. */
+void
+xfs_rtbitmap_unlock(
+	struct xfs_mount	*mp)
+{
+	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+}
+
+/*
+ * Lock the realtime free space metadata inodes for a freespace scan.  Callers
+ * must walk metadata blocks in order of increasing file offset.
+ */
+void
+xfs_rtbitmap_lock_shared(
+	struct xfs_mount	*mp,
+	unsigned int		rbmlock_flags)
+{
+	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+
+	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+}
+
+/* Unlock the realtime free space metadata inodes after a freespace scan. */
+void
+xfs_rtbitmap_unlock_shared(
+	struct xfs_mount	*mp,
+	unsigned int		rbmlock_flags)
+{
+	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+
+	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 1c84b52de3d42..6ac17f0195ea1 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -374,6 +374,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
 		unsigned int rsumlevels, xfs_extlen_t rbmblocks);
 unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
 		unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+
+void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp);
+void xfs_rtbitmap_unlock(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RBMLOCK_BITMAP	(1U << 0)
+/* Lock the rt summary inode in shared mode */
+#define XFS_RBMLOCK_SUMMARY	(1U << 1)
+
+void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
+		unsigned int rbmlock_flags);
+void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
+		unsigned int rbmlock_flags);
 #else /* CONFIG_XFS_RT */
 # define xfs_rtfree_extent(t,b,l)			(-ENOSYS)
 # define xfs_rtfree_blocks(t,rb,rl)			(-ENOSYS)
@@ -394,6 +407,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
 # define xfs_rtbitmap_wordcount(mp, r)			(0)
 # define xfs_rtsummary_blockcount(mp, l, b)		(0)
 # define xfs_rtsummary_wordcount(mp, l, b)		(0)
+# define xfs_rtbitmap_lock(tp, mp)		do { } while (0)
+# define xfs_rtbitmap_unlock(mp)		do { } while (0)
+# define xfs_rtbitmap_lock_shared(mp, lf)	do { } while (0)
+# define xfs_rtbitmap_unlock_shared(mp, lf)	do { } while (0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 4ad58192f2e3d..53eec92df180a 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -33,6 +33,7 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_swapext.h"
+#include "xfs_rtbitmap.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index d310737c88236..2b4bd2eb71b57 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -415,7 +415,7 @@ xchk_fscount_count_frextents(
 	if (!xfs_has_realtime(mp))
 		return 0;
 
-	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
 	error = xfs_rtalloc_query_all(sc->mp, sc->tp,
 			xchk_fscount_add_frextent, fsc);
 	if (error) {
@@ -424,7 +424,7 @@ xchk_fscount_count_frextents(
 	}
 
 out_unlock:
-	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
 	return error;
 }
 #else
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index d1a57164de032..ef8658a9724dd 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -131,10 +131,7 @@ xfs_bmap_rtalloc(
 	 * Lock out modifications to both the RT bitmap and summary inodes
 	 */
 	if (!rtlocked) {
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
-		xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
-		xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_rtbitmap_lock(ap->tp, mp);
 		rtlocked = true;
 	}
 
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a72217f5feb9..c49a5b01c3e0c 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -533,7 +533,7 @@ xfs_getfsmap_rtdev_rtbitmap(
 	trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
 	trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
 
-	xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
 
 	/*
 	 * Set up query parameters to return free rtextents covering the range
@@ -557,7 +557,7 @@ xfs_getfsmap_rtdev_rtbitmap(
 	if (error)
 		goto err;
 err:
-	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
 	return error;
 }
 #endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a033d3361b561..8852d4f95b1ad 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1066,10 +1066,10 @@ xfs_growfs_rt(
 		nargs.tp = tp;
 
 		/*
-		 * Lock out other callers by grabbing the bitmap inode lock.
+		 * Lock out other callers by grabbing the bitmap and summary
+		 * inode locks and joining them to the transaction.
 		 */
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
-		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_rtbitmap_lock(tp, mp);
 		/*
 		 * Update the bitmap inode's size ondisk and incore.  We need
 		 * to update the incore size so that inode inactivation won't
@@ -1079,11 +1079,6 @@ xfs_growfs_rt(
 			nsbp->sb_rbmblocks * nsbp->sb_blocksize;
 		i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
 		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-		/*
-		 * Get the summary inode into the transaction.
-		 */
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
-		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 		/*
 		 * Update the summary inode's size.  We need to update the
 		 * incore size so that inode inactivation won't punch what it
@@ -1326,10 +1321,10 @@ xfs_rtalloc_reinit_frextents(
 	uint64_t		val = 0;
 	int			error;
 
-	xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
 	error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
 			&val);
-	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
 	if (error)
 		return error;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/4] xfs: remove XFS_ILOCK_RT*
  2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:15   ` [PATCH 3/4] xfs: refactor realtime inode locking Darrick J. Wong
@ 2023-12-31 21:15   ` Darrick J. Wong
  3 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:15 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we've centralized the realtime metadata locking routines, get
rid of the ILOCK subclasses since we now use explicit lockdep classes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtbitmap.c |   16 ++++++++--------
 fs/xfs/scrub/common.c        |   10 +++++-----
 fs/xfs/xfs_inode.c           |    3 +--
 fs/xfs/xfs_inode.h           |   13 ++++---------
 fs/xfs/xfs_rtalloc.c         |   11 +++++------
 5 files changed, 23 insertions(+), 30 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index eafdda22edcdf..795035556c4b4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1205,11 +1205,11 @@ xfs_rtbitmap_lock(
 	struct xfs_trans	*tp,
 	struct xfs_mount	*mp)
 {
-	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
 	if (tp)
 		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 
-	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
 	if (tp)
 		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 }
@@ -1219,8 +1219,8 @@ void
 xfs_rtbitmap_unlock(
 	struct xfs_mount	*mp)
 {
-	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
-	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL);
+	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
 }
 
 /*
@@ -1233,10 +1233,10 @@ xfs_rtbitmap_lock_shared(
 	unsigned int		rbmlock_flags)
 {
 	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
 
 	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED);
 }
 
 /* Unlock the realtime free space metadata inodes after a freespace scan. */
@@ -1246,8 +1246,8 @@ xfs_rtbitmap_unlock_shared(
 	unsigned int		rbmlock_flags)
 {
 	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
-		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED);
 
 	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
-		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
 }
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 53eec92df180a..e9294a933c3ab 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -701,14 +701,14 @@ xchk_rt_init(
 					 XCHK_RTLOCK_SUMMARY_SHARED)) < 2);
 
 	if (rtlock_flags & XCHK_RTLOCK_BITMAP)
-		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_EXCL);
 	else if (rtlock_flags & XCHK_RTLOCK_BITMAP_SHARED)
-		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED);
 
 	if (rtlock_flags & XCHK_RTLOCK_SUMMARY)
-		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_EXCL);
 	else if (rtlock_flags & XCHK_RTLOCK_SUMMARY_SHARED)
-		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+		xfs_ilock(sc->mp->m_rsumip, XFS_ILOCK_SHARED);
 
 	sr->rtlock_flags = rtlock_flags;
 }
@@ -745,7 +745,7 @@ xchk_rt_unlock_rtbitmap(
 {
 	ASSERT(sc->sr.rtlock_flags & XCHK_RTLOCK_BITMAP_SHARED);
 
-	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED);
 	sc->sr.rtlock_flags &= ~XCHK_RTLOCK_BITMAP_SHARED;
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 322fa538aec4b..78afc5b8e11c6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -364,8 +364,7 @@ xfs_lock_inumorder(
 {
 	uint	class = 0;
 
-	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
-			      XFS_ILOCK_RTSUM)));
+	ASSERT(!(lock_mode & XFS_ILOCK_PARENT));
 	ASSERT(xfs_lockdep_subclass_ok(subclass));
 
 	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e967fa10721f9..e33b270c6b508 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -437,9 +437,8 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
  * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
  * limited to the subclasses we can represent via nesting. We need at least
  * 5 inodes nest depth for the ILOCK through rename, and we also have to support
- * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
- * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
- * 8 subclasses supported by lockdep.
+ * XFS_ILOCK_PARENT, which gives 6 subclasses.  That's 6 of the 8 subclasses
+ * supported by lockdep.
  *
  * This also means we have to number the sub-classes in the lowest bits of
  * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
@@ -465,8 +464,8 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
  * ILOCK values
  * 0-4		subclass values
  * 5		PARENT subclass (not nestable)
- * 6		RTBITMAP subclass (not nestable)
- * 7		RTSUM subclass (not nestable)
+ * 6		unused
+ * 7		unused
  * 
  */
 #define XFS_IOLOCK_SHIFT		16
@@ -481,12 +480,8 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
 #define XFS_ILOCK_SHIFT			24
 #define XFS_ILOCK_PARENT_VAL		5u
 #define XFS_ILOCK_MAX_SUBCLASS		(XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL		6u
-#define XFS_ILOCK_RTSUM_VAL		7u
 #define XFS_ILOCK_DEP_MASK		0xff000000u
 #define	XFS_ILOCK_PARENT		(XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
-#define	XFS_ILOCK_RTBITMAP		(XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
-#define	XFS_ILOCK_RTSUM			(XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
 
 #define XFS_LOCK_SUBCLASS_MASK	(XFS_IOLOCK_DEP_MASK | \
 				 XFS_MMAPLOCK_DEP_MASK | \
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8852d4f95b1ad..f76ecb9a19b51 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1365,12 +1365,11 @@ __xfs_rt_iget(
 static inline int
 xfs_rtmount_iread_extents(
 	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	unsigned int		lock_class)
+	struct xfs_inode	*ip)
 {
 	int			error;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
 	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
 	if (error)
@@ -1383,7 +1382,7 @@ xfs_rtmount_iread_extents(
 	}
 
 out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -1411,7 +1410,7 @@ xfs_rtmount_inodes(
 		goto out_trans;
 	ASSERT(mp->m_rbmip != NULL);
 
-	error = xfs_rtmount_iread_extents(tp, mp->m_rbmip, XFS_ILOCK_RTBITMAP);
+	error = xfs_rtmount_iread_extents(tp, mp->m_rbmip);
 	if (error)
 		goto out_rele_bitmap;
 
@@ -1423,7 +1422,7 @@ xfs_rtmount_inodes(
 		goto out_rele_bitmap;
 	ASSERT(mp->m_rsumip != NULL);
 
-	error = xfs_rtmount_iread_extents(tp, mp->m_rsumip, XFS_ILOCK_RTSUM);
+	error = xfs_rtmount_iread_extents(tp, mp->m_rsumip);
 	if (error)
 		goto out_rele_summary;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/24] xfs: create incore realtime group structures
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
@ 2023-12-31 21:15   ` Darrick J. Wong
  2023-12-31 21:16   ` [PATCH 02/24] xfs: define the format of rt groups Darrick J. Wong
                     ` (22 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:15 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an incore object that will contain information about a realtime
allocation group.  This will eventually enable us to shard the realtime
section in a similar manner to how we shard the data section.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile             |    1 
 fs/xfs/libxfs/xfs_format.h  |    8 +
 fs/xfs/libxfs/xfs_rtgroup.c |  251 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |  132 +++++++++++++++++++++++
 fs/xfs/libxfs/xfs_sb.c      |    5 +
 fs/xfs/libxfs/xfs_types.h   |    4 +
 fs/xfs/xfs_log_recover.c    |    6 +
 fs/xfs/xfs_mount.c          |   12 ++
 fs/xfs/xfs_mount.h          |    6 +
 fs/xfs/xfs_rtalloc.c        |    8 +
 fs/xfs/xfs_super.c          |    2 
 fs/xfs/xfs_trace.c          |    1 
 fs/xfs/xfs_trace.h          |   38 +++++++
 13 files changed, 473 insertions(+), 1 deletion(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5362a0fb56d77..500dea292a9d6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -59,6 +59,7 @@ xfs-y				+= $(addprefix libxfs/, \
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
 				   xfs_rtbitmap.o \
+				   xfs_rtgroup.o \
 				   )
 
 # highlevel code
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 0636ca97622dd..3bd93c01bf4bf 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -184,6 +184,14 @@ typedef struct xfs_sb {
 	 */
 	xfs_ino_t	sb_metadirino;
 
+	/*
+	 * Realtime group geometry information.  On disk these fields live in
+	 * the rsumino slot, but we cache them separately in the in-core super
+	 * for easy access.
+	 */
+	xfs_rgblock_t	sb_rgblocks;	/* size of a realtime group */
+	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
new file mode 100644
index 0000000000000..caa82c4813038
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+
+/*
+ * Passive reference counting access wrappers to the rtgroup structures.  If
+ * the rtgroup structure is to be freed, the freeing code is responsible for
+ * cleaning up objects with passive references before freeing the structure.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_get(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = radix_tree_lookup(&mp->m_rtgroup_tree, rgno);
+	if (rtg) {
+		trace_xfs_rtgroup_get(rtg, _RET_IP_);
+		ASSERT(atomic_read(&rtg->rtg_ref) >= 0);
+		atomic_inc(&rtg->rtg_ref);
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+/* Get a passive reference to the given rtgroup. */
+struct xfs_rtgroup *
+xfs_rtgroup_hold(
+	struct xfs_rtgroup	*rtg)
+{
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0 ||
+	       atomic_read(&rtg->rtg_active_ref) > 0);
+
+	trace_xfs_rtgroup_hold(rtg, _RET_IP_);
+	atomic_inc(&rtg->rtg_ref);
+	return rtg;
+}
+
+void
+xfs_rtgroup_put(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_put(rtg, _RET_IP_);
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0);
+	atomic_dec(&rtg->rtg_ref);
+}
+
+/*
+ * Active references for rtgroup structures. This is for short term access to
+ * the rtgroup structures for walking trees or accessing state. If an rtgroup
+ * is being shrunk or is offline, then this will fail to find that group and
+ * return NULL instead.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_grab(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = radix_tree_lookup(&mp->m_rtgroup_tree, agno);
+	if (rtg) {
+		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
+		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
+			rtg = NULL;
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+void
+xfs_rtgroup_rele(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
+	if (atomic_dec_and_test(&rtg->rtg_active_ref))
+		wake_up(&rtg->rtg_active_wq);
+}
+
+int
+xfs_initialize_rtgroups(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgcount)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		index;
+	xfs_rgnumber_t		first_initialised = NULLRGNUMBER;
+	int			error;
+
+	if (!xfs_has_rtgroups(mp))
+		return 0;
+
+	/*
+	 * Walk the current rtgroup tree so we don't try to initialise rt
+	 * groups that already exist (growfs case). Allocate and insert all the
+	 * rtgroups we don't find ready for initialisation.
+	 */
+	for (index = 0; index < rgcount; index++) {
+		rtg = xfs_rtgroup_get(mp, index);
+		if (rtg) {
+			xfs_rtgroup_put(rtg);
+			continue;
+		}
+
+		rtg = kmem_zalloc(sizeof(struct xfs_rtgroup), KM_MAYFAIL);
+		if (!rtg) {
+			error = -ENOMEM;
+			goto out_unwind_new_rtgs;
+		}
+		rtg->rtg_rgno = index;
+		rtg->rtg_mount = mp;
+
+		error = radix_tree_preload(GFP_NOFS);
+		if (error)
+			goto out_free_rtg;
+
+		spin_lock(&mp->m_rtgroup_lock);
+		if (radix_tree_insert(&mp->m_rtgroup_tree, index, rtg)) {
+			WARN_ON_ONCE(1);
+			spin_unlock(&mp->m_rtgroup_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_free_rtg;
+		}
+		spin_unlock(&mp->m_rtgroup_lock);
+		radix_tree_preload_end();
+
+#ifdef __KERNEL__
+		/* Place kernel structure only init below this point. */
+		spin_lock_init(&rtg->rtg_state_lock);
+		init_waitqueue_head(&rtg->rtg_active_wq);
+#endif /* __KERNEL__ */
+
+		/* Active ref owned by mount indicates rtgroup is online. */
+		atomic_set(&rtg->rtg_active_ref, 1);
+
+		/* first new rtg is fully initialized */
+		if (first_initialised == NULLRGNUMBER)
+			first_initialised = index;
+	}
+
+	return 0;
+
+out_free_rtg:
+	kmem_free(rtg);
+out_unwind_new_rtgs:
+	/* unwind any prior newly initialized rtgs */
+	for (index = first_initialised; index < rgcount; index++) {
+		rtg = radix_tree_delete(&mp->m_rtgroup_tree, index);
+		if (!rtg)
+			break;
+		kmem_free(rtg);
+	}
+	return error;
+}
+
+STATIC void
+__xfs_free_rtgroups(
+	struct rcu_head		*head)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rtg = container_of(head, struct xfs_rtgroup, rcu_head);
+	kmem_free(rtg);
+}
+
+/*
+ * Free up the rtgroup resources associated with the mount structure.
+ */
+void
+xfs_free_rtgroups(
+	struct xfs_mount	*mp)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	if (!xfs_has_rtgroups(mp))
+		return;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		spin_lock(&mp->m_rtgroup_lock);
+		rtg = radix_tree_delete(&mp->m_rtgroup_tree, rgno);
+		spin_unlock(&mp->m_rtgroup_lock);
+		ASSERT(rtg);
+		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+
+		/* drop the mount's active reference */
+		xfs_rtgroup_rele(rtg);
+		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_active_ref) != 0);
+
+		call_rcu(&rtg->rcu_head, __xfs_free_rtgroups);
+	}
+}
+
+/* Find the size of the rtgroup, in blocks. */
+static xfs_rgblock_t
+__xfs_rtgroup_block_count(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgnumber_t		rgcount,
+	xfs_rfsblock_t		rblocks)
+{
+	ASSERT(rgno < rgcount);
+
+	if (rgno < rgcount - 1)
+		return mp->m_sb.sb_rgblocks;
+	return xfs_rtb_rounddown_rtx(mp,
+			rblocks - (rgno * mp->m_sb.sb_rgblocks));
+}
+
+/* Compute the number of blocks in this realtime group. */
+xfs_rgblock_t
+xfs_rtgroup_block_count(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	return __xfs_rtgroup_block_count(mp, rgno, mp->m_sb.sb_rgcount,
+			mp->m_sb.sb_rblocks);
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
new file mode 100644
index 0000000000000..2f0a670217c48
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_RTGROUP_H
+#define __LIBXFS_RTGROUP_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Realtime group incore structure, similar to the per-AG structure.
+ */
+struct xfs_rtgroup {
+	struct xfs_mount	*rtg_mount;
+	xfs_rgnumber_t		rtg_rgno;
+	atomic_t		rtg_ref;	/* passive reference count */
+	atomic_t		rtg_active_ref;	/* active reference count */
+	wait_queue_head_t	rtg_active_wq;/* woken active_ref falls to zero */
+
+	/* for rcu-safe freeing */
+	struct rcu_head		rcu_head;
+
+	/* Number of blocks in this group */
+	xfs_rgblock_t		rtg_blockcount;
+
+#ifdef __KERNEL__
+	/* -- kernel only structures below this line -- */
+	spinlock_t		rtg_state_lock;
+#endif /* __KERNEL__ */
+};
+
+#ifdef CONFIG_XFS_RT
+/* Passive rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_get(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+struct xfs_rtgroup *xfs_rtgroup_hold(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_put(struct xfs_rtgroup *rtg);
+
+/* Active rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_grab(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_rele(struct xfs_rtgroup *rtg);
+
+int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t rgcount);
+void xfs_free_rtgroups(struct xfs_mount *mp);
+#else
+static inline struct xfs_rtgroup *
+xfs_rtgroup_get(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	return NULL;
+}
+static inline struct xfs_rtgroup *
+xfs_rtgroup_hold(struct xfs_rtgroup *rtg)
+{
+	ASSERT(rtg == NULL);
+	return NULL;
+}
+# define xfs_rtgroup_grab			xfs_rtgroup_get
+# define xfs_rtgroup_put(rtg)			((void)0)
+# define xfs_rtgroup_rele(rtg)			((void)0)
+# define xfs_initialize_rtgroups(mp, rgcount)	(0)
+# define xfs_free_rtgroups(mp)			((void)0)
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * rt group iteration APIs
+ */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgnumber_t		*rgno,
+	xfs_rgnumber_t		end_rgno)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	*rgno = rtg->rtg_rgno + 1;
+	xfs_rtgroup_rele(rtg);
+	if (*rgno > end_rgno)
+		return NULL;
+	return xfs_rtgroup_grab(mp, *rgno);
+}
+
+#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \
+	for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \
+		(rtg) != NULL; \
+		(rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno)))
+
+#define for_each_rtgroup_from(mp, rgno, rtg) \
+	for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg))
+
+
+#define for_each_rtgroup(mp, rgno, rtg) \
+	(rgno) = 0; \
+	for_each_rtgroup_from((mp), (rgno), (rtg))
+
+static inline bool
+xfs_verify_rgbno(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno)
+{
+	if (rgbno >= rtg->rtg_blockcount)
+		return false;
+	if (rgbno < rtg->rtg_mount->m_sb.sb_rextsize)
+		return false;
+	return true;
+}
+
+static inline bool
+xfs_verify_rgbext(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno,
+	xfs_rgblock_t		len)
+{
+	if (rgbno + len <= rgbno)
+		return false;
+
+	if (!xfs_verify_rgbno(rtg, rgbno))
+		return false;
+
+	return xfs_verify_rgbno(rtg, rgbno + len - 1);
+}
+
+#ifdef CONFIG_XFS_RT
+xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
+		xfs_rgnumber_t rgno);
+#else
+# define xfs_rtgroup_block_count(mp, rgno)	(0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 7a20b9b4bccb3..88402cb4a6879 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -701,6 +701,9 @@ __xfs_sb_from_disk(
 		to->sb_gquotino = NULLFSINO;
 		to->sb_pquotino = NULLFSINO;
 	}
+
+	to->sb_rgcount = 0;
+	to->sb_rgblocks = 0;
 }
 
 void
@@ -1015,6 +1018,8 @@ xfs_sb_mount_common(
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+	mp->m_rgblklog = 0;
+	mp->m_rgblkmask = 0;
 
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 5556615a2ff9c..195471c438599 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -9,10 +9,12 @@
 typedef uint32_t	prid_t;		/* project ID */
 
 typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
 typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
 typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
 typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
 typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
+typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
 typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
 typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
 typedef int64_t		xfs_fsize_t;	/* bytes in a file */
@@ -54,7 +56,9 @@ typedef void *		xfs_failaddr_t;
 #define	NULLRTEXTNO	((xfs_rtxnum_t)-1)
 
 #define	NULLAGBLOCK	((xfs_agblock_t)-1)
+#define NULLRGBLOCK	((xfs_rgblock_t)-1)
 #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
 
 #define NULLCOMMITLSN	((xfs_lsn_t)-1)
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 628d7915120b0..5e9562f37dc89 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -28,6 +28,7 @@
 #include "xfs_ag.h"
 #include "xfs_quota.h"
 #include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -3347,6 +3348,11 @@ xlog_do_recover(
 		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
 		return error;
 	}
+	error = xfs_initialize_rtgroups(mp, sbp->sb_rgcount);
+	if (error) {
+		xfs_warn(mp, "Failed post-recovery rtgroup init: %d", error);
+		return error;
+	}
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
 	/* Normal transactions can now occur */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b61b8c51cd16..ba6c77e7e265d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -35,6 +35,7 @@
 #include "xfs_trace.h"
 #include "xfs_ag.h"
 #include "xfs_imeta.h"
+#include "xfs_rtgroup.h"
 #include "scrub/stats.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -842,10 +843,16 @@ xfs_mountfs(
 		goto out_free_dir;
 	}
 
+	error = xfs_initialize_rtgroups(mp, sbp->sb_rgcount);
+	if (error) {
+		xfs_warn(mp, "Failed rtgroup init: %d", error);
+		goto out_free_perag;
+	}
+
 	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
 		xfs_warn(mp, "no log defined");
 		error = -EFSCORRUPTED;
-		goto out_free_perag;
+		goto out_free_rtgroup;
 	}
 
 	error = xfs_inodegc_register_shrinker(mp);
@@ -1070,6 +1077,8 @@ xfs_mountfs(
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
 		xfs_buftarg_drain(mp->m_logdev_targp);
 	xfs_buftarg_drain(mp->m_ddev_targp);
+ out_free_rtgroup:
+	xfs_free_rtgroups(mp);
  out_free_perag:
 	xfs_free_perag(mp);
  out_free_dir:
@@ -1151,6 +1160,7 @@ xfs_unmountfs(
 	xfs_errortag_clearall(mp);
 #endif
 	shrinker_free(mp->m_inodegc_shrinker);
+	xfs_free_rtgroups(mp);
 	xfs_free_perag(mp);
 
 	xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b45a4f4a8503e..52976a133cec9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -121,6 +121,7 @@ typedef struct xfs_mount {
 	uint8_t			m_agno_log;	/* log #ag's */
 	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
 	int8_t			m_rtxblklog;	/* log2 of rextsize, if possible */
+	int8_t			m_rgblklog;	/* log2 of rt group sz if possible */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
@@ -155,6 +156,7 @@ typedef struct xfs_mount {
 	uint64_t		m_low_space[XFS_LOWSP_MAX];
 	uint64_t		m_low_rtexts[XFS_LOWSP_MAX];
 	uint64_t		m_rtxblkmask;	/* rt extent block mask */
+	uint64_t		m_rgblkmask;	/* rt group block mask */
 	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
 	struct xfs_trans_resv	m_resv;		/* precomputed res values */
 						/* low free space thresholds */
@@ -203,6 +205,8 @@ typedef struct xfs_mount {
 	 */
 	atomic64_t		m_allocbt_blks;
 
+	struct radix_tree_root	m_rtgroup_tree;	/* per-rt group info */
+	spinlock_t		m_rtgroup_lock;	/* lock for m_rtgroup_tree */
 	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
 	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	uint64_t		m_resblks;	/* total reserved blocks */
@@ -294,6 +298,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
 #define XFS_FEAT_METADIR	(1ULL << 27)	/* metadata directory tree */
+#define XFS_FEAT_RTGROUPS	(1ULL << 28)	/* realtime groups */
 
 /* Mount features */
 #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
@@ -358,6 +363,7 @@ __XFS_HAS_FEAT(bigtime, BIGTIME)
 __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(metadir, METADIR)
+__XFS_HAS_FEAT(rtgroups, RTGROUPS)
 
 /*
  * Mount features
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f76ecb9a19b51..59ded74c9007e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -24,6 +24,7 @@
 #include "xfs_health.h"
 #include "xfs_da_format.h"
 #include "xfs_imeta.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Realtime metadata files are not quite regular files because userspace can't
@@ -1396,6 +1397,8 @@ xfs_rtmount_inodes(
 {
 	struct xfs_trans	*tp;
 	struct xfs_sb		*sbp = &mp->m_sb;
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
 	int			error;
 
 	error = xfs_trans_alloc_empty(mp, &tp);
@@ -1426,6 +1429,11 @@ xfs_rtmount_inodes(
 	if (error)
 		goto out_rele_summary;
 
+	for_each_rtgroup(mp, rgno, rtg) {
+		rtg->rtg_blockcount = xfs_rtgroup_block_count(mp,
+							      rtg->rtg_rgno);
+	}
+
 	xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
 	xfs_trans_cancel(tp);
 	return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 998fa006af5bb..c154e2cb7a18e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2002,6 +2002,8 @@ static int xfs_init_fs_context(
 	spin_lock_init(&mp->m_sb_lock);
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_rtgroup_tree, GFP_ATOMIC);
+	spin_lock_init(&mp->m_rtgroup_lock);
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 453cf7ffdea03..21f55b0125fc4 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -44,6 +44,7 @@
 #include "xfs_xchgrange.h"
 #include "xfs_parent.h"
 #include "xfs_imeta.h"
+#include "xfs_rtgroup.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 17f364e1b9613..81c21000d4fea 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -87,6 +87,7 @@ struct xfs_getparents;
 struct xfs_parent_name_irec;
 struct xfs_attrlist_cursor_kern;
 struct xfs_imeta_update;
+struct xfs_rtgroup;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -216,6 +217,43 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_rtgroup_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned long caller_ip),
+	TP_ARGS(rtg, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(int, refcount)
+		__field(int, active_refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->refcount = atomic_read(&rtg->rtg_ref);
+		__entry->active_refcount = atomic_read(&rtg->rtg_active_ref);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d rgno 0x%x passive refs %d active refs %d caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->refcount,
+		  __entry->active_refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_RTGROUP_REF_EVENT(name)	\
+DEFINE_EVENT(xfs_rtgroup_class, name,	\
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned long caller_ip), \
+	TP_ARGS(rtg, caller_ip))
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_get);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_hold);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_put);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_grab);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_rele);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xfs_inodegc_worker,
 	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
 	TP_ARGS(mp, shrinker_hits),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/24] xfs: define the format of rt groups
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 01/24] xfs: create incore realtime group structures Darrick J. Wong
@ 2023-12-31 21:16   ` Darrick J. Wong
  2023-12-31 21:16   ` [PATCH 03/24] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
                     ` (21 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:16 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define the ondisk format of realtime group metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h  |   62 +++++++++++++++++++++++++++-
 fs/xfs/libxfs/xfs_ondisk.h  |    1 
 fs/xfs/libxfs/xfs_rtgroup.c |   96 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |   83 +++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_sb.c      |   86 +++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_shared.h  |    1 
 6 files changed, 324 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 3bd93c01bf4bf..8debe92571692 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -216,7 +216,17 @@ struct xfs_dsb {
 	 * pointers are no longer used.
 	 */
 	__be64		sb_rbmino;
-	__be64		sb_rsumino;	/* summary inode for rt bitmap */
+	/*
+	 * rtgroups requires metadir, so we reuse the rsumino space to hold
+	 * the rg block count and shift values.
+	 */
+	union {
+		__be64	sb_rsumino;	/* summary inode for rt bitmap */
+		struct {
+			__be32	sb_rgcount;	/* # of realtime groups */
+			__be32	sb_rgblocks;	/* rtblocks per group */
+		};
+	};
 	__be32		sb_rextsize;	/* realtime extent size, blocks */
 	__be32		sb_agblocks;	/* size of an allocation group */
 	__be32		sb_agcount;	/* number of allocation groups */
@@ -398,6 +408,7 @@ xfs_sb_has_ro_compat_feature(
 #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4)	/* needs xfs_repair */
 #define XFS_SB_FEAT_INCOMPAT_NREXT64	(1 << 5)	/* large extent counters */
 #define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 6)	/* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_RTGROUPS	(1U << 30)	/* realtime groups */
 #define XFS_SB_FEAT_INCOMPAT_METADIR	(1U << 31)	/* metadata dir tree */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
 		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
@@ -752,6 +763,55 @@ union xfs_suminfo_raw {
 	__u32		old;
 };
 
+/*
+ * Realtime allocation groups break the rt section into multiple pieces that
+ * could be locked independently.  Realtime block group numbers are 32-bit
+ * quantities.  Block numbers within a group are also 32-bit quantities, but
+ * the upper bit must never be set.
+ */
+#define XFS_MAX_RGBLOCKS	((xfs_rgblock_t)(1U << 31) - 1)
+#define XFS_MAX_RGNUMBER	((xfs_rgnumber_t)(-1U))
+
+#define XFS_RTSB_MAGIC	0x58524750	/* 'XRGP' */
+
+/*
+ * Realtime superblock - on disk version.  Must be padded to 64 bit alignment.
+ * The first block of each realtime group contains this superblock; this is
+ * how we avoid having file data extents cross a group boundary.
+ */
+struct xfs_rtsb {
+	__be32		rsb_magicnum;	/* magic number == XFS_RTSB_MAGIC */
+	__be32		rsb_blocksize;	/* logical block size, bytes */
+	__be64		rsb_rblocks;	/* number of realtime blocks */
+
+	__be64		rsb_rextents;	/* number of realtime extents */
+	__be64		rsb_lsn;	/* last write sequence */
+
+	__be32		rsb_rgcount;	/* # of realtime groups */
+	unsigned char	rsb_fname[XFSLABEL_MAX]; /* rt volume name */
+
+	uuid_t		rsb_uuid;	/* user-visible file system unique id */
+
+	__be32		rsb_rextsize;	/* realtime extent size, blocks */
+	__be32		rsb_rbmblocks;	/* number of rt bitmap blocks */
+
+	__be32		rsb_rgblocks;	/* rt blocks per group */
+	__u8		rsb_blocklog;	/* log2 of sb_blocksize */
+	__u8		rsb_sectlog;	/* log2 of sb_sectsize */
+	__u8		rsb_rextslog;	/* log2 of sb_rextents */
+	__u8		rsb_pad;
+
+	__le32		rsb_crc;	/* superblock crc */
+	__le32		rsb_pad2;
+
+	uuid_t		rsb_meta_uuid;	/* metadata file system unique id */
+
+	/* must be padded to 64 bit alignment */
+};
+
+#define XFS_RTSB_CRC_OFF	offsetof(struct xfs_rtsb, rsb_crc)
+#define XFS_RTSB_DADDR		((xfs_daddr_t)0) /* daddr in rt section */
+
 /*
  * XFS Timestamps
  * ==============
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 832d96f0f3c54..65219d4cf99ca 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -53,6 +53,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,			4);
 	XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,			4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb,			104);
 
 	/* dir/attr trees */
 	XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr,	80);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index caa82c4813038..a82dd23cf0c79 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -28,6 +28,7 @@
 #include "xfs_trace.h"
 #include "xfs_inode.h"
 #include "xfs_icache.h"
+#include "xfs_buf_item.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtbitmap.h"
 
@@ -249,3 +250,98 @@ xfs_rtgroup_block_count(
 	return __xfs_rtgroup_block_count(mp, rgno, mp->m_sb.sb_rgcount,
 			mp->m_sb.sb_rblocks);
 }
+
+static xfs_failaddr_t
+xfs_rtsb_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_rtsb		*rsb = bp->b_addr;
+
+	if (!xfs_verify_magic(bp, rsb->rsb_magicnum))
+		return __this_address;
+	if (be32_to_cpu(rsb->rsb_blocksize) != mp->m_sb.sb_blocksize)
+		return __this_address;
+	if (be64_to_cpu(rsb->rsb_rblocks) != mp->m_sb.sb_rblocks)
+		return __this_address;
+
+	if (be64_to_cpu(rsb->rsb_rextents) != mp->m_sb.sb_rextents)
+		return __this_address;
+
+	if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid))
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rgcount) != mp->m_sb.sb_rgcount)
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rextsize) != mp->m_sb.sb_rextsize)
+		return __this_address;
+	if (be32_to_cpu(rsb->rsb_rbmblocks) != mp->m_sb.sb_rbmblocks)
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rgblocks) != mp->m_sb.sb_rgblocks)
+		return __this_address;
+	if (rsb->rsb_blocklog != mp->m_sb.sb_blocklog)
+		return __this_address;
+	if (rsb->rsb_sectlog != mp->m_sb.sb_sectlog)
+		return __this_address;
+	if (rsb->rsb_rextslog != mp->m_sb.sb_rextslog)
+		return __this_address;
+	if (rsb->rsb_pad)
+		return __this_address;
+
+	if (rsb->rsb_pad2)
+		return __this_address;
+
+	if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+		return __this_address;
+
+	/* Everything to the end of the fs block must be zero */
+	if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb)))
+		return __this_address;
+
+	return NULL;
+}
+
+static void
+xfs_rtsb_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtsb_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+}
+
+static void
+xfs_rtsb_write_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_rtsb		*rsb = bp->b_addr;
+	struct xfs_buf_log_item	*bip = bp->b_log_item;
+	xfs_failaddr_t		fa;
+
+	fa = xfs_rtsb_verify(bp);
+	if (fa) {
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+
+	if (bip)
+		rsb->rsb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_rtsb_buf_ops = {
+	.name = "xfs_rtsb",
+	.magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) },
+	.verify_read = xfs_rtsb_read_verify,
+	.verify_write = xfs_rtsb_write_verify,
+	.verify_struct = xfs_rtsb_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 2f0a670217c48..924c8c95acbc3 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -122,6 +122,89 @@ xfs_verify_rgbext(
 	return xfs_verify_rgbno(rtg, rgbno + len - 1);
 }
 
+static inline xfs_rtblock_t
+xfs_rgbno_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno)
+{
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0)
+		return ((xfs_rtblock_t)rgno << mp->m_rgblklog) | rgbno;
+
+	return ((xfs_rtblock_t)rgno * mp->m_sb.sb_rgblocks) + rgbno;
+}
+
+static inline xfs_rgnumber_t
+xfs_rtb_to_rgno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0)
+		return rtbno >> mp->m_rgblklog;
+
+	return div_u64(rtbno, mp->m_sb.sb_rgblocks);
+}
+
+static inline xfs_rgblock_t
+xfs_rtb_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno,
+	xfs_rgnumber_t		*rgno)
+{
+	uint32_t		rem;
+
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0) {
+		*rgno = rtbno >> mp->m_rgblklog;
+		return rtbno & mp->m_rgblkmask;
+	}
+
+	*rgno = div_u64_rem(rtbno, mp->m_sb.sb_rgblocks, &rem);
+	return rem;
+}
+
+static inline xfs_daddr_t
+xfs_rtb_to_daddr(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	return rtbno << mp->m_blkbb_log;
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	return daddr >> mp->m_blkbb_log;
+}
+
+static inline xfs_rgnumber_t
+xfs_daddr_to_rgno(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	xfs_rtblock_t		rtb = daddr >> mp->m_blkbb_log;
+
+	return xfs_rtb_to_rgno(mp, rtb);
+}
+
+static inline xfs_rgblock_t
+xfs_daddr_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	xfs_rtblock_t		rtb = daddr >> mp->m_blkbb_log;
+	xfs_rgnumber_t		rgno;
+
+	return xfs_rtb_to_rgbno(mp, rtb, &rgno);
+}
+
 #ifdef CONFIG_XFS_RT
 xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 		xfs_rgnumber_t rgno);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 88402cb4a6879..638de2f7c8dc1 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -180,6 +180,8 @@ xfs_sb_version_to_features(
 		features |= XFS_FEAT_PARENT;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
 		features |= XFS_FEAT_METADIR;
+	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS)
+		features |= XFS_FEAT_RTGROUPS;
 
 	return features;
 }
@@ -307,6 +309,64 @@ xfs_validate_sb_write(
 	return 0;
 }
 
+static int
+xfs_validate_sb_rtgroups(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*sbp)
+{
+	uint64_t		groups;
+
+	if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+		xfs_warn(mp,
+"Realtime groups require metadata directory tree.");
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks > XFS_MAX_RGBLOCKS) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be less than %u.",
+			 sbp->sb_rgblocks, XFS_MAX_RGBLOCKS);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rextsize == 0) {
+		xfs_warn(mp,
+"Realtime extent size must not be zero.");
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks % sbp->sb_rextsize != 0) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be an even multiple of extent size (%u).",
+			 sbp->sb_rgblocks, sbp->sb_rextsize);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks < (sbp->sb_rextsize << 1)) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be greater than 1 rt extent.",
+			 sbp->sb_rgblocks);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) {
+		xfs_warn(mp,
+"Realtime groups (%u) must be less than %u.",
+			 sbp->sb_rgcount, XFS_MAX_RGNUMBER);
+		return -EINVAL;
+	}
+
+	groups = howmany_64(sbp->sb_rblocks, sbp->sb_rgblocks);
+	if (groups != sbp->sb_rgcount) {
+		xfs_warn(mp,
+"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.",
+			sbp->sb_rgcount, groups);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /* Check the validity of the SB. */
 STATIC int
 xfs_validate_sb_common(
@@ -318,6 +378,7 @@ xfs_validate_sb_common(
 	uint32_t		agcount = 0;
 	uint32_t		rem;
 	bool			has_dalign;
+	int			error;
 
 	if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
 		xfs_warn(mp,
@@ -367,6 +428,12 @@ xfs_validate_sb_common(
 				return -EINVAL;
 			}
 		}
+
+		if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+			error = xfs_validate_sb_rtgroups(mp, sbp);
+			if (error)
+				return error;
+		}
 	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
 				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
 			xfs_notice(mp,
@@ -702,8 +769,13 @@ __xfs_sb_from_disk(
 		to->sb_pquotino = NULLFSINO;
 	}
 
-	to->sb_rgcount = 0;
-	to->sb_rgblocks = 0;
+	if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+		to->sb_rgcount = be32_to_cpu(from->sb_rgcount);
+		to->sb_rgblocks = be32_to_cpu(from->sb_rgblocks);
+	} else {
+		to->sb_rgcount = 0;
+		to->sb_rgblocks = 0;
+	}
 }
 
 void
@@ -863,6 +935,12 @@ xfs_sb_to_disk(
 		to->sb_gquotino = cpu_to_be64(NULLFSINO);
 		to->sb_pquotino = cpu_to_be64(NULLFSINO);
 	}
+
+	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+		/* must come after setting to_rsumino */
+		to->sb_rgcount = cpu_to_be32(from->sb_rgcount);
+		to->sb_rgblocks = cpu_to_be32(from->sb_rgblocks);
+	}
 }
 
 /*
@@ -1018,8 +1096,8 @@ xfs_sb_mount_common(
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
-	mp->m_rgblklog = 0;
-	mp->m_rgblkmask = 0;
+	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
+	mp->m_rgblkmask = mask64_if_power2(sbp->sb_rgblocks);
 
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 2cecebe018814..f76d2789e1c2d 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/24] xfs: reduce rt summary file levels for rtgroups filesystems
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
  2023-12-31 21:15   ` [PATCH 01/24] xfs: create incore realtime group structures Darrick J. Wong
  2023-12-31 21:16   ` [PATCH 02/24] xfs: define the format of rt groups Darrick J. Wong
@ 2023-12-31 21:16   ` Darrick J. Wong
  2023-12-31 21:16   ` [PATCH 04/24] xfs: check the realtime superblock at mount time Darrick J. Wong
                     ` (20 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:16 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The rt summary file is supposed to be large enough to track the number
of log2(rtextentcount) free space extents that start in a given rt
bitmap block.  Prior to rt groups, there could be a single 2^52 block
free extent, which implies a summary file with 53 levels.

However, each rtgroup uses its first rt extent to hold a superblock,
so there can't be any free extents longer than the length of a group.
Groups are limited to 2^32-1 blocks, which means that the longest
freespace will be counted in level 31.  Hence we only need 32 levels.

Adjust the rextslog computation to create smaller rt summary files for
rtgroups filesystems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h   |    2 +-
 fs/xfs/libxfs/xfs_rtbitmap.c |   11 +++++++++++
 fs/xfs/libxfs/xfs_rtbitmap.h |    4 ++--
 fs/xfs/libxfs/xfs_sb.c       |    2 +-
 fs/xfs/scrub/rtbitmap.c      |    2 +-
 fs/xfs/scrub/rtsummary.c     |    2 +-
 fs/xfs/xfs_rtalloc.c         |    5 +++--
 7 files changed, 20 insertions(+), 8 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8debe92571692..43e66740e2aec 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -313,7 +313,7 @@ struct xfs_dsb {
 
 #define	XFS_SB_VERSION_NUM(sbp)	((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
 
-static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp)
 {
 	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
 }
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 795035556c4b4..a8e5c702a7515 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1144,10 +1144,21 @@ xfs_rtbitmap_blockcount(
  */
 uint8_t
 xfs_compute_rextslog(
+	const struct xfs_sb	*sbp,
 	xfs_rtbxlen_t		rtextents)
 {
 	if (!rtextents)
 		return 0;
+
+	/*
+	 * Realtime groups are never larger than 2^32 extents and are never
+	 * fully free, so we can use highbit32 on the number of rtextents per
+	 * group.
+	 */
+	if (xfs_sb_is_v5(sbp) &&
+	    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
+		return xfs_highbit32(sbp->sb_rgblocks / sbp->sb_rextsize);
+
 	return xfs_highbit64(rtextents);
 }
 
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 6ac17f0195ea1..3de0ec2d24123 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -351,7 +351,7 @@ xfs_rtfree_extent(
 int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
 		xfs_filblks_t rtlen);
 
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+uint8_t xfs_compute_rextslog(const struct xfs_sb *sbp, xfs_rtbxlen_t rtextents);
 
 /* Do we support an rt volume having this number of rtextents? */
 static inline bool
@@ -396,7 +396,7 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
 # define xfs_rtsummary_read_buf(a,b)			(-ENOSYS)
 # define xfs_rtbuf_cache_relse(a)			(0)
 # define xfs_rtalloc_extent_is_free(m,t,s,l,i)		(-ENOSYS)
-# define xfs_compute_rextslog(rtx)			(0)
+# define xfs_compute_rextslog(sbp, rtx)			(0)
 # define xfs_validate_rtextents(rtx)			(false)
 static inline xfs_filblks_t
 xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 638de2f7c8dc1..6a6877c365ae9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -583,7 +583,7 @@ xfs_validate_sb_common(
 
 		if (!xfs_validate_rtextents(rexts) ||
 		    sbp->sb_rextents != rexts ||
-		    sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
+		    sbp->sb_rextslog != xfs_compute_rextslog(sbp, rexts) ||
 		    sbp->sb_rbmblocks != rbmblocks) {
 			xfs_notice(mp,
 				"realtime geometry sanity check failed");
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index c018376256a86..7f910fed7de95 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -61,7 +61,7 @@ xchk_setup_rtbitmap(
 	 */
 	if (mp->m_sb.sb_rblocks) {
 		rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
-		rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
+		rtb->rextslog = xfs_compute_rextslog(&mp->m_sb, rtb->rextents);
 		rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
 	}
 	return 0;
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 829036b50b0c1..df9aa29cb94c6 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -92,7 +92,7 @@ xchk_setup_rtsummary(
 		int		rextslog;
 
 		rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
-		rextslog = xfs_compute_rextslog(rts->rextents);
+		rextslog = xfs_compute_rextslog(&mp->m_sb, rts->rextents);
 		rts->rsumlevels = rextslog + 1;
 		rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
 		rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 59ded74c9007e..ba9116b6e8de7 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -981,7 +981,7 @@ xfs_growfs_rt(
 	if (!xfs_validate_rtextents(nrextents))
 		return -EINVAL;
 	nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
-	nrextslog = xfs_compute_rextslog(nrextents);
+	nrextslog = xfs_compute_rextslog(&mp->m_sb, nrextents);
 	nrsumlevels = nrextslog + 1;
 	nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
 	nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
@@ -1048,7 +1048,8 @@ xfs_growfs_rt(
 		nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
 		nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
 		ASSERT(nsbp->sb_rextents != 0);
-		nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
+		nsbp->sb_rextslog = xfs_compute_rextslog(&mp->m_sb,
+				nsbp->sb_rextents);
 		nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
 		nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
 				nsbp->sb_rbmblocks);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/24] xfs: check the realtime superblock at mount time
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:16   ` [PATCH 03/24] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
@ 2023-12-31 21:16   ` Darrick J. Wong
  2023-12-31 21:16   ` [PATCH 05/24] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
                     ` (19 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:16 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check the realtime superblock at mount time, to ensure that the label
actually matches the primary superblock.  If the rt superblock is good,
attach it to the xfs_mount so that the log can use ordered buffers to
keep this primary in sync with the primary super on the data device.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_mount.h   |    1 +
 fs/xfs/xfs_rtalloc.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_rtalloc.h |    5 +++++
 fs/xfs/xfs_super.c   |   16 ++++++++++++++--
 4 files changed, 70 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 52976a133cec9..7e86aa137d1fa 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -85,6 +85,7 @@ typedef struct xfs_mount {
 	struct super_block	*m_super;
 	struct xfs_ail		*m_ail;		/* fs active log item list */
 	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
+	struct xfs_buf		*m_rtsb_bp;	/* realtime superblock */
 	char			*m_rtname;	/* realtime device name */
 	char			*m_logname;	/* external log device name */
 	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ba9116b6e8de7..430d63e5ba751 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1252,6 +1252,56 @@ xfs_rtallocate_extent(
 	return 0;
 }
 
+/* Read the primary realtime group's superblock and attach it to the mount. */
+int
+xfs_rtmount_readsb(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	if (!xfs_has_rtgroups(mp))
+		return 0;
+	if (mp->m_sb.sb_rblocks == 0)
+		return 0;
+	if (mp->m_rtdev_targp == NULL) {
+		xfs_warn(mp,
+	"Filesystem has a realtime volume, use rtdev=device option");
+		return -ENODEV;
+	}
+
+	/* m_blkbb_log is not set up yet */
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
+			mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
+			&xfs_rtsb_buf_ops);
+	if (error) {
+		xfs_warn(mp, "rt sb validate failed with error %d.", error);
+		/* bad CRC means corrupted metadata */
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
+		return error;
+	}
+
+	mp->m_rtsb_bp = bp;
+	xfs_buf_unlock(bp);
+	return 0;
+}
+
+/* Detach the realtime superblock from the mount and free it. */
+void
+xfs_rtmount_freesb(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp = mp->m_rtsb_bp;
+
+	if (!bp)
+		return;
+
+	xfs_buf_lock(bp);
+	mp->m_rtsb_bp = NULL;
+	xfs_buf_relse(bp);
+}
+
 /*
  * Initialize realtime fields in the mount structure.
  */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7cb9ffe51ca6..b982b97cf073f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -33,6 +33,9 @@ xfs_rtallocate_extent(
 	xfs_rtxnum_t		*rtblock); /* out: start rtext allocated */
 
 
+int xfs_rtmount_readsb(struct xfs_mount *mp);
+void xfs_rtmount_freesb(struct xfs_mount *mp);
+
 /*
  * Initialize realtime fields in the mount structure.
  */
@@ -79,6 +82,8 @@ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
 # define xfs_rtpick_extent(m,t,l,rb)			(-ENOSYS)
 # define xfs_growfs_rt(mp,in)				(-ENOSYS)
 # define xfs_rtalloc_reinit_frextents(m)		(0)
+# define xfs_rtmount_readsb(mp)				(0)
+# define xfs_rtmount_freesb(mp)				((void)0)
 static inline int		/* error */
 xfs_rtmount_init(
 	xfs_mount_t	*mp)	/* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c154e2cb7a18e..f3a5e194c535b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_swapext_item.h"
 #include "xfs_parent.h"
+#include "xfs_rtalloc.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
@@ -1152,6 +1153,7 @@ xfs_fs_put_super(
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 
+	xfs_rtmount_freesb(mp);
 	xfs_freesb(mp);
 	xchk_mount_stats_free(mp);
 	free_percpu(mp->m_stats.xs_stats);
@@ -1671,9 +1673,13 @@ xfs_fs_fill_super(
 		goto out_free_sb;
 	}
 
+	error = xfs_rtmount_readsb(mp);
+	if (error)
+		goto out_free_sb;
+
 	error = xfs_filestream_mount(mp);
 	if (error)
-		goto out_free_sb;
+		goto out_free_rtsb;
 
 	/*
 	 * we must configure the block size in the superblock before we run the
@@ -1717,6 +1723,10 @@ xfs_fs_fill_super(
 		xfs_warn(mp,
 "EXPERIMENTAL metadata directory feature in use. Use at your own risk!");
 
+	if (xfs_has_rtgroups(mp))
+		xfs_warn(mp,
+"EXPERIMENTAL realtime allocation group feature in use. Use at your own risk!");
+
 	if (xfs_has_reflink(mp)) {
 		if (mp->m_sb.sb_rblocks) {
 			xfs_alert(mp,
@@ -1761,6 +1771,8 @@ xfs_fs_fill_super(
 
  out_filestream_unmount:
 	xfs_filestream_unmount(mp);
+ out_free_rtsb:
+	xfs_rtmount_freesb(mp);
  out_free_sb:
 	xfs_freesb(mp);
  out_free_scrub_stats:
@@ -1780,7 +1792,7 @@ xfs_fs_fill_super(
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	goto out_free_sb;
+	goto out_free_rtsb;
 }
 
 static int


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/24] xfs: update primary realtime super every time we update the primary fs super
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:16   ` [PATCH 04/24] xfs: check the realtime superblock at mount time Darrick J. Wong
@ 2023-12-31 21:16   ` Darrick J. Wong
  2023-12-31 21:17   ` [PATCH 06/24] xfs: write secondary realtime superblocks to disk Darrick J. Wong
                     ` (18 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:16 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Every time we update parts of the primary filesystem superblock that are
echoed in the primary rt super, we should update that primary realtime
super.  Avoid an ondisk log format change by using ordered buffers to
write the primary rt super.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtgroup.c   |   74 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h   |    6 +++
 fs/xfs/libxfs/xfs_sb.c        |   13 +++++++
 fs/xfs/xfs_buf_item_recover.c |   18 ++++++++++
 fs/xfs/xfs_trans.c            |   10 ++++++
 fs/xfs/xfs_trans.h            |    1 +
 fs/xfs/xfs_trans_buf.c        |   25 +++++++++++---
 7 files changed, 142 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index a82dd23cf0c79..325e4eb8f8781 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -345,3 +345,77 @@ const struct xfs_buf_ops xfs_rtsb_buf_ops = {
 	.verify_write = xfs_rtsb_write_verify,
 	.verify_struct = xfs_rtsb_verify,
 };
+
+/* Update a realtime superblock from the primary fs super */
+void
+xfs_rtgroup_update_super(
+	struct xfs_buf		*rtsb_bp,
+	const struct xfs_buf	*sb_bp)
+{
+	const struct xfs_dsb	*dsb = sb_bp->b_addr;
+	struct xfs_rtsb		*rsb = rtsb_bp->b_addr;
+	const uuid_t		*meta_uuid;
+
+	rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+	rsb->rsb_blocksize = dsb->sb_blocksize;
+	rsb->rsb_rblocks = dsb->sb_rblocks;
+
+	rsb->rsb_rextents = dsb->sb_rextents;
+	rsb->rsb_lsn = 0;
+
+	memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid));
+
+	rsb->rsb_rgcount = dsb->sb_rgcount;
+	memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX);
+
+	rsb->rsb_rextsize = dsb->sb_rextsize;
+	rsb->rsb_rbmblocks = dsb->sb_rbmblocks;
+
+	rsb->rsb_rgblocks = dsb->sb_rgblocks;
+	rsb->rsb_blocklog = dsb->sb_blocklog;
+	rsb->rsb_sectlog = dsb->sb_sectlog;
+	rsb->rsb_rextslog = dsb->sb_rextslog;
+	rsb->rsb_pad = 0;
+	rsb->rsb_pad2 = 0;
+
+	/*
+	 * The metadata uuid is the fs uuid if the metauuid feature is not
+	 * enabled.
+	 */
+	if (dsb->sb_features_incompat &
+				cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID))
+		meta_uuid = &dsb->sb_meta_uuid;
+	else
+		meta_uuid = &dsb->sb_uuid;
+	memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid));
+}
+
+/*
+ * Update the primary realtime superblock from a filesystem superblock and
+ * log it to the given transaction.
+ */
+void
+xfs_rtgroup_log_super(
+	struct xfs_trans	*tp,
+	const struct xfs_buf	*sb_bp)
+{
+	struct xfs_buf		*rtsb_bp;
+
+	if (!xfs_has_rtgroups(tp->t_mountp))
+		return;
+
+	rtsb_bp = xfs_trans_getrtsb(tp);
+	if (!rtsb_bp) {
+		/*
+		 * It's possible for the rtgroups feature to be enabled but
+		 * there is no incore rt superblock buffer if the rt geometry
+		 * was specified at mkfs time but the rt section has not yet
+		 * been attached.  In this case, rblocks must be zero.
+		 */
+		ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0);
+		return;
+	}
+
+	xfs_rtgroup_update_super(rtsb_bp, sb_bp);
+	xfs_trans_ordered_buf(tp, rtsb_bp);
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 924c8c95acbc3..83bbd43f7271a 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -208,8 +208,14 @@ xfs_daddr_to_rgbno(
 #ifdef CONFIG_XFS_RT
 xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 		xfs_rgnumber_t rgno);
+
+void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
+		const struct xfs_buf *sb_bp);
+void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
+# define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
+# define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6a6877c365ae9..b37924cf67a93 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -27,6 +27,7 @@
 #include "xfs_ag.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_swapext.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1161,6 +1162,8 @@ xfs_log_sb(
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+	xfs_rtgroup_log_super(tp, bp);
 }
 
 /*
@@ -1277,6 +1280,7 @@ xfs_sync_sb_buf(
 {
 	struct xfs_trans	*tp;
 	struct xfs_buf		*bp;
+	struct xfs_buf		*rtsb_bp = NULL;
 	int			error;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
@@ -1286,6 +1290,11 @@ xfs_sync_sb_buf(
 	bp = xfs_trans_getsb(tp);
 	xfs_log_sb(tp);
 	xfs_trans_bhold(tp, bp);
+	if (xfs_has_rtgroups(mp)) {
+		rtsb_bp = xfs_trans_getrtsb(tp);
+		if (rtsb_bp)
+			xfs_trans_bhold(tp, rtsb_bp);
+	}
 	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp);
 	if (error)
@@ -1294,7 +1303,11 @@ xfs_sync_sb_buf(
 	 * write out the sb buffer to get the changes to disk
 	 */
 	error = xfs_bwrite(bp);
+	if (!error && rtsb_bp)
+		error = xfs_bwrite(rtsb_bp);
 out:
+	if (rtsb_bp)
+		xfs_buf_relse(rtsb_bp);
 	xfs_buf_relse(bp);
 	return error;
 }
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 43167f543afc3..4be9a384dc6f7 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -22,6 +22,7 @@
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_quota.h"
+#include "xfs_rtgroup.h"
 
 /*
  * This is the number of entries in the l_buf_cancel_table used during
@@ -995,6 +996,23 @@ xlog_recover_buf_commit_pass2(
 		ASSERT(bp->b_mount == mp);
 		bp->b_flags |= _XBF_LOGRECOVERY;
 		xfs_buf_delwri_queue(bp, buffer_list);
+
+		/*
+		 * Update the primary rt super if we just recovered the primary
+		 * fs super.
+		 */
+		if (xfs_has_rtgroups(mp) && bp->b_ops == &xfs_sb_buf_ops) {
+			struct xfs_buf	*rtsb_bp = mp->m_rtsb_bp;
+
+			if (rtsb_bp) {
+				xfs_buf_lock(rtsb_bp);
+				xfs_buf_hold(rtsb_bp);
+				xfs_rtgroup_update_super(rtsb_bp, bp);
+				rtsb_bp->b_flags |= _XBF_LOGRECOVERY;
+				xfs_buf_delwri_queue(rtsb_bp, buffer_list);
+				xfs_buf_relse(rtsb_bp);
+			}
+		}
 	}
 
 out_release:
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 81337fdb89ab8..fab625f64189f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,6 +25,7 @@
 #include "xfs_dquot.h"
 #include "xfs_icache.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_trans_cache;
 
@@ -477,6 +478,7 @@ xfs_trans_apply_sb_deltas(
 {
 	struct xfs_dsb	*sbp;
 	struct xfs_buf	*bp;
+	bool		update_rtsb = false;
 	int		whole = 0;
 
 	bp = xfs_trans_getsb(tp);
@@ -537,22 +539,27 @@ xfs_trans_apply_sb_deltas(
 	if (tp->t_rextsize_delta) {
 		be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
 		whole = 1;
+		update_rtsb = true;
 	}
 	if (tp->t_rbmblocks_delta) {
 		be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
 		whole = 1;
+		update_rtsb = true;
 	}
 	if (tp->t_rblocks_delta) {
 		be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
 		whole = 1;
+		update_rtsb = true;
 	}
 	if (tp->t_rextents_delta) {
 		be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
 		whole = 1;
+		update_rtsb = true;
 	}
 	if (tp->t_rextslog_delta) {
 		sbp->sb_rextslog += tp->t_rextslog_delta;
 		whole = 1;
+		update_rtsb = true;
 	}
 
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@@ -569,6 +576,9 @@ xfs_trans_apply_sb_deltas(
 		xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
 				  offsetof(struct xfs_dsb, sb_frextents) +
 				  sizeof(sbp->sb_frextents) - 1);
+
+	if (update_rtsb)
+		xfs_rtgroup_log_super(tp, bp);
 }
 
 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3cffc3cb41814..f3ddb05f3bcc6 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -212,6 +212,7 @@ xfs_trans_read_buf(
 }
 
 struct xfs_buf	*xfs_trans_getsb(struct xfs_trans *);
+struct xfs_buf	*xfs_trans_getrtsb(struct xfs_trans *tp);
 
 void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index e28ab74af4f0e..8e886ecfd69a3 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -168,12 +168,11 @@ xfs_trans_get_buf_map(
 /*
  * Get and lock the superblock buffer for the given transaction.
  */
-struct xfs_buf *
-xfs_trans_getsb(
-	struct xfs_trans	*tp)
+static struct xfs_buf *
+__xfs_trans_getsb(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
 {
-	struct xfs_buf		*bp = tp->t_mountp->m_sb_bp;
-
 	/*
 	 * Just increment the lock recursion count if the buffer is already
 	 * attached to this transaction.
@@ -197,6 +196,22 @@ xfs_trans_getsb(
 	return bp;
 }
 
+struct xfs_buf *
+xfs_trans_getsb(
+	struct xfs_trans	*tp)
+{
+	return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp);
+}
+
+struct xfs_buf *
+xfs_trans_getrtsb(
+	struct xfs_trans	*tp)
+{
+	if (!tp->t_mountp->m_rtsb_bp)
+		return NULL;
+	return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp);
+}
+
 /*
  * Get and lock the buffer for the caller if it is not already
  * locked within the given transaction.  If it has not yet been


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/24] xfs: write secondary realtime superblocks to disk
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:16   ` [PATCH 05/24] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
@ 2023-12-31 21:17   ` Darrick J. Wong
  2023-12-31 21:17   ` [PATCH 07/24] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
                     ` (17 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create some library functions to make it easy to update all the
secondary realtime superblocks on disk; this will be used by growfs,
xfs_db, mkfs, and xfs_repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtgroup.c |  117 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |    2 +
 2 files changed, 119 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 325e4eb8f8781..173297f582e00 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -419,3 +419,120 @@ xfs_rtgroup_log_super(
 	xfs_rtgroup_update_super(rtsb_bp, sb_bp);
 	xfs_trans_ordered_buf(tp, rtsb_bp);
 }
+
+/* Initialize a secondary realtime superblock. */
+static int
+xfs_rtgroup_init_secondary_super(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_buf		*bp;
+	struct xfs_rtsb		*rsb;
+	xfs_rtblock_t		rtbno;
+	int			error;
+
+	ASSERT(rgno != 0);
+
+	error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
+			0, &bp);
+	if (error)
+		return error;
+
+	rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+	bp->b_maps[0].bm_bn = xfs_rtb_to_daddr(mp, rtbno);
+	bp->b_ops = &xfs_rtsb_buf_ops;
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+
+	rsb = bp->b_addr;
+	rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+	rsb->rsb_blocksize = cpu_to_be32(mp->m_sb.sb_blocksize);
+	rsb->rsb_rblocks = cpu_to_be64(mp->m_sb.sb_rblocks);
+
+	rsb->rsb_rextents = cpu_to_be64(mp->m_sb.sb_rextents);
+
+	memcpy(&rsb->rsb_uuid, &mp->m_sb.sb_uuid, sizeof(rsb->rsb_uuid));
+
+	rsb->rsb_rgcount = cpu_to_be32(mp->m_sb.sb_rgcount);
+	memcpy(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX);
+
+	rsb->rsb_rextsize = cpu_to_be32(mp->m_sb.sb_rextsize);
+	rsb->rsb_rbmblocks = cpu_to_be32(mp->m_sb.sb_rbmblocks);
+
+	rsb->rsb_rgblocks = cpu_to_be32(mp->m_sb.sb_rgblocks);
+	rsb->rsb_blocklog = mp->m_sb.sb_blocklog;
+	rsb->rsb_sectlog = mp->m_sb.sb_sectlog;
+	rsb->rsb_rextslog = mp->m_sb.sb_rextslog;
+
+	memcpy(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid,
+			sizeof(rsb->rsb_meta_uuid));
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Update all the realtime superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The rt super buffers do not need to be kept them in memory once they are
+ * written so we mark them as a one-shot buffer.
+ */
+int
+xfs_rtgroup_update_secondary_sbs(
+	struct xfs_mount	*mp)
+{
+	LIST_HEAD		(buffer_list);
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		start_rgno = 1;
+	int			saved_error = 0;
+	int			error = 0;
+
+	for_each_rtgroup_from(mp, start_rgno, rtg) {
+		struct xfs_buf		*bp;
+
+		error = xfs_rtgroup_init_secondary_super(mp, rtg->rtg_rgno,
+				&bp);
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue.  If we break early, we'll leave more superblocks
+		 * un-updated than updated.
+		 */
+		if (error) {
+			xfs_warn(mp,
+		"error allocating secondary superblock for rt group %d",
+				rtg->rtg_rgno);
+			if (!saved_error)
+				saved_error = error;
+			continue;
+		}
+
+		xfs_buf_oneshot(bp);
+		xfs_buf_delwri_queue(bp, &buffer_list);
+		xfs_buf_relse(bp);
+
+		/* don't hold too many buffers at once */
+		if (rtg->rtg_rgno % 16)
+			continue;
+
+		error = xfs_buf_delwri_submit(&buffer_list);
+		if (error) {
+			xfs_warn(mp,
+	"write error %d updating a secondary superblock near rt group %u",
+				error, rtg->rtg_rgno);
+			if (!saved_error)
+				saved_error = error;
+			continue;
+		}
+	}
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error) {
+		xfs_warn(mp,
+	"write error %d updating a secondary superblock near rt group %u",
+			error, start_rgno);
+	}
+
+	return saved_error ? saved_error : error;
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 83bbd43f7271a..2d0422c6712da 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -212,10 +212,12 @@ xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
+int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
+# define xfs_rtgroup_update_secondary_sbs(mp)	(0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/24] xfs: grow the realtime section when realtime groups are enabled
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:17   ` [PATCH 06/24] xfs: write secondary realtime superblocks to disk Darrick J. Wong
@ 2023-12-31 21:17   ` Darrick J. Wong
  2023-12-31 21:17   ` [PATCH 08/24] xfs: always update secondary rt supers when we update secondary fs supers Darrick J. Wong
                     ` (16 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable growing the rt section when realtime groups are enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_shared.h |    1 
 fs/xfs/xfs_rtalloc.c       |  136 +++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trans.c         |   10 +++
 fs/xfs/xfs_trans.h         |    1 
 4 files changed, 144 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index f76d2789e1c2d..65691af0488e7 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -111,6 +111,7 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_TRANS_SB_RBLOCKS		0x00000800
 #define	XFS_TRANS_SB_REXTENTS		0x00001000
 #define	XFS_TRANS_SB_REXTSLOG		0x00002000
+#define XFS_TRANS_SB_RGCOUNT		0x00004000
 
 /*
  * Here we centralize the specification of XFS meta-data buffer reference count
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 430d63e5ba751..23a15700d3596 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -902,6 +902,88 @@ xfs_alloc_rsum_cache(
  * Visible (exported) functions.
  */
 
+static int
+xfs_growfs_rt_free_new(
+	struct xfs_mount	*mp,
+	struct xfs_rtalloc_args	*nargs,
+	xfs_rtbxlen_t		*freed_rtx)
+{
+	struct xfs_mount	*nmp = nargs->mp;
+	struct xfs_sb		*sbp = &mp->m_sb;
+	struct xfs_sb		*nsbp = &nmp->m_sb;
+	xfs_rtblock_t		rtbno, next_rtbno;
+	int			error = 0;
+
+	if (!xfs_has_rtgroups(mp)) {
+		*freed_rtx = nsbp->sb_rextents - sbp->sb_rextents;
+		return xfs_rtfree_range(nargs, sbp->sb_rextents, *freed_rtx);
+	}
+
+	*freed_rtx = 0;
+
+	rtbno = xfs_rtx_to_rtb(nmp, sbp->sb_rextents);
+	next_rtbno = xfs_rtx_to_rtb(nmp, nsbp->sb_rextents);
+	while (rtbno < next_rtbno) {
+		xfs_rtxnum_t	start_rtx, next_rtx;
+		xfs_rtblock_t	next_free_rtbno;
+		xfs_rgnumber_t	rgno;
+		xfs_rgblock_t	rgbno;
+
+		/*
+		 * Compute the first new extent that we want to free, being
+		 * careful to skip past a realtime superblock at the start of
+		 * the new region.
+		 */
+		rgbno = xfs_rtb_to_rgbno(nmp, rtbno, &rgno);
+		if (rgbno == 0) {
+			rtbno += nsbp->sb_rextsize;
+			if (rtbno >= next_rtbno)
+				break;
+		}
+
+		start_rtx = xfs_rtb_to_rtx(nmp, rtbno);
+
+		/*
+		 * Stop freeing either at the end of the new rt section or at
+		 * the start of the next realtime group.
+		 */
+		next_free_rtbno = xfs_rgbno_to_rtb(nmp, rgno + 1, 0);
+		next_rtx = xfs_rtb_to_rtx(nmp, next_free_rtbno);
+		next_rtx = min(next_rtx, nsbp->sb_rextents);
+
+		*freed_rtx += next_rtx - start_rtx;
+		error = xfs_rtfree_range(nargs, start_rtx,
+				next_rtx - start_rtx);
+		if (error)
+			break;
+
+		rtbno = next_free_rtbno;
+	}
+
+	return error;
+}
+
+static int
+xfs_growfs_rt_init_primary(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*rtsb_bp;
+	int			error;
+
+	error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
+			0, &rtsb_bp);
+	if (error)
+		return error;
+
+	rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR;
+	rtsb_bp->b_ops = &xfs_rtsb_buf_ops;
+
+	xfs_rtgroup_update_super(rtsb_bp, mp->m_sb_bp);
+	mp->m_rtsb_bp = rtsb_bp;
+	xfs_buf_unlock(rtsb_bp);
+	return 0;
+}
+
 /*
  * Grow the realtime area of the filesystem.
  */
@@ -992,6 +1074,35 @@ xfs_growfs_rt(
 	 */
 	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
 		return -EINVAL;
+
+	/* Allocate the new rt group structures */
+	if (xfs_has_rtgroups(mp)) {
+		uint64_t	new_rgcount;
+
+		new_rgcount = howmany_64(nrblocks, mp->m_sb.sb_rgblocks);
+		if (new_rgcount > XFS_MAX_RGNUMBER)
+			return -EINVAL;
+
+		/*
+		 * We don't support changing the group size to match the extent
+		 * size, even if the size of the rt section is currently zero.
+		 */
+		if (mp->m_sb.sb_rgblocks % in->extsize != 0)
+			return -EOPNOTSUPP;
+
+		if (mp->m_sb.sb_rblocks == 0) {
+			error = xfs_growfs_rt_init_primary(mp);
+			if (error)
+				return error;
+		}
+
+		if (new_rgcount > mp->m_sb.sb_rgcount) {
+			error = xfs_initialize_rtgroups(mp, new_rgcount);
+			if (error)
+				return error;
+		}
+	}
+
 	/*
 	 * Get the old block counts for bitmap and summary inodes.
 	 * These can't change since other growfs callers are locked out.
@@ -1033,7 +1144,10 @@ xfs_growfs_rt(
 			.mp		= nmp,
 		};
 		struct xfs_trans	*tp;
+		struct xfs_rtgroup	*rtg;
 		xfs_rfsblock_t		nrblocks_step;
+		xfs_rtbxlen_t		freed_rtx = 0;
+		xfs_rgnumber_t		last_rgno = mp->m_sb.sb_rgcount - 1;
 
 		*nmp = *mp;
 		nsbp = &nmp->m_sb;
@@ -1057,6 +1171,10 @@ xfs_growfs_rt(
 		/* recompute growfsrt reservation from new rsumsize */
 		xfs_trans_resv_calc(nmp, &nmp->m_resv);
 
+		if (xfs_has_rtgroups(mp))
+			nsbp->sb_rgcount = howmany_64(nsbp->sb_rblocks,
+						      nsbp->sb_rgblocks);
+
 		/*
 		 * Start a transaction, get the log reservation.
 		 */
@@ -1099,6 +1217,7 @@ xfs_growfs_rt(
 			if (error)
 				goto error_cancel;
 		}
+
 		/*
 		 * Update superblock fields.
 		 */
@@ -1117,11 +1236,13 @@ xfs_growfs_rt(
 		if (nsbp->sb_rextslog != sbp->sb_rextslog)
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
 				nsbp->sb_rextslog - sbp->sb_rextslog);
+		if (nsbp->sb_rgcount != sbp->sb_rgcount)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
+				nsbp->sb_rgcount - sbp->sb_rgcount);
 		/*
 		 * Free new extent.
 		 */
-		error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
-				nsbp->sb_rextents - sbp->sb_rextents);
+		error = xfs_growfs_rt_free_new(mp, &nargs, &freed_rtx);
 		xfs_rtbuf_cache_relse(&nargs);
 		if (error) {
 error_cancel:
@@ -1131,8 +1252,7 @@ xfs_growfs_rt(
 		/*
 		 * Mark more blocks free in the superblock.
 		 */
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
-			nsbp->sb_rextents - sbp->sb_rextents);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
 		/*
 		 * Update mp values into the real mp structure.
 		 */
@@ -1145,6 +1265,10 @@ xfs_growfs_rt(
 		if (error)
 			break;
 
+		for_each_rtgroup_from(mp, last_rgno, rtg)
+			rtg->rtg_blockcount = xfs_rtgroup_block_count(mp,
+								rtg->rtg_rgno);
+
 		/* Ensure the mount RT feature flag is now set. */
 		mp->m_features |= XFS_FEAT_REALTIME;
 	}
@@ -1153,6 +1277,10 @@ xfs_growfs_rt(
 
 	/* Update secondary superblocks now the physical grow has completed */
 	error = xfs_update_secondary_sbs(mp);
+	if (error)
+		goto out_free;
+
+	error = xfs_rtgroup_update_secondary_sbs(mp);
 
 out_free:
 	/*
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fab625f64189f..1eaf32422bf66 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -456,6 +456,10 @@ xfs_trans_mod_sb(
 	case XFS_TRANS_SB_REXTSLOG:
 		tp->t_rextslog_delta += delta;
 		break;
+	case XFS_TRANS_SB_RGCOUNT:
+		ASSERT(delta > 0);
+		tp->t_rgcount_delta += delta;
+		break;
 	default:
 		ASSERT(0);
 		return;
@@ -561,6 +565,11 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 		update_rtsb = true;
 	}
+	if (tp->t_rgcount_delta) {
+		be32_add_cpu(&sbp->sb_rgcount, tp->t_rgcount_delta);
+		whole = 1;
+		update_rtsb = true;
+	}
 
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	if (whole)
@@ -674,6 +683,7 @@ xfs_trans_unreserve_and_mod_sb(
 	mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
 	mp->m_sb.sb_rextents += tp->t_rextents_delta;
 	mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
+	mp->m_sb.sb_rgcount += tp->t_rgcount_delta;
 	spin_unlock(&mp->m_sb_lock);
 
 	/*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f3ddb05f3bcc6..2046ee06fe88f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -146,6 +146,7 @@ typedef struct xfs_trans {
 	int64_t			t_rblocks_delta;/* superblock rblocks change */
 	int64_t			t_rextents_delta;/* superblocks rextents chg */
 	int64_t			t_rextslog_delta;/* superblocks rextslog chg */
+	int64_t			t_rgcount_delta; /* realtime group count */
 	struct list_head	t_items;	/* log item descriptors */
 	struct list_head	t_busy;		/* list of busy extents */
 	struct list_head	t_dfops;	/* deferred operations */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/24] xfs: always update secondary rt supers when we update secondary fs supers
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:17   ` [PATCH 07/24] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
@ 2023-12-31 21:17   ` Darrick J. Wong
  2023-12-31 21:17   ` [PATCH 09/24] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
                     ` (15 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that any update to the secondary superblocks in the data
section are also echoed to the secondary superblocks in the realtime
section.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_fsops.c |    4 ++++
 fs/xfs/xfs_ioctl.c |    3 +++
 2 files changed, 7 insertions(+)


diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 80811d16dde00..84b311bd185af 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -21,6 +21,7 @@
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 #include "xfs_trace.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Write new AG headers to disk. Non-transactional, but need to be
@@ -319,6 +320,9 @@ xfs_growfs_data(
 
 	/* Update secondary superblocks now the physical grow has completed */
 	error = xfs_update_secondary_sbs(mp);
+	if (error)
+		goto out_error;
+	error = xfs_rtgroup_update_secondary_sbs(mp);
 
 out_error:
 	/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a887c1d5d69fb..b140f7ed916c9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_xchgrange.h"
 #include "xfs_file.h"
+#include "xfs_rtgroup.h"
 
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -1874,6 +1875,8 @@ xfs_ioc_setlabel(
 	 */
 	mutex_lock(&mp->m_growlock);
 	error = xfs_update_secondary_sbs(mp);
+	if (!error)
+		error = xfs_rtgroup_update_secondary_sbs(mp);
 	mutex_unlock(&mp->m_growlock);
 
 	invalidate_bdev(bdev);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/24] xfs: export realtime group geometry via XFS_FSOP_GEOM
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:17   ` [PATCH 08/24] xfs: always update secondary rt supers when we update secondary fs supers Darrick J. Wong
@ 2023-12-31 21:17   ` Darrick J. Wong
  2023-12-31 21:18   ` [PATCH 10/24] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
                     ` (14 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Export the realtime geometry information so that userspace can query it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |    4 +++-
 fs/xfs/libxfs/xfs_sb.c |    5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index a0efcbde5ae60..bae9ef924bf59 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -186,7 +186,9 @@ struct xfs_fsop_geom {
 	__u32		logsunit;	/* log stripe unit, bytes	*/
 	uint32_t	sick;		/* o: unhealthy fs & rt metadata */
 	uint32_t	checked;	/* o: checked fs & rt metadata	*/
-	__u64		reserved[17];	/* reserved space		*/
+	__u32		rgblocks;	/* rtblocks in a realtime group	*/
+	__u32		rgcount;	/* number of realtime groups	*/
+	__u64		reserved[16];	/* reserved space		*/
 };
 
 #define XFS_FSOP_GEOM_SICK_COUNTERS	(1 << 0)  /* summary counters */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b37924cf67a93..cf94417aa9faa 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1411,6 +1411,11 @@ xfs_fs_geometry(
 		return;
 
 	geo->version = XFS_FSOP_GEOM_VERSION_V5;
+
+	if (xfs_has_rtgroups(mp)) {
+		geo->rgcount = sbp->sb_rgcount;
+		geo->rgblocks = sbp->sb_rgblocks;
+	}
 }
 
 /* Read a secondary superblock. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/24] xfs: check that rtblock extents do not overlap with the rt group metadata
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:17   ` [PATCH 09/24] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
@ 2023-12-31 21:18   ` Darrick J. Wong
  2023-12-31 21:18   ` [PATCH 11/24] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
                     ` (13 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:18 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The ondisk format specifies that the start of each realtime group must
have a superblock so that rt space mappings never cross an rtgroup
boundary.  Check that rt block pointers obey this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_types.c |   46 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index b1fa715e5f398..34d02b2bfdd16 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -13,6 +13,8 @@
 #include "xfs_mount.h"
 #include "xfs_ag.h"
 #include "xfs_imeta.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 
 
 /*
@@ -133,6 +135,26 @@ xfs_verify_dir_ino(
 	return xfs_verify_ino(mp, ino);
 }
 
+/*
+ * Verify that an rtgroup block number pointer neither points outside the
+ * rtgroup nor points at static metadata.
+ */
+static inline bool
+xfs_verify_rgno_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno)
+{
+	xfs_rgblock_t		eorg;
+
+	eorg = xfs_rtgroup_block_count(mp, rgno);
+	if (rgbno >= eorg)
+		return false;
+	if (rgbno < mp->m_sb.sb_rextsize)
+		return false;
+	return true;
+}
+
 /*
  * Verify that an realtime block number pointer doesn't point off the
  * end of the realtime device.
@@ -142,7 +164,20 @@ xfs_verify_rtbno(
 	struct xfs_mount	*mp,
 	xfs_rtblock_t		rtbno)
 {
-	return rtbno < mp->m_sb.sb_rblocks;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno;
+
+	if (rtbno >= mp->m_sb.sb_rblocks)
+		return false;
+
+	if (!xfs_has_rtgroups(mp))
+		return true;
+
+	rgbno = xfs_rtb_to_rgbno(mp, rtbno, &rgno);
+	if (rgno >= mp->m_sb.sb_rgcount)
+		return false;
+
+	return xfs_verify_rgno_rgbno(mp, rgno, rgbno);
 }
 
 /* Verify that a realtime device extent is fully contained inside the volume. */
@@ -158,7 +193,14 @@ xfs_verify_rtbext(
 	if (!xfs_verify_rtbno(mp, rtbno))
 		return false;
 
-	return xfs_verify_rtbno(mp, rtbno + len - 1);
+	if (!xfs_verify_rtbno(mp, rtbno + len - 1))
+		return false;
+
+	if (xfs_has_rtgroups(mp) &&
+	    xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1))
+		return false;
+
+	return true;
 }
 
 /* Calculate the range of valid icount values. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/24] xfs: add frextents to the lazysbcounters when rtgroups enabled
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:18   ` [PATCH 10/24] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
@ 2023-12-31 21:18   ` Darrick J. Wong
  2023-12-31 21:18   ` [PATCH 12/24] xfs: record rt group superblock errors in the health system Darrick J. Wong
                     ` (12 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:18 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the free rt extent count a part of the lazy sb counters when the
realtime groups feature is enabled.  This is possible because the patch
to recompute frextents from the rtbitmap during log recovery predates
the code adding rtgroup support, hence we know that the value will
always be correct during runtime.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_sb.c |    5 +++++
 fs/xfs/xfs_trans.c     |   18 +++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index cf94417aa9faa..da20189bbe199 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1150,6 +1150,9 @@ xfs_log_sb(
 	 * sb counters, despite having a percpu counter. It is always kept
 	 * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
 	 * and hence we don't need have to update it here.
+	 *
+	 * sb_frextents was added to the lazy sb counters when the rt groups
+	 * feature was introduced.
 	 */
 	if (xfs_has_lazysbcount(mp)) {
 		mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1158,6 +1161,8 @@ xfs_log_sb(
 				mp->m_sb.sb_icount);
 		mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
 	}
+	if (xfs_has_rtgroups(mp))
+		mp->m_sb.sb_frextents = percpu_counter_sum(&mp->m_frextents);
 
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1eaf32422bf66..d4952be8f2498 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -421,6 +421,8 @@ xfs_trans_mod_sb(
 			ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
 		}
 		tp->t_frextents_delta += delta;
+		if (xfs_has_rtgroups(mp))
+			flags &= ~XFS_TRANS_SB_DIRTY;
 		break;
 	case XFS_TRANS_SB_RES_FREXTENTS:
 		/*
@@ -515,8 +517,14 @@ xfs_trans_apply_sb_deltas(
 	 *
 	 * Don't touch m_frextents because it includes incore reservations,
 	 * and those are handled by the unreserve function.
+	 *
+	 * sb_frextents was added to the lazy sb counters when the rt groups
+	 * feature was introduced.  This is possible because we know that all
+	 * kernels supporting rtgroups will also recompute frextents from the
+	 * realtime bitmap.
 	 */
-	if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+	if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
+	    !xfs_has_rtgroups(tp->t_mountp)) {
 		struct xfs_mount	*mp = tp->t_mountp;
 		int64_t			rtxdelta;
 
@@ -630,7 +638,8 @@ xfs_trans_unreserve_and_mod_sb(
 	if (tp->t_rtx_res > 0)
 		rtxdelta = tp->t_rtx_res;
 	if ((tp->t_frextents_delta != 0) &&
-	    (tp->t_flags & XFS_TRANS_SB_DIRTY))
+	    (xfs_has_rtgroups(mp) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
 		rtxdelta += tp->t_frextents_delta;
 
 	if (xfs_has_lazysbcount(mp) ||
@@ -669,8 +678,11 @@ xfs_trans_unreserve_and_mod_sb(
 	 * Do not touch sb_frextents here because we are dealing with incore
 	 * reservation.  sb_frextents is not part of the lazy sb counters so it
 	 * must be consistent with the ondisk rtbitmap and must never include
-	 * incore reservations.
+	 * incore reservations.  sb_frextents was added to the lazy sb counters
+	 * when the realtime groups feature was introduced.
 	 */
+	if (xfs_has_rtgroups(mp))
+		mp->m_sb.sb_frextents += rtxdelta;
 	mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
 	mp->m_sb.sb_agcount += tp->t_agcount_delta;
 	mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/24] xfs: record rt group superblock errors in the health system
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:18   ` [PATCH 11/24] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
@ 2023-12-31 21:18   ` Darrick J. Wong
  2023-12-31 21:19   ` [PATCH 13/24] xfs: define locking primitives for realtime groups Darrick J. Wong
                     ` (11 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:18 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Record the state of per-rtgroup metadata sickness in the rtgroup
structure for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_health.h  |   28 ++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |    8 ++++
 fs/xfs/scrub/health.c       |   24 ++++++++++++
 fs/xfs/xfs_health.c         |   86 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h          |   25 +++++++++++++
 5 files changed, 170 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 1816c67351ac8..f5449a804c6c8 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -52,6 +52,7 @@ struct xfs_inode;
 struct xfs_fsop_geom;
 struct xfs_btree_cur;
 struct xfs_da_args;
+struct xfs_rtgroup;
 
 /* Observable health issues for metadata spanning the entire filesystem. */
 #define XFS_SICK_FS_COUNTERS	(1 << 0)  /* summary counters */
@@ -66,6 +67,7 @@ struct xfs_da_args;
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
+#define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -110,7 +112,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_METAPATH)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
-				 XFS_SICK_RT_SUMMARY)
+				 XFS_SICK_RT_SUMMARY | \
+				 XFS_SICK_RT_SUPER)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
@@ -192,6 +195,14 @@ void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
 void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
 		unsigned int *checked);
 
+void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		unsigned int mask);
+void xfs_rtgroup_mark_sick(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_mark_checked(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_mark_healthy(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_measure_sickness(struct xfs_rtgroup *rtg, unsigned int *sick,
+		unsigned int *checked);
+
 void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
 		unsigned int mask);
 void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
@@ -241,6 +252,15 @@ xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
 	return sick & mask;
 }
 
+static inline bool
+xfs_rtgroup_has_sickness(struct xfs_rtgroup *rtg, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
+	return sick & mask;
+}
+
 static inline bool
 xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
 {
@@ -262,6 +282,12 @@ xfs_rt_is_healthy(struct xfs_mount *mp)
 	return !xfs_rt_has_sickness(mp, -1U);
 }
 
+static inline bool
+xfs_rtgroup_is_healthy(struct xfs_rtgroup *rtg)
+{
+	return !xfs_rtgroup_has_sickness(rtg, -1U);
+}
+
 static inline bool
 xfs_ag_is_healthy(struct xfs_perag *pag)
 {
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 2d0422c6712da..c3f4f644ea56b 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -25,6 +25,14 @@ struct xfs_rtgroup {
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
+	/*
+	 * Bitsets of per-rtgroup metadata that have been checked and/or are
+	 * sick.  Callers should hold rtg_state_lock before accessing this
+	 * field.
+	 */
+	uint16_t		rtg_checked;
+	uint16_t		rtg_sick;
+
 #ifdef __KERNEL__
 	/* -- kernel only structures below this line -- */
 	spinlock_t		rtg_state_lock;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 4aae9a594cce5..063176c1f35eb 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -14,6 +14,7 @@
 #include "xfs_mount.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/health.h"
 #include "scrub/common.h"
@@ -76,6 +77,7 @@ enum xchk_health_group {
 	XHG_RT,
 	XHG_AG,
 	XHG_INO,
+	XHG_RTGROUP,
 };
 
 struct xchk_health_map {
@@ -164,12 +166,16 @@ xchk_mark_all_healthy(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 
 	xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
 	xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
 	for_each_perag(mp, agno, pag)
 		xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+	for_each_rtgroup(mp, rgno, rtg)
+		xfs_rtgroup_mark_healthy(rtg, XFS_SICK_RT_INDIRECT);
 }
 
 /*
@@ -187,6 +193,7 @@ xchk_update_health(
 	struct xfs_scrub	*sc)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	bool			bad;
 
 	/*
@@ -249,6 +256,15 @@ xchk_update_health(
 		} else
 			xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
 		break;
+	case XHG_RTGROUP:
+		rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
+		if (bad) {
+			xfs_rtgroup_mark_sick(rtg, sc->sick_mask);
+			xfs_rtgroup_mark_checked(rtg, sc->sick_mask);
+		} else
+			xfs_rtgroup_mark_healthy(rtg, sc->sick_mask);
+		xfs_rtgroup_put(rtg);
+		break;
 	default:
 		ASSERT(0);
 		break;
@@ -336,7 +352,9 @@ xchk_health_record(
 {
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 
 	unsigned int		sick;
 	unsigned int		checked;
@@ -355,5 +373,11 @@ xchk_health_record(
 			xchk_set_corrupt(sc);
 	}
 
+	for_each_rtgroup(mp, rgno, rtg) {
+		xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
+		if (sick & XFS_SICK_RT_PRIMARY)
+			xchk_set_corrupt(sc);
+	}
+
 	return 0;
 }
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index b7aa33a4c9e06..1ec015663a6aa 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -18,6 +18,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Warn about metadata corruption that we detected but haven't fixed, and
@@ -29,7 +30,9 @@ xfs_health_unmount(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	unsigned int		sick = 0;
 	unsigned int		checked = 0;
 	bool			warn = false;
@@ -46,6 +49,15 @@ xfs_health_unmount(
 		}
 	}
 
+	/* Measure realtime group corruption levels. */
+	for_each_rtgroup(mp, rgno, rtg) {
+		xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
+		if (sick) {
+			trace_xfs_rtgroup_unfixed_corruption(rtg, sick);
+			warn = true;
+		}
+	}
+
 	/* Measure realtime volume corruption levels. */
 	xfs_rt_measure_sickness(mp, &sick, &checked);
 	if (sick) {
@@ -280,6 +292,80 @@ xfs_ag_measure_sickness(
 	spin_unlock(&pag->pag_state_lock);
 }
 
+/* Mark unhealthy per-rtgroup metadata given a raw rt group number. */
+void
+xfs_rgno_mark_sick(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	unsigned int		mask)
+{
+	struct xfs_rtgroup	*rtg = xfs_rtgroup_get(mp, rgno);
+
+	/* per-rtgroup structure not set up yet? */
+	if (!rtg)
+		return;
+
+	xfs_rtgroup_mark_sick(rtg, mask);
+	xfs_rtgroup_put(rtg);
+}
+
+/* Mark unhealthy per-rtgroup metadata. */
+void
+xfs_rtgroup_mark_sick(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+	trace_xfs_rtgroup_mark_sick(rtg, mask);
+
+	spin_lock(&rtg->rtg_state_lock);
+	rtg->rtg_sick |= mask;
+	spin_unlock(&rtg->rtg_state_lock);
+}
+
+/* Mark per-rtgroup metadata as having been checked. */
+void
+xfs_rtgroup_mark_checked(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+
+	spin_lock(&rtg->rtg_state_lock);
+	rtg->rtg_checked |= mask;
+	spin_unlock(&rtg->rtg_state_lock);
+}
+
+/* Mark per-rtgroup metadata ok. */
+void
+xfs_rtgroup_mark_healthy(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+	trace_xfs_rtgroup_mark_healthy(rtg, mask);
+
+	spin_lock(&rtg->rtg_state_lock);
+	rtg->rtg_sick &= ~mask;
+	if (!(rtg->rtg_sick & XFS_SICK_RT_PRIMARY))
+		rtg->rtg_sick &= ~XFS_SICK_RT_SECONDARY;
+	rtg->rtg_checked |= mask;
+	spin_unlock(&rtg->rtg_state_lock);
+}
+
+/* Sample which per-rtgroup metadata are unhealthy. */
+void
+xfs_rtgroup_measure_sickness(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&rtg->rtg_state_lock);
+	*sick = rtg->rtg_sick;
+	*checked = rtg->rtg_checked;
+	spin_unlock(&rtg->rtg_state_lock);
+}
+
 /* Mark the unhealthy parts of an inode. */
 void
 xfs_inode_mark_sick(
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 81c21000d4fea..d23566e841cba 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -4307,6 +4307,31 @@ DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
 
+DECLARE_EVENT_CLASS(xfs_rtgroup_corrupt_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned int flags),
+	TP_ARGS(rtg, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d rgno 0x%x flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno, __entry->flags)
+);
+#define DEFINE_RTGROUP_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_rtgroup_corrupt_class, name,	\
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned int flags), \
+	TP_ARGS(rtg, flags))
+DEFINE_RTGROUP_CORRUPT_EVENT(xfs_rtgroup_mark_sick);
+DEFINE_RTGROUP_CORRUPT_EVENT(xfs_rtgroup_mark_healthy);
+DEFINE_RTGROUP_CORRUPT_EVENT(xfs_rtgroup_unfixed_corruption);
+
 DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
 	TP_ARGS(ip, flags),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/24] xfs: define locking primitives for realtime groups
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:18   ` [PATCH 12/24] xfs: record rt group superblock errors in the health system Darrick J. Wong
@ 2023-12-31 21:19   ` Darrick J. Wong
  2023-12-31 21:19   ` [PATCH 14/24] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
                     ` (10 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:19 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define helper functions to lock all metadata inodes related to a
realtime group.  There's not much to look at now, but this will become
important when we add per-rtgroup metadata files and online fsck code
for them.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtgroup.c |   33 +++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |   14 ++++++++++++++
 2 files changed, 47 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 173297f582e00..3c86a560adfe3 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -536,3 +536,36 @@ xfs_rtgroup_update_secondary_sbs(
 
 	return saved_error ? saved_error : error;
 }
+
+/* Lock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_lock(
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
+		xfs_rtbitmap_lock(tp, rtg->rtg_mount);
+	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
+		xfs_rtbitmap_lock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+}
+
+/* Unlock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_unlock(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
+		xfs_rtbitmap_unlock(rtg->rtg_mount);
+	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
+		xfs_rtbitmap_unlock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index c3f4f644ea56b..70968cf700fab 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -221,11 +221,25 @@ void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in exclusive mode */
+#define XFS_RTGLOCK_BITMAP		(1U << 0)
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
+
+#define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
+				 XFS_RTGLOCK_BITMAP_SHARED)
+
+void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		unsigned int rtglock_flags);
+void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
+# define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
+# define xfs_rtgroup_unlock(rtg, gf)		((void)0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/24] xfs: export the geometry of realtime groups to userspace
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:19   ` [PATCH 13/24] xfs: define locking primitives for realtime groups Darrick J. Wong
@ 2023-12-31 21:19   ` Darrick J. Wong
  2023-12-31 21:19   ` [PATCH 15/24] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
                     ` (9 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:19 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an ioctl so that the kernel can report the status of realtime
groups to userspace.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h         |    1 +
 fs/xfs/libxfs/xfs_fs_staging.h |   17 +++++++++++++++++
 fs/xfs/libxfs/xfs_health.h     |    2 ++
 fs/xfs/libxfs/xfs_rtgroup.c    |   14 ++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h    |    4 ++++
 fs/xfs/xfs_health.c            |   28 ++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c             |   32 ++++++++++++++++++++++++++++++++
 7 files changed, 98 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index bae9ef924bf59..c5bf53c6a43ca 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -852,6 +852,7 @@ struct xfs_scrub_metadata {
 /*	XFS_IOC_SCRUBV_METADATA -- staging 60	   */
 #define XFS_IOC_AG_GEOMETRY	_IOWR('X', 61, struct xfs_ag_geometry)
 /*	XFS_IOC_GETPARENTS ---- staging 62         */
+/*	XFS_IOC_RTGROUP_GEOMETRY - staging 63	   */
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_fs_staging.h b/fs/xfs/libxfs/xfs_fs_staging.h
index 69d29f213af3a..1f57331487791 100644
--- a/fs/xfs/libxfs/xfs_fs_staging.h
+++ b/fs/xfs/libxfs/xfs_fs_staging.h
@@ -202,4 +202,21 @@ static inline size_t sizeof_xfs_scrub_vec(unsigned int nr)
 
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 60, struct xfs_scrub_vec_head)
 
+/*
+ * Output for XFS_IOC_RTGROUP_GEOMETRY
+ */
+struct xfs_rtgroup_geometry {
+	__u32 rg_number;	/* i/o: rtgroup number */
+	__u32 rg_length;	/* o: length in blocks */
+	__u32 rg_sick;		/* o: sick things in ag */
+	__u32 rg_checked;	/* o: checked metadata in ag */
+	__u32 rg_flags;		/* i/o: flags for this ag */
+	__u32 rg_pad;		/* o: zero */
+	__u64 rg_reserved[13];	/* o: zero */
+};
+#define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
+#define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
+
+#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
+
 #endif /* __XFS_FS_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index f5449a804c6c8..1e9938a417b42 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -302,6 +302,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip)
 
 void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
 void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg,
+		struct xfs_rtgroup_geometry *rgeo);
 void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
 
 #define xfs_metadata_is_sick(error) \
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 3c86a560adfe3..ed4f8aa67b158 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -569,3 +569,17 @@ xfs_rtgroup_unlock(
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
 		xfs_rtbitmap_unlock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
 }
+
+/* Retrieve rt group geometry. */
+int
+xfs_rtgroup_get_geometry(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rtgroup_geometry *rgeo)
+{
+	/* Fill out form. */
+	memset(rgeo, 0, sizeof(*rgeo));
+	rgeo->rg_number = rtg->rtg_rgno;
+	rgeo->rg_length = rtg->rtg_blockcount;
+	xfs_rtgroup_geom_health(rtg, rgeo);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 70968cf700fab..e6d60425faa4d 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -233,6 +233,9 @@ int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);
 void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+
+int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
+		struct xfs_rtgroup_geometry *rgeo);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
@@ -240,6 +243,7 @@ void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
 # define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
 # define xfs_rtgroup_unlock(rtg, gf)		((void)0)
+# define xfs_rtgroup_get_geometry(rtg, rgeo)	(-EOPNOTSUPP)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 1ec015663a6aa..b53ef95a37a54 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -529,6 +529,34 @@ xfs_ag_geom_health(
 	}
 }
 
+static const struct ioctl_sick_map rtgroup_map[] = {
+	{ XFS_SICK_RT_SUPER,	XFS_RTGROUP_GEOM_SICK_SUPER },
+	{ XFS_SICK_RT_BITMAP,	XFS_RTGROUP_GEOM_SICK_BITMAP },
+	{ 0, 0 },
+};
+
+/* Fill out rtgroup geometry health info. */
+void
+xfs_rtgroup_geom_health(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rtgroup_geometry *rgeo)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			sick;
+	unsigned int			checked;
+
+	rgeo->rg_sick = 0;
+	rgeo->rg_checked = 0;
+
+	xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
+	for (m = rtgroup_map; m->sick_mask; m++) {
+		if (checked & m->sick_mask)
+			rgeo->rg_checked |= m->ioctl_mask;
+		if (sick & m->sick_mask)
+			rgeo->rg_sick |= m->ioctl_mask;
+	}
+}
+
 static const struct ioctl_sick_map ino_map[] = {
 	{ XFS_SICK_INO_CORE,	XFS_BS_SICK_INODE },
 	{ XFS_SICK_INO_BMBTD,	XFS_BS_SICK_BMBTD },
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index b140f7ed916c9..32a52799ae826 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -995,6 +995,36 @@ xfs_ioc_ag_geometry(
 	return 0;
 }
 
+STATIC int
+xfs_ioc_rtgroup_geometry(
+	struct xfs_mount	*mp,
+	void			__user *arg)
+{
+	struct xfs_rtgroup	*rtg;
+	struct xfs_rtgroup_geometry rgeo;
+	int			error;
+
+	if (copy_from_user(&rgeo, arg, sizeof(rgeo)))
+		return -EFAULT;
+	if (rgeo.rg_flags || rgeo.rg_pad)
+		return -EINVAL;
+	if (memchr_inv(&rgeo.rg_reserved, 0, sizeof(rgeo.rg_reserved)))
+		return -EINVAL;
+
+	rtg = xfs_rtgroup_get(mp, rgeo.rg_number);
+	if (!rtg)
+		return -EINVAL;
+
+	error = xfs_rtgroup_get_geometry(rtg, &rgeo);
+	xfs_rtgroup_put(rtg);
+	if (error)
+		return error;
+
+	if (copy_to_user(arg, &rgeo, sizeof(rgeo)))
+		return -EFAULT;
+	return 0;
+}
+
 /*
  * Linux extended inode flags interface.
  */
@@ -2081,6 +2111,8 @@ xfs_file_ioctl(
 
 	case XFS_IOC_AG_GEOMETRY:
 		return xfs_ioc_ag_geometry(mp, arg);
+	case XFS_IOC_RTGROUP_GEOMETRY:
+		return xfs_ioc_rtgroup_geometry(mp, arg);
 
 	case XFS_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *)arg);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/24] xfs: add block headers to realtime bitmap blocks
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 21:19   ` [PATCH 14/24] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
@ 2023-12-31 21:19   ` Darrick J. Wong
  2023-12-31 21:19   ` [PATCH 16/24] xfs: encode the rtbitmap in little endian format Darrick J. Wong
                     ` (8 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:19 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Upgrade rtbitmap blocks to have self describing metadata like most every
other thing in XFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h    |   14 +++++
 fs/xfs/libxfs/xfs_ondisk.h    |    1 
 fs/xfs/libxfs/xfs_rtbitmap.c  |  108 ++++++++++++++++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_rtbitmap.h  |   33 ++++++++++++-
 fs/xfs/libxfs/xfs_sb.c        |   18 +++++--
 fs/xfs/libxfs/xfs_shared.h    |    1 
 fs/xfs/xfs_buf_item_recover.c |   20 +++++++-
 fs/xfs/xfs_mount.h            |    3 +
 fs/xfs/xfs_rtalloc.c          |   60 ++++++++++++++++-------
 9 files changed, 224 insertions(+), 34 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 43e66740e2aec..7178bd463c1c0 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1300,6 +1300,20 @@ static inline bool xfs_dinode_has_large_extent_counts(
 /*
  * RT bit manipulation macros.
  */
+#define XFS_RTBITMAP_MAGIC	0x424D505A	/* BMPZ */
+
+struct xfs_rtbuf_blkinfo {
+	__be32		rt_magic;	/* validity check on block */
+	__be32		rt_crc;		/* CRC of block */
+	__be64		rt_owner;	/* inode that owns the block */
+	__be64		rt_blkno;	/* first block of the buffer */
+	__be64		rt_lsn;		/* sequence number of last write */
+	uuid_t		rt_uuid;	/* filesystem we belong to */
+};
+
+#define XFS_RTBUF_CRC_OFF \
+	offsetof(struct xfs_rtbuf_blkinfo, rt_crc)
+
 #define	XFS_RTMIN(a,b)	((a) < (b) ? (a) : (b))
 #define	XFS_RTMAX(a,b)	((a) > (b) ? (a) : (b))
 
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 65219d4cf99ca..70b96efa26973 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -76,6 +76,7 @@ xfs_check_ondisk_structs(void)
 	/* realtime structures */
 	XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index a8e5c702a7515..4d85b8ea9a861 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -18,28 +18,84 @@
 #include "xfs_error.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_health.h"
+#include "xfs_log.h"
+#include "xfs_buf_item.h"
 
 /*
  * Realtime allocator bitmap functions shared with userspace.
  */
 
-/*
- * Real time buffers need verifiers to avoid runtime warnings during IO.
- * We don't have anything to verify, however, so these are just dummy
- * operations.
- */
+static xfs_failaddr_t
+xfs_rtbuf_verify(
+	struct xfs_buf			*bp)
+{
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+	if (!xfs_verify_magic(bp, hdr->rt_magic))
+		return __this_address;
+	if (!xfs_has_rtgroups(mp))
+		return __this_address;
+	if (!xfs_has_crc(mp))
+		return __this_address;
+	if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid))
+		return __this_address;
+	if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+		return __this_address;
+	return NULL;
+}
+
 static void
 xfs_rtbuf_verify_read(
-	struct xfs_buf	*bp)
+	struct xfs_buf			*bp)
 {
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+	xfs_failaddr_t			fa;
+
+	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+		return;
+
+	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
+		fa = __this_address;
+		goto fail;
+	}
+
+	if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) {
+		fa = __this_address;
+		goto fail;
+	}
+
+	fa = xfs_rtbuf_verify(bp);
+	if (fa)
+		goto fail;
+
 	return;
+fail:
+	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 }
 
 static void
 xfs_rtbuf_verify_write(
 	struct xfs_buf	*bp)
 {
-	return;
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+	struct xfs_buf_log_item		*bip = bp->b_log_item;
+	xfs_failaddr_t			fa;
+
+	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+		return;
+
+	fa = xfs_rtbuf_verify(bp);
+	if (fa) {
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+
+	if (bip)
+		hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_rtbuf_ops = {
@@ -48,6 +104,14 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
 	.verify_write = xfs_rtbuf_verify_write,
 };
 
+const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
+	.name		= "xfs_rtbitmap",
+	.magic		= { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) },
+	.verify_read	= xfs_rtbuf_verify_read,
+	.verify_write	= xfs_rtbuf_verify_write,
+	.verify_struct	= xfs_rtbuf_verify,
+};
+
 /* Release cached rt bitmap and summary buffers. */
 void
 xfs_rtbuf_cache_relse(
@@ -125,13 +189,26 @@ xfs_rtbuf_get(
 	ASSERT(map.br_startblock != NULLFSBLOCK);
 	error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
 				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-				   mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+				   mp->m_bsize, 0, &bp,
+				   xfs_rtblock_ops(mp, issum));
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
 					     XFS_SICK_RT_BITMAP);
 	if (error)
 		return error;
 
+	if (xfs_has_rtgroups(mp) && !issum) {
+		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+		if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
+			xfs_buf_mark_corrupt(bp);
+			xfs_trans_brelse(args->tp, bp);
+			xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+						     XFS_SICK_RT_BITMAP);
+			return -EFSCORRUPTED;
+		}
+	}
+
 	xfs_trans_buf_set_type(args->tp, bp, type);
 	*cbpp = bp;
 	*coffp = block;
@@ -1125,6 +1202,19 @@ xfs_rtalloc_extent_is_free(
 	return 0;
 }
 
+/* Compute the number of rt extents tracked by a single bitmap block. */
+xfs_rtxnum_t
+xfs_rtbitmap_rtx_per_rbmblock(
+	struct xfs_mount	*mp)
+{
+	unsigned int		rbmblock_bytes = mp->m_sb.sb_blocksize;
+
+	if (xfs_has_rtgroups(mp))
+		rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+	return rbmblock_bytes * NBBY;
+}
+
 /*
  * Compute the number of rtbitmap blocks needed to track the given number of rt
  * extents.
@@ -1134,7 +1224,7 @@ xfs_rtbitmap_blockcount(
 	struct xfs_mount	*mp,
 	xfs_rtbxlen_t		rtextents)
 {
-	return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+	return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 3de0ec2d24123..e1cf47d2a9281 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -148,6 +148,9 @@ xfs_rtx_to_rbmblock(
 	struct xfs_mount	*mp,
 	xfs_rtxnum_t		rtx)
 {
+	if (xfs_has_rtgroups(mp))
+		return div_u64(rtx, mp->m_rtx_per_rbmblock);
+
 	return rtx >> mp->m_blkbit_log;
 }
 
@@ -157,6 +160,13 @@ xfs_rtx_to_rbmword(
 	struct xfs_mount	*mp,
 	xfs_rtxnum_t		rtx)
 {
+	if (xfs_has_rtgroups(mp)) {
+		unsigned int	mod;
+
+		div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod);
+		return mod;
+	}
+
 	return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
 }
 
@@ -166,6 +176,9 @@ xfs_rbmblock_to_rtx(
 	struct xfs_mount	*mp,
 	xfs_fileoff_t		rbmoff)
 {
+	if (xfs_has_rtgroups(mp))
+		return rbmoff * mp->m_rtx_per_rbmblock;
+
 	return rbmoff << mp->m_blkbit_log;
 }
 
@@ -175,7 +188,14 @@ xfs_rbmblock_wordptr(
 	struct xfs_rtalloc_args	*args,
 	unsigned int		index)
 {
-	union xfs_rtword_raw	*words = args->rbmbp->b_addr;
+	struct xfs_mount	*mp = args->mp;
+	union xfs_rtword_raw	*words;
+	struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr;
+
+	if (xfs_has_rtgroups(mp))
+		words = (union xfs_rtword_raw *)(hdr + 1);
+	else
+		words = args->rbmbp->b_addr;
 
 	return words + index;
 }
@@ -277,6 +297,16 @@ xfs_suminfo_add(
 	return info->old;
 }
 
+static inline const struct xfs_buf_ops *
+xfs_rtblock_ops(
+	struct xfs_mount	*mp,
+	bool			issum)
+{
+	if (xfs_has_rtgroups(mp) && !issum)
+		return &xfs_rtbitmap_buf_ops;
+	return &xfs_rtbuf_ops;
+}
+
 /*
  * Functions for walking free space rtextents in the realtime bitmap.
  */
@@ -365,6 +395,7 @@ xfs_validate_rtextents(
 	return true;
 }
 
+xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp);
 xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
 		rtextents);
 unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index da20189bbe199..8298334f4ee8d 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -577,10 +577,15 @@ xfs_validate_sb_common(
 	} else {
 		uint64_t	rexts;
 		uint64_t	rbmblocks;
+		unsigned int	rbmblock_bytes = sbp->sb_blocksize;
 
 		rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
-		rbmblocks = howmany_64(sbp->sb_rextents,
-				       NBBY * sbp->sb_blocksize);
+
+		if (xfs_sb_is_v5(sbp) &&
+		    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
+			rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+		rbmblocks = howmany_64(sbp->sb_rextents, NBBY * rbmblock_bytes);
 
 		if (!xfs_validate_rtextents(rexts) ||
 		    sbp->sb_rextents != rexts ||
@@ -1093,8 +1098,13 @@ xfs_sb_mount_common(
 	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
 	mp->m_blockmask = sbp->sb_blocksize - 1;
-	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
-	mp->m_blockwmask = mp->m_blockwsize - 1;
+	if (xfs_has_rtgroups(mp))
+		mp->m_blockwsize = (sbp->sb_blocksize -
+					sizeof(struct xfs_rtbuf_blkinfo)) >>
+					XFS_WORDLOG;
+	else
+		mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+	mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
 	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 65691af0488e7..f57788164a702 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 4be9a384dc6f7..448df616e975e 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -23,6 +23,7 @@
 #include "xfs_dir2.h"
 #include "xfs_quota.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * This is the number of entries in the l_buf_cancel_table used during
@@ -391,9 +392,15 @@ xlog_recover_validate_buf_type(
 		break;
 #ifdef CONFIG_XFS_RT
 	case XFS_BLFT_RTBITMAP_BUF:
+		if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) {
+			warnmsg = "Bad rtbitmap magic!";
+			break;
+		}
+		bp->b_ops = xfs_rtblock_ops(mp, false);
+		break;
 	case XFS_BLFT_RTSUMMARY_BUF:
 		/* no magic numbers for verification of RT buffers */
-		bp->b_ops = &xfs_rtbuf_ops;
+		bp->b_ops = xfs_rtblock_ops(mp, true);
 		break;
 #endif /* CONFIG_XFS_RT */
 	default:
@@ -728,11 +735,20 @@ xlog_recover_get_buf_lsn(
 	 * UUIDs, so we must recover them immediately.
 	 */
 	blft = xfs_blft_from_flags(buf_f);
-	if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+	if (!xfs_has_rtgroups(mp) && blft == XFS_BLFT_RTBITMAP_BUF)
+		goto recover_immediately;
+	if (blft == XFS_BLFT_RTSUMMARY_BUF)
 		goto recover_immediately;
 
 	magic32 = be32_to_cpu(*(__be32 *)blk);
 	switch (magic32) {
+	case XFS_RTBITMAP_MAGIC: {
+		struct xfs_rtbuf_blkinfo	*hdr = blk;
+
+		lsn = be64_to_cpu(hdr->rt_lsn);
+		uuid = &hdr->rt_uuid;
+		break;
+	}
 	case XFS_ABTB_CRC_MAGIC:
 	case XFS_ABTC_CRC_MAGIC:
 	case XFS_ABTB_MAGIC:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7e86aa137d1fa..14094b29ab6fe 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -125,7 +125,8 @@ typedef struct xfs_mount {
 	int8_t			m_rgblklog;	/* log2 of rt group sz if possible */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
-	uint			m_blockwmask;	/* blockwsize-1 */
+	/* number of rt extents per rt bitmap block if rtgroups enabled */
+	unsigned int		m_rtx_per_rbmblock;
 	uint			m_alloc_mxr[2];	/* max alloc btree records */
 	uint			m_alloc_mnr[2];	/* min alloc btree records */
 	uint			m_bmap_dmxr[2];	/* max bmap btree records */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 23a15700d3596..924266978ca27 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -761,6 +761,42 @@ xfs_rtallocate_extent_size(
 	return 0;
 }
 
+/* Get a buffer for the block. */
+static int
+xfs_growfs_init_rtbuf(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fsblock_t		fsbno,
+	enum xfs_blft		buf_type)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	int			error;
+
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0,
+			&bp);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(tp, bp, buf_type);
+	bp->b_ops = xfs_rtblock_ops(mp, buf_type == XFS_BLFT_RTSUMMARY_BUF);
+	memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+
+	if (xfs_has_rtgroups(mp) && buf_type == XFS_BLFT_RTBITMAP_BUF) {
+		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+		hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+		hdr->rt_owner = cpu_to_be64(ip->i_ino);
+		hdr->rt_blkno = cpu_to_be64(d);
+		uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid);
+	}
+
+	xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+	return 0;
+}
+
 /*
  * Allocate space to the bitmap or summary file, and zero it, for growfs.
  */
@@ -772,8 +808,6 @@ xfs_growfs_rt_alloc(
 	struct xfs_inode	*ip)		/* inode (bitmap/summary) */
 {
 	xfs_fileoff_t		bno;		/* block number in file */
-	struct xfs_buf		*bp;	/* temporary buffer for zeroing */
-	xfs_daddr_t		d;		/* disk block address */
 	int			error;		/* error return value */
 	xfs_fsblock_t		fsbno;		/* filesystem block for bno */
 	struct xfs_bmbt_irec	map;		/* block map output */
@@ -848,19 +882,11 @@ xfs_growfs_rt_alloc(
 			 */
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			/*
-			 * Get a buffer for the block.
-			 */
-			d = XFS_FSB_TO_DADDR(mp, fsbno);
-			error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					mp->m_bsize, 0, &bp);
+
+			error = xfs_growfs_init_rtbuf(tp, ip, fsbno, buf_type);
 			if (error)
 				goto out_trans_cancel;
 
-			xfs_trans_buf_set_type(tp, bp, buf_type);
-			bp->b_ops = &xfs_rtbuf_ops;
-			memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
-			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 			/*
 			 * Commit the transaction.
 			 */
@@ -1133,10 +1159,10 @@ xfs_growfs_rt(
 	 * Skip the current block if it is exactly full.
 	 * This also deals with the case where there were no rtextents before.
 	 */
-	for (bmbno = sbp->sb_rbmblocks -
-		     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
-	     bmbno < nrbmblocks;
-	     bmbno++) {
+	bmbno = sbp->sb_rbmblocks;
+	if (xfs_rtx_to_rbmword(mp, sbp->sb_rextents) != 0)
+		bmbno--;
+	for (; bmbno < nrbmblocks; bmbno++) {
 		struct xfs_rtalloc_args	args = {
 			.mp		= mp,
 		};
@@ -1157,7 +1183,7 @@ xfs_growfs_rt(
 		nsbp->sb_rextsize = in->extsize;
 		nmp->m_rtxblklog = -1; /* don't use shift or masking */
 		nsbp->sb_rbmblocks = bmbno + 1;
-		nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
+		nrblocks_step = (bmbno + 1) * mp->m_rtx_per_rbmblock *
 				nsbp->sb_rextsize;
 		nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
 		nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/24] xfs: encode the rtbitmap in little endian format
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 21:19   ` [PATCH 15/24] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
@ 2023-12-31 21:19   ` Darrick J. Wong
  2023-12-31 21:20   ` [PATCH 17/24] xfs: add block headers to realtime summary blocks Darrick J. Wong
                     ` (7 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:19 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the ondisk realtime bitmap file is accessed in units of
32-bit words.  There's no endian translation of the contents of this
file, which means that the Bad Things Happen(tm) if you go from (say)
x86 to powerpc.  Since we have a new feature flag, let's take the
opportunity to enforce an endianness on the file.

The natural format of a bitmap is (IMHO) little endian, because the byte
offsets of the bitmap data should always increase in step with the
information being indexed.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h   |    4 +++-
 fs/xfs/libxfs/xfs_rtbitmap.h |    7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 7178bd463c1c0..77422fbd3372e 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -749,10 +749,12 @@ struct xfs_agfl {
 
 /*
  * Realtime bitmap information is accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format.  Starting with the realtime groups feature,
+ * the words are stored in le32 ondisk.
  */
 union xfs_rtword_raw {
 	__u32		old;
+	__le32		rtg;
 };
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index e1cf47d2a9281..588689e53ba39 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -208,6 +208,8 @@ xfs_rtbitmap_getword(
 {
 	union xfs_rtword_raw	*word = xfs_rbmblock_wordptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp))
+		return le32_to_cpu(word->rtg);
 	return word->old;
 }
 
@@ -220,7 +222,10 @@ xfs_rtbitmap_setword(
 {
 	union xfs_rtword_raw	*word = xfs_rbmblock_wordptr(args, index);
 
-	word->old = value;
+	if (xfs_has_rtgroups(args->mp))
+		word->rtg = cpu_to_le32(value);
+	else
+		word->old = value;
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/24] xfs: add block headers to realtime summary blocks
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 21:19   ` [PATCH 16/24] xfs: encode the rtbitmap in little endian format Darrick J. Wong
@ 2023-12-31 21:20   ` Darrick J. Wong
  2023-12-31 21:20   ` [PATCH 18/24] xfs: encode the rtsummary in big endian format Darrick J. Wong
                     ` (6 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:20 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Upgrade rtsummary blocks to have self describing metadata like most
every other thing in XFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h      |    1 +
 fs/xfs/libxfs/xfs_rtbitmap.c    |   18 +++++++++++++++---
 fs/xfs/libxfs/xfs_rtbitmap.h    |   19 +++++++++++++++++--
 fs/xfs/libxfs/xfs_shared.h      |    1 +
 fs/xfs/scrub/rtsummary_repair.c |   15 +++++++++++++--
 fs/xfs/xfs_buf_item_recover.c   |   11 +++++++----
 fs/xfs/xfs_rtalloc.c            |    7 +++++--
 7 files changed, 59 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 77422fbd3372e..f2c70e1027ed7 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1303,6 +1303,7 @@ static inline bool xfs_dinode_has_large_extent_counts(
  * RT bit manipulation macros.
  */
 #define XFS_RTBITMAP_MAGIC	0x424D505A	/* BMPZ */
+#define XFS_RTSUMMARY_MAGIC	0x53554D59	/* SUMY */
 
 struct xfs_rtbuf_blkinfo {
 	__be32		rt_magic;	/* validity check on block */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 4d85b8ea9a861..3530524b3fc2b 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -53,7 +53,7 @@ xfs_rtbuf_verify_read(
 	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
 	xfs_failaddr_t			fa;
 
-	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+	if (!xfs_has_rtgroups(mp))
 		return;
 
 	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
@@ -84,7 +84,7 @@ xfs_rtbuf_verify_write(
 	struct xfs_buf_log_item		*bip = bp->b_log_item;
 	xfs_failaddr_t			fa;
 
-	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+	if (!xfs_has_rtgroups(mp))
 		return;
 
 	fa = xfs_rtbuf_verify(bp);
@@ -112,6 +112,14 @@ const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
 	.verify_struct	= xfs_rtbuf_verify,
 };
 
+const struct xfs_buf_ops xfs_rtsummary_buf_ops = {
+	.name		= "xfs_rtsummary",
+	.magic		= { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) },
+	.verify_read	= xfs_rtbuf_verify_read,
+	.verify_write	= xfs_rtbuf_verify_write,
+	.verify_struct	= xfs_rtbuf_verify,
+};
+
 /* Release cached rt bitmap and summary buffers. */
 void
 xfs_rtbuf_cache_relse(
@@ -197,7 +205,7 @@ xfs_rtbuf_get(
 	if (error)
 		return error;
 
-	if (xfs_has_rtgroups(mp) && !issum) {
+	if (xfs_has_rtgroups(mp)) {
 		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
 
 		if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
@@ -1277,6 +1285,10 @@ xfs_rtsummary_blockcount(
 	unsigned long long	rsumwords;
 
 	rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+
+	if (xfs_has_rtgroups(mp))
+		return howmany_64(rsumwords, mp->m_blockwsize);
+
 	return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
 }
 
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 588689e53ba39..e8558db14b9b7 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -250,6 +250,9 @@ xfs_rtsumoffs_to_block(
 	struct xfs_mount	*mp,
 	xfs_rtsumoff_t		rsumoff)
 {
+	if (xfs_has_rtgroups(mp))
+		return rsumoff / mp->m_blockwsize;
+
 	return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
 }
 
@@ -264,6 +267,9 @@ xfs_rtsumoffs_to_infoword(
 {
 	unsigned int		mask = mp->m_blockmask >> XFS_SUMINFOLOG;
 
+	if (xfs_has_rtgroups(mp))
+		return rsumoff % mp->m_blockwsize;
+
 	return rsumoff & mask;
 }
 
@@ -273,7 +279,13 @@ xfs_rsumblock_infoptr(
 	struct xfs_rtalloc_args	*args,
 	unsigned int		index)
 {
-	union xfs_suminfo_raw	*info = args->sumbp->b_addr;
+	union xfs_suminfo_raw	*info;
+	struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr;
+
+	if (xfs_has_rtgroups(args->mp))
+		info = (union xfs_suminfo_raw *)(hdr + 1);
+	else
+		info = args->sumbp->b_addr;
 
 	return info + index;
 }
@@ -307,8 +319,11 @@ xfs_rtblock_ops(
 	struct xfs_mount	*mp,
 	bool			issum)
 {
-	if (xfs_has_rtgroups(mp) && !issum)
+	if (xfs_has_rtgroups(mp)) {
+		if (issum)
+			return &xfs_rtsummary_buf_ops;
 		return &xfs_rtbitmap_buf_ops;
+	}
 	return &xfs_rtbuf_ops;
 }
 
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index f57788164a702..8ad4b67d6febb 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
+extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index 058c5ebabf9a2..c66373eec436d 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -87,12 +87,23 @@ xrep_rtsummary_prep_buf(
 	ondisk = xfs_rsumblock_infoptr(&rts->args, 0);
 	rts->args.sumbp = NULL;
 
-	bp->b_ops = &xfs_rtbuf_ops;
-
 	error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize);
 	if (error)
 		return error;
 
+	if (xfs_has_rtgroups(sc->mp)) {
+		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+		hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+		hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+		hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+		hdr->rt_lsn = 0;
+		uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+		bp->b_ops = &xfs_rtsummary_buf_ops;
+	} else {
+		bp->b_ops = &xfs_rtbuf_ops;
+	}
+
 	rts->prep_wordoff += mp->m_blockwsize;
 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
 	return 0;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 448df616e975e..2e617041161e0 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -399,7 +399,10 @@ xlog_recover_validate_buf_type(
 		bp->b_ops = xfs_rtblock_ops(mp, false);
 		break;
 	case XFS_BLFT_RTSUMMARY_BUF:
-		/* no magic numbers for verification of RT buffers */
+		if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) {
+			warnmsg = "Bad rtsummary magic!";
+			break;
+		}
 		bp->b_ops = xfs_rtblock_ops(mp, true);
 		break;
 #endif /* CONFIG_XFS_RT */
@@ -735,13 +738,13 @@ xlog_recover_get_buf_lsn(
 	 * UUIDs, so we must recover them immediately.
 	 */
 	blft = xfs_blft_from_flags(buf_f);
-	if (!xfs_has_rtgroups(mp) && blft == XFS_BLFT_RTBITMAP_BUF)
-		goto recover_immediately;
-	if (blft == XFS_BLFT_RTSUMMARY_BUF)
+	if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF ||
+				      blft == XFS_BLFT_RTSUMMARY_BUF))
 		goto recover_immediately;
 
 	magic32 = be32_to_cpu(*(__be32 *)blk);
 	switch (magic32) {
+	case XFS_RTSUMMARY_MAGIC:
 	case XFS_RTBITMAP_MAGIC: {
 		struct xfs_rtbuf_blkinfo	*hdr = blk;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 924266978ca27..ff08c1d997fdb 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -784,10 +784,13 @@ xfs_growfs_init_rtbuf(
 	bp->b_ops = xfs_rtblock_ops(mp, buf_type == XFS_BLFT_RTSUMMARY_BUF);
 	memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
 
-	if (xfs_has_rtgroups(mp) && buf_type == XFS_BLFT_RTBITMAP_BUF) {
+	if (xfs_has_rtgroups(mp)) {
 		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
 
-		hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+		if (buf_type == XFS_BLFT_RTBITMAP_BUF)
+			hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+		else
+			hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
 		hdr->rt_owner = cpu_to_be64(ip->i_ino);
 		hdr->rt_blkno = cpu_to_be64(d);
 		uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/24] xfs: encode the rtsummary in big endian format
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 21:20   ` [PATCH 17/24] xfs: add block headers to realtime summary blocks Darrick J. Wong
@ 2023-12-31 21:20   ` Darrick J. Wong
  2023-12-31 21:20   ` [PATCH 19/24] xfs: store rtgroup information with a bmap intent Darrick J. Wong
                     ` (5 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:20 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the ondisk realtime summary file counters are accessed in
units of 32-bit words.  There's no endian translation of the contents of
this file, which means that the Bad Things Happen(tm) if you go from
(say) x86 to powerpc.  Since we have a new feature flag, let's take the
opportunity to enforce an endianness on the file.  Encode the summary
information in big endian format, like most of the rest of the
filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h   |    4 +++-
 fs/xfs/libxfs/xfs_rtbitmap.h |    7 +++++++
 fs/xfs/scrub/rtsummary.c     |    5 +++++
 3 files changed, 15 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f2c70e1027ed7..59ba13db53e7a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -759,10 +759,12 @@ union xfs_rtword_raw {
 
 /*
  * Realtime summary counts are accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format.  Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
  */
 union xfs_suminfo_raw {
 	__u32		old;
+	__be32		rtg;
 };
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index e8558db14b9b7..0bb63f77ede82 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -298,6 +298,8 @@ xfs_suminfo_get(
 {
 	union xfs_suminfo_raw	*info = xfs_rsumblock_infoptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp))
+		return be32_to_cpu(info->rtg);
 	return info->old;
 }
 
@@ -310,6 +312,11 @@ xfs_suminfo_add(
 {
 	union xfs_suminfo_raw	*info = xfs_rsumblock_infoptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp)) {
+		be32_add_cpu(&info->rtg, delta);
+		return be32_to_cpu(info->rtg);
+	}
+
 	info->old += delta;
 	return info->old;
 }
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index df9aa29cb94c6..5107873775b7f 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -142,6 +142,11 @@ xchk_rtsum_inc(
 	struct xfs_mount	*mp,
 	union xfs_suminfo_raw	*v)
 {
+	if (xfs_has_rtgroups(mp)) {
+		be32_add_cpu(&v->rtg, 1);
+		return be32_to_cpu(v->rtg);
+	}
+
 	v->old += 1;
 	return v->old;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/24] xfs: store rtgroup information with a bmap intent
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 21:20   ` [PATCH 18/24] xfs: encode the rtsummary in big endian format Darrick J. Wong
@ 2023-12-31 21:20   ` Darrick J. Wong
  2023-12-31 21:20   ` [PATCH 20/24] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
                     ` (4 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:20 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the bmap intent items take an active reference to the rtgroup
containing the space that is being mapped or unmapped.  We will need
this functionality once we start enabling rmap and reflink on the rt
volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.h |    5 ++++-
 fs/xfs/xfs_bmap_item.c   |   18 ++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index bd7f936262cc6..61c195198db43 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -246,7 +246,10 @@ struct xfs_bmap_intent {
 	enum xfs_bmap_intent_type		bi_type;
 	int					bi_whichfork;
 	struct xfs_inode			*bi_owner;
-	struct xfs_perag			*bi_pag;
+	union {
+		struct xfs_perag		*bi_pag;
+		struct xfs_rtgroup		*bi_rtg;
+	};
 	struct xfs_bmbt_irec			bi_bmap;
 };
 
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 02b872a133104..234296a37b269 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -26,6 +26,7 @@
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
 #include "xfs_trace.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_bui_cache;
 struct kmem_cache	*xfs_bud_cache;
@@ -326,8 +327,18 @@ xfs_bmap_update_get_group(
 {
 	xfs_agnumber_t		agno;
 
-	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
+		if (xfs_has_rtgroups(mp)) {
+			xfs_rgnumber_t	rgno;
+
+			rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
+			bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+		} else {
+			bi->bi_rtg = NULL;
+		}
+
 		return;
+	}
 
 	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
 
@@ -358,8 +369,11 @@ static inline void
 xfs_bmap_update_put_group(
 	struct xfs_bmap_intent	*bi)
 {
-	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
+		if (xfs_has_rtgroups(bi->bi_owner->i_mount))
+			xfs_rtgroup_put(bi->bi_rtg);
 		return;
+	}
 
 	xfs_perag_intent_put(bi->bi_pag);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/24] xfs: use an incore rtgroup rotor for rtpick
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 21:20   ` [PATCH 19/24] xfs: store rtgroup information with a bmap intent Darrick J. Wong
@ 2023-12-31 21:20   ` Darrick J. Wong
  2023-12-31 21:21   ` [PATCH 21/24] xfs: scrub the realtime group superblock Darrick J. Wong
                     ` (3 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:20 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

During the 6.7 merge window, Linus noticed that the realtime allocator
was doing some sketchy things trying to encode a u64 sequence counter
into the rtbitmap file's atime.  The sketchy casting of a struct pointer
to a u64 pointer has subtly broken several times over the past decade as
the codebase has transitioned to using the VFS i_atime field and that
field has changed in size and layout over time.

Since the goal of the rtpick code is to _suggest_ a starting place for
new rt file allocations, the repeated breakage has not resulted in
inconsistent metadata.  IOWs, it's a hint.

For rtgroups, we don't need this complex code to cut the rtextents space
into fractions.  Add an rtgroup rotor and use that for rtpick, similar
to AG rotoring on the data device.  The new rotor does not persist,
which reduces the logging overhead slightly.

Link: https://lore.kernel.org/linux-xfs/CAHk-=wj3oM3d-Hw2vvxys3KCZ9De+gBN7Gxr2jf96OTisL9udw@mail.gmail.com/
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtbitmap.c |   12 ++++++++----
 fs/xfs/xfs_mount.h           |    1 +
 fs/xfs/xfs_rtalloc.c         |   11 +++++++++++
 3 files changed, 20 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 3530524b3fc2b..0ef14157e8157 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1064,10 +1064,14 @@ xfs_rtfree_extent(
 		if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
 			mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
 
-		atime = inode_get_atime(VFS_I(mp->m_rbmip));
-		atime.tv_sec = 0;
-		inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
-		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		if (xfs_has_rtgroups(mp)) {
+			mp->m_rtgrotor = 0;
+		} else {
+			atime = inode_get_atime(VFS_I(mp->m_rbmip));
+			atime.tv_sec = 0;
+			inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
+			xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		}
 	}
 	error = 0;
 out:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 14094b29ab6fe..a086b96b0a513 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
 #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
 	struct xchk_stats	*m_scrub_stats;
 #endif
+	xfs_rgnumber_t		m_rtgrotor;	/* last rtgroup rtpicked */
 	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
 	atomic_t		m_agirotor;	/* last ag dir inode alloced */
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ff08c1d997fdb..6479bb04657ec 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1688,6 +1688,17 @@ xfs_rtpick_extent(
 
 	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 
+	if (xfs_has_rtgroups(mp)) {
+		xfs_rtblock_t	rtbno;
+
+		/* Pick the first usable rtx of the group. */
+		rtbno = xfs_rgbno_to_rtb(mp, mp->m_rtgrotor, 0);
+		*pick = xfs_rtb_to_rtx(mp, rtbno) + 1;
+
+		mp->m_rtgrotor = (mp->m_rtgrotor + 1) % mp->m_sb.sb_rgcount;
+		return 0;
+	}
+
 	ts = inode_get_atime(VFS_I(mp->m_rbmip));
 	if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
 		mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/24] xfs: scrub the realtime group superblock
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 21:20   ` [PATCH 20/24] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
@ 2023-12-31 21:21   ` Darrick J. Wong
  2023-12-31 21:21   ` [PATCH 22/24] xfs: repair secondary realtime group superblocks Darrick J. Wong
                     ` (2 subsequent siblings)
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:21 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable scrubbing of realtime group superblocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile        |    1 +
 fs/xfs/libxfs/xfs_fs.h |    3 +-
 fs/xfs/scrub/common.c  |   88 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/common.h  |   46 ++++++++++++++-----------
 fs/xfs/scrub/health.c  |    1 +
 fs/xfs/scrub/rgsuper.c |   77 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c   |   20 ++++++++++-
 fs/xfs/scrub/scrub.h   |   40 ++++++++++------------
 fs/xfs/scrub/stats.c   |    1 +
 fs/xfs/scrub/trace.h   |    4 ++
 10 files changed, 237 insertions(+), 44 deletions(-)
 create mode 100644 fs/xfs/scrub/rgsuper.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 500dea292a9d6..7416ab9efc4d8 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -190,6 +190,7 @@ xfs-y				+= $(addprefix scrub/, \
 xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
 
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
+				   rgsuper.o \
 				   rtbitmap.o \
 				   rtsummary.o \
 				   )
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index c5bf53c6a43ca..237d13a500daf 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -735,9 +735,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_HEALTHY	27	/* everything checked out ok */
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
+#define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	30
+#define XFS_SCRUB_TYPE_NR	31
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index e9294a933c3ab..3dee7d717073e 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -34,6 +34,7 @@
 #include "xfs_quota.h"
 #include "xfs_swapext.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -121,6 +122,17 @@ xchk_process_error(
 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 }
 
+bool
+xchk_process_rt_error(
+	struct xfs_scrub	*sc,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno,
+	int			*error)
+{
+	return __xchk_process_error(sc, rgno, rgbno, error,
+			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
 bool
 xchk_xref_process_error(
 	struct xfs_scrub	*sc,
@@ -132,6 +144,17 @@ xchk_xref_process_error(
 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
 }
 
+bool
+xchk_xref_process_rt_error(
+	struct xfs_scrub	*sc,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno,
+	int			*error)
+{
+	return __xchk_process_error(sc, rgno, rgbno, error,
+			XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
 /* Check for operational errors for a file offset. */
 static bool
 __xchk_fblock_process_error(
@@ -699,6 +722,7 @@ xchk_rt_init(
 					 XCHK_RTLOCK_BITMAP_SHARED)) < 2);
 	ASSERT(hweight32(rtlock_flags & (XCHK_RTLOCK_SUMMARY |
 					 XCHK_RTLOCK_SUMMARY_SHARED)) < 2);
+	ASSERT(sr->rtg == NULL);
 
 	if (rtlock_flags & XCHK_RTLOCK_BITMAP)
 		xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_EXCL);
@@ -722,6 +746,8 @@ xchk_rt_unlock(
 	struct xfs_scrub	*sc,
 	struct xchk_rt		*sr)
 {
+	ASSERT(sr->rtg == NULL);
+
 	if (!sr->rtlock_flags)
 		return;
 
@@ -749,6 +775,68 @@ xchk_rt_unlock_rtbitmap(
 	sc->sr.rtlock_flags &= ~XCHK_RTLOCK_BITMAP_SHARED;
 }
 
+#ifdef CONFIG_XFS_RT
+/*
+ * For scrubbing a realtime group, grab all the in-core resources we'll need to
+ * check the metadata, which means taking the ILOCK of the realtime group's
+ * metadata inodes.  Callers must not join these inodes to the transaction with
+ * non-zero lockflags or concurrency problems will result.  The @rtglock_flags
+ * argument takes XFS_RTGLOCK_* flags.
+ */
+int
+xchk_rtgroup_init(
+	struct xfs_scrub	*sc,
+	xfs_rgnumber_t		rgno,
+	struct xchk_rt		*sr,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(sr->rtg == NULL);
+	ASSERT(sr->rtlock_flags == 0);
+
+	sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
+	if (!sr->rtg)
+		return -ENOENT;
+
+	xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
+	sr->rtlock_flags = rtglock_flags;
+	return 0;
+}
+
+/*
+ * Unlock the realtime group.  This must be done /after/ committing (or
+ * cancelling) the scrub transaction.
+ */
+void
+xchk_rtgroup_unlock(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr)
+{
+	ASSERT(sr->rtg != NULL);
+
+	if (sr->rtlock_flags) {
+		xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
+		sr->rtlock_flags = 0;
+	}
+}
+
+/*
+ * Unlock the realtime group and release its resources.  This must be done
+ * /after/ committing (or cancelling) the scrub transaction.
+ */
+void
+xchk_rtgroup_free(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr)
+{
+	ASSERT(sr->rtg != NULL);
+
+	xchk_rtgroup_unlock(sc, sr);
+
+	xfs_rtgroup_put(sr->rtg);
+	sr->rtg = NULL;
+}
+#endif /* CONFIG_XFS_RT */
+
 /* Per-scrubber setup functions */
 
 void
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 007d293a06d52..0edf67e697da3 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -12,11 +12,15 @@ void xchk_trans_cancel(struct xfs_scrub *sc);
 
 bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		xfs_agblock_t bno, int *error);
+bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+		xfs_rgblock_t rgbno, int *error);
 bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
 		xfs_fileoff_t offset, int *error);
 
 bool xchk_xref_process_error(struct xfs_scrub *sc,
 		xfs_agnumber_t agno, xfs_agblock_t bno, int *error);
+bool xchk_xref_process_rt_error(struct xfs_scrub *sc,
+		xfs_rgnumber_t rgno, xfs_rgblock_t rgbno, int *error);
 bool xchk_fblock_xref_process_error(struct xfs_scrub *sc,
 		int whichfork, xfs_fileoff_t offset, int *error);
 
@@ -53,6 +57,11 @@ int xchk_checkpoint_log(struct xfs_mount *mp);
 bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
 			   struct xfs_btree_cur **curpp);
 
+static inline int xchk_setup_nothing(struct xfs_scrub *sc)
+{
+	return -ENOENT;
+}
+
 /* Setup functions */
 int xchk_setup_agheader(struct xfs_scrub *sc);
 int xchk_setup_fs(struct xfs_scrub *sc);
@@ -72,17 +81,11 @@ int xchk_setup_metapath(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 int xchk_setup_rtbitmap(struct xfs_scrub *sc);
 int xchk_setup_rtsummary(struct xfs_scrub *sc);
+int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
 #else
-static inline int
-xchk_setup_rtbitmap(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
-static inline int
-xchk_setup_rtsummary(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
+# define xchk_setup_rtbitmap		xchk_setup_nothing
+# define xchk_setup_rtsummary		xchk_setup_nothing
+# define xchk_setup_rgsuperblock	xchk_setup_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -94,16 +97,8 @@ xchk_ino_dqattach(struct xfs_scrub *sc)
 {
 	return 0;
 }
-static inline int
-xchk_setup_quota(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
-static inline int
-xchk_setup_quotacheck(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
+# define xchk_setup_quota		xchk_setup_nothing
+# define xchk_setup_quotacheck		xchk_setup_nothing
 #endif
 int xchk_setup_fscounters(struct xfs_scrub *sc);
 int xchk_setup_nlinks(struct xfs_scrub *sc);
@@ -147,6 +142,17 @@ void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr,
 		unsigned int xchk_rtlock_flags);
 void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
 void xchk_rt_unlock_rtbitmap(struct xfs_scrub *sc);
+
+#ifdef CONFIG_XFS_RT
+int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+		struct xchk_rt *sr, unsigned int rtglock_flags);
+void xchk_rtgroup_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
+void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
+#else
+# define xchk_rtgroup_init(sc, rgno, sr, lockflags)	(-ENOSYS)
+# define xchk_rtgroup_free(sc, sr)			((void)0)
+#endif /* CONFIG_XFS_RT */
+
 int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		struct xchk_ag *sa);
 void xchk_ag_btcur_free(struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 063176c1f35eb..7fccb1a03060a 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -114,6 +114,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_NLINKS]		= { XHG_FS,  XFS_SICK_FS_NLINKS },
 	[XFS_SCRUB_TYPE_DIRTREE]	= { XHG_INO, XFS_SICK_INO_DIRTREE },
 	[XFS_SCRUB_TYPE_METAPATH]	= { XHG_FS,  XFS_SICK_FS_METAPATH },
+	[XFS_SCRUB_TYPE_RGSUPER]	= { XHG_RTGROUP, XFS_SICK_RT_SUPER },
 };
 
 /* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c
new file mode 100644
index 0000000000000..ae23609cdb900
--- /dev/null
+++ b/fs/xfs/scrub/rgsuper.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_rtgroup.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Set us up with a transaction and an empty context. */
+int
+xchk_setup_rgsuperblock(
+	struct xfs_scrub	*sc)
+{
+	return xchk_trans_alloc(sc, 0);
+}
+
+/* Cross-reference with the other rt metadata. */
+STATIC void
+xchk_rgsuperblock_xref(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	xfs_rgnumber_t		rgno = sc->sr.rtg->rtg_rgno;
+	xfs_rtblock_t		rtbno;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+	xchk_xref_is_used_rt_space(sc, rtbno, 1);
+}
+
+int
+xchk_rgsuperblock(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_buf		*bp;
+	xfs_rgnumber_t		rgno = sc->sm->sm_agno;
+	int			error;
+
+	/*
+	 * Grab an active reference to the rtgroup structure.  If we can't get
+	 * it, we're racing with something that's tearing down the group, so
+	 * signal that the group no longer exists.  Take the rtbitmap in shared
+	 * mode so that the group can't change while we're doing things.
+	 */
+	error = xchk_rtgroup_init(sc, rgno, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+	if (error)
+		return error;
+
+	/*
+	 * If this is the primary rtgroup superblock, we know it passed the
+	 * verifier checks at mount time and do not need to load the buffer
+	 * again.
+	 */
+	if (sc->sr.rtg->rtg_rgno == 0) {
+		xchk_rgsuperblock_xref(sc);
+		return 0;
+	}
+
+	/* The secondary rt super is checked by the read verifier. */
+	error = xfs_buf_read_uncached(sc->mp->m_rtdev_targp, XFS_RTSB_DADDR,
+			XFS_FSB_TO_BB(sc->mp, 1), 0, &bp, &xfs_rtsb_buf_ops);
+	if (!xchk_process_rt_error(sc, rgno, 0, &error))
+		return error;
+
+	xchk_rgsuperblock_xref(sc);
+	xfs_buf_relse(bp);
+	return 0;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 5d551433b3233..c1e226a9ac08d 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -230,7 +230,10 @@ xchk_teardown(
 			xfs_trans_cancel(sc->tp);
 		sc->tp = NULL;
 	}
-	xchk_rt_unlock(sc, &sc->sr);
+	if (sc->sr.rtg)
+		xchk_rtgroup_free(sc, &sc->sr);
+	else
+		xchk_rt_unlock(sc, &sc->sr);
 	if (sc->ip) {
 		if (sc->ilock_flags)
 			xchk_iunlock(sc, sc->ilock_flags);
@@ -455,6 +458,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.has	= xfs_has_metadir,
 		.repair	= xrep_metapath,
 	},
+	[XFS_SCRUB_TYPE_RGSUPER] = {	/* realtime group superblock */
+		.type	= ST_RTGROUP,
+		.setup	= xchk_setup_rgsuperblock,
+		.scrub	= xchk_rgsuperblock,
+		.has	= xfs_has_rtgroups,
+		.repair = xrep_notsupported,
+	},
 };
 
 static int
@@ -504,6 +514,14 @@ xchk_validate_inputs(
 		break;
 	case ST_GENERIC:
 		break;
+	case ST_RTGROUP:
+		if (sm->sm_ino || sm->sm_gen)
+			goto out;
+		if (!xfs_has_rtgroups(mp) && sm->sm_agno != 0)
+			goto out;
+		if (xfs_has_rtgroups(mp) && sm->sm_agno >= mp->m_sb.sb_rgcount)
+			goto out;
+		break;
 	default:
 		goto out;
 	}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1671b6aa48081..26d2731eddb99 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -67,6 +67,7 @@ enum xchk_type {
 	ST_FS,		/* per-FS metadata */
 	ST_INODE,	/* per-inode metadata */
 	ST_GENERIC,	/* determined by the scrubber */
+	ST_RTGROUP,	/* rtgroup metadata */
 };
 
 struct xchk_meta_ops {
@@ -113,7 +114,13 @@ struct xchk_ag {
 
 /* Inode lock state for the RT volume. */
 struct xchk_rt {
-	/* XCHK_RTLOCK_* lock state */
+	/* incore rtgroup, if applicable */
+	struct xfs_rtgroup	*rtg;
+
+	/*
+	 * XCHK_RTLOCK_* lock state if rtg == NULL, or XFS_RTGLOCK_* lock state
+	 * if rtg != NULL.
+	 */
 	unsigned int		rtlock_flags;
 };
 
@@ -241,6 +248,11 @@ xchk_should_terminate(
 	return false;
 }
 
+static inline int xchk_nothing(struct xfs_scrub *sc)
+{
+	return -ENOENT;
+}
+
 /* Metadata scrubbers */
 int xchk_tester(struct xfs_scrub *sc);
 int xchk_superblock(struct xfs_scrub *sc);
@@ -264,32 +276,18 @@ int xchk_metapath(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 int xchk_rtbitmap(struct xfs_scrub *sc);
 int xchk_rtsummary(struct xfs_scrub *sc);
+int xchk_rgsuperblock(struct xfs_scrub *sc);
 #else
-static inline int
-xchk_rtbitmap(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
-static inline int
-xchk_rtsummary(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
+# define xchk_rtbitmap		xchk_nothing
+# define xchk_rtsummary		xchk_nothing
+# define xchk_rgsuperblock	xchk_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_quota(struct xfs_scrub *sc);
 int xchk_quotacheck(struct xfs_scrub *sc);
 #else
-static inline int
-xchk_quota(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
-static inline int
-xchk_quotacheck(struct xfs_scrub *sc)
-{
-	return -ENOENT;
-}
+# define xchk_quota		xchk_nothing
+# define xchk_quotacheck	xchk_nothing
 #endif
 int xchk_fscounters(struct xfs_scrub *sc);
 int xchk_nlinks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 2e576c601b7dc..c3f8ac37e5e03 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -81,6 +81,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_NLINKS]		= "nlinks",
 	[XFS_SCRUB_TYPE_DIRTREE]	= "dirtree",
 	[XFS_SCRUB_TYPE_METAPATH]	= "metapath",
+	[XFS_SCRUB_TYPE_RGSUPER]	= "rgsuper",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 038a4f8dda5a0..0bcd93bed07d6 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -83,6 +83,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -115,7 +116,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
 	{ XFS_SCRUB_TYPE_HEALTHY,	"healthy" }, \
 	{ XFS_SCRUB_TYPE_DIRTREE,	"dirtree" }, \
 	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }, \
-	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }
+	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }, \
+	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/24] xfs: repair secondary realtime group superblocks
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 21:21   ` [PATCH 21/24] xfs: scrub the realtime group superblock Darrick J. Wong
@ 2023-12-31 21:21   ` Darrick J. Wong
  2023-12-31 21:21   ` [PATCH 23/24] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
  2023-12-31 21:21   ` [PATCH 24/24] xfs: enable realtime group feature Darrick J. Wong
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:21 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Repair secondary realtime group superblocks.  They're not critical for
anything, but some consistency would be a good idea.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile               |    1 +
 fs/xfs/libxfs/xfs_rtgroup.c   |    2 +-
 fs/xfs/libxfs/xfs_rtgroup.h   |    3 +++
 fs/xfs/scrub/repair.h         |    3 +++
 fs/xfs/scrub/rgsuper_repair.c |   48 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c          |    2 +-
 6 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 fs/xfs/scrub/rgsuper_repair.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7416ab9efc4d8..6dc9e740f8ce5 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -231,6 +231,7 @@ xfs-y				+= $(addprefix scrub/, \
 				   )
 
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
+				   rgsuper_repair.o \
 				   rtbitmap_repair.o \
 				   rtsummary_repair.o \
 				   )
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index ed4f8aa67b158..7a45a16cbbab8 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -421,7 +421,7 @@ xfs_rtgroup_log_super(
 }
 
 /* Initialize a secondary realtime superblock. */
-static int
+int
 xfs_rtgroup_init_secondary_super(
 	struct xfs_mount	*mp,
 	xfs_rgnumber_t		rgno,
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index e6d60425faa4d..0a63f14b5aa0f 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -221,6 +221,8 @@ void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
+int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_buf **bpp);
 
 /* Lock the rt bitmap inode in exclusive mode */
 #define XFS_RTGLOCK_BITMAP		(1U << 0)
@@ -241,6 +243,7 @@ int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
+# define xfs_rtgroup_init_secondary_super(mp, rgno, bpp)	(-EOPNOTSUPP)
 # define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
 # define xfs_rtgroup_unlock(rtg, gf)		((void)0)
 # define xfs_rtgroup_get_geometry(rtg, rgeo)	(-EOPNOTSUPP)
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index a0780ccdd9ab6..3d8ca12b2cc51 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -139,9 +139,11 @@ int xrep_metapath(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 int xrep_rtbitmap(struct xfs_scrub *sc);
 int xrep_rtsummary(struct xfs_scrub *sc);
+int xrep_rgsuperblock(struct xfs_scrub *sc);
 #else
 # define xrep_rtbitmap			xrep_notsupported
 # define xrep_rtsummary			xrep_notsupported
+# define xrep_rgsuperblock		xrep_notsupported
 #endif /* CONFIG_XFS_RT */
 
 #ifdef CONFIG_XFS_QUOTA
@@ -246,6 +248,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
 #define xrep_symlink			xrep_notsupported
 #define xrep_dirtree			xrep_notsupported
 #define xrep_metapath			xrep_notsupported
+#define xrep_rgsuperblock		xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/rgsuper_repair.c b/fs/xfs/scrub/rgsuper_repair.c
new file mode 100644
index 0000000000000..61961764a6cf9
--- /dev/null
+++ b/fs/xfs/scrub/rgsuper_repair.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtgroup.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/repair.h"
+
+int
+xrep_rgsuperblock(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	/*
+	 * If this is the primary rtgroup superblock, log a superblock update
+	 * to force both to disk.
+	 */
+	if (sc->sr.rtg->rtg_rgno == 0) {
+		xfs_log_sb(sc->tp);
+		return 0;
+	}
+
+	/* Otherwise just write a new secondary to disk directly. */
+	error = xfs_rtgroup_init_secondary_super(sc->mp, sc->sr.rtg->rtg_rgno,
+			&bp);
+	if (error)
+		return error;
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c1e226a9ac08d..c9acc10209ddb 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -463,7 +463,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_rgsuperblock,
 		.scrub	= xchk_rgsuperblock,
 		.has	= xfs_has_rtgroups,
-		.repair = xrep_notsupported,
+		.repair = xrep_rgsuperblock,
 	},
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/24] xfs: scrub each rtgroup's portion of the rtbitmap separately
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 21:21   ` [PATCH 22/24] xfs: repair secondary realtime group superblocks Darrick J. Wong
@ 2023-12-31 21:21   ` Darrick J. Wong
  2023-12-31 21:21   ` [PATCH 24/24] xfs: enable realtime group feature Darrick J. Wong
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:21 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new scrub type code so that userspace can scrub each rtgroup's
portion of the rtbitmap file separately.  This reduces the long tail
latency that results from scanning the entire bitmap all at once, and
prepares us for future patchsets, wherein we'll need to be able to lock
a specific rtgroup so that we can rebuild that rtgroup's part of the
rtbitmap contents from the rtgroup's rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h  |    3 +
 fs/xfs/scrub/common.h   |    6 ++
 fs/xfs/scrub/rtbitmap.c |  127 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/scrub/rtbitmap.h |    6 ++
 fs/xfs/scrub/scrub.c    |    7 +++
 fs/xfs/scrub/scrub.h    |    2 +
 fs/xfs/scrub/stats.c    |    1 
 fs/xfs/scrub/trace.h    |    4 +
 8 files changed, 150 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 237d13a500daf..102b927336057 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -736,9 +736,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
+#define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	31
+#define XFS_SCRUB_TYPE_NR	32
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 0edf67e697da3..f96dd658feab9 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -82,10 +82,12 @@ int xchk_setup_metapath(struct xfs_scrub *sc);
 int xchk_setup_rtbitmap(struct xfs_scrub *sc);
 int xchk_setup_rtsummary(struct xfs_scrub *sc);
 int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
+int xchk_setup_rgbitmap(struct xfs_scrub *sc);
 #else
 # define xchk_setup_rtbitmap		xchk_setup_nothing
 # define xchk_setup_rtsummary		xchk_setup_nothing
 # define xchk_setup_rgsuperblock	xchk_setup_nothing
+# define xchk_setup_rgbitmap		xchk_setup_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -144,6 +146,10 @@ void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
 void xchk_rt_unlock_rtbitmap(struct xfs_scrub *sc);
 
 #ifdef CONFIG_XFS_RT
+
+/* All the locks we need to check an rtgroup. */
+#define XCHK_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP_SHARED)
+
 int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
 		struct xchk_rt *sr, unsigned int rtglock_flags);
 void xchk_rtgroup_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 7f910fed7de95..aae8b0e6ff281 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -15,11 +15,66 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bit.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
 #include "scrub/rtbitmap.h"
 
+static inline void
+xchk_rtbitmap_compute_geometry(
+	struct xfs_mount	*mp,
+	struct xchk_rtbitmap	*rtb)
+{
+	if (mp->m_sb.sb_rblocks == 0)
+		return;
+
+	rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+	rtb->rextslog = xfs_compute_rextslog(&mp->m_sb, rtb->rextents);
+	rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+}
+
+/* Set us up with the realtime group metadata locked. */
+int
+xchk_setup_rgbitmap(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xchk_rgbitmap	*rgb;
+	int			error;
+
+	rgb = kzalloc(sizeof(struct xchk_rgbitmap), XCHK_GFP_FLAGS);
+	if (!rgb)
+		return -ENOMEM;
+	rgb->sc = sc;
+	sc->buf = rgb;
+
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	error = xchk_install_live_inode(sc, mp->m_rbmip);
+	if (error)
+		return error;
+
+	error = xchk_ino_dqattach(sc);
+	if (error)
+		return error;
+
+	error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr,
+			XCHK_RTGLOCK_ALL);
+	if (error)
+		return error;
+
+	/*
+	 * Now that we've locked the rtbitmap, we can't race with growfsrt
+	 * trying to expand the bitmap or change the size of the rt volume.
+	 * Hence it is safe to compute and check the geometry values.
+	 */
+	xchk_rtbitmap_compute_geometry(mp, &rgb->rtb);
+	return 0;
+}
+
 /* Set us up with the realtime metadata locked. */
 int
 xchk_setup_rtbitmap(
@@ -59,11 +114,68 @@ xchk_setup_rtbitmap(
 	 * trying to expand the bitmap or change the size of the rt volume.
 	 * Hence it is safe to compute and check the geometry values.
 	 */
-	if (mp->m_sb.sb_rblocks) {
-		rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
-		rtb->rextslog = xfs_compute_rextslog(&mp->m_sb, rtb->rextents);
-		rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+	xchk_rtbitmap_compute_geometry(mp, rtb);
+	return 0;
+}
+
+/* Per-rtgroup bitmap contents. */
+
+/* Scrub a free extent record from the realtime bitmap. */
+STATIC int
+xchk_rgbitmap_rec(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	const struct xfs_rtalloc_rec *rec,
+	void			*priv)
+{
+	struct xchk_rgbitmap	*rgb = priv;
+	struct xfs_scrub	*sc = rgb->sc;
+	xfs_rtblock_t		startblock;
+	xfs_filblks_t		blockcount;
+
+	startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
+	blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+
+	if (!xfs_verify_rtbext(mp, startblock, blockcount))
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+	return 0;
+}
+
+/* Scrub this group's realtime bitmap. */
+int
+xchk_rgbitmap(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_rtalloc_rec	keys[2];
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
+	struct xchk_rgbitmap	*rgb = sc->buf;
+	xfs_rtblock_t		rtbno;
+	xfs_rgblock_t		last_rgbno = rtg->rtg_blockcount - 1;
+	int			error;
+
+	/* Sanity check the realtime bitmap size. */
+	if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rgb->rtb.rbmblocks)) {
+		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+		return 0;
 	}
+
+	/*
+	 * Check only the portion of the rtbitmap that corresponds to this
+	 * realtime group.
+	 */
+	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, 0);
+	keys[0].ar_startext = xfs_rtb_to_rtx(mp, rtbno);
+
+	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, last_rgbno);
+	keys[1].ar_startext = xfs_rtb_to_rtx(mp, rtbno);
+	keys[0].ar_extcount = keys[1].ar_extcount = 0;
+
+	error = xfs_rtalloc_query_range(mp, sc->tp, &keys[0], &keys[1],
+			xchk_rgbitmap_rec, rgb);
+	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		return error;
+
 	return 0;
 }
 
@@ -192,6 +304,13 @@ xchk_rtbitmap(
 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 		return error;
 
+	/*
+	 * Each rtgroup checks its portion of the rt bitmap, so if we don't
+	 * have that feature, we have to check the bitmap contents now.
+	 */
+	if (xfs_has_rtgroups(mp))
+		return 0;
+
 	error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
 		return error;
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index 85304ff019e1d..f659f0e76b4fa 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -13,6 +13,12 @@ struct xchk_rtbitmap {
 	unsigned int		resblks;
 };
 
+struct xchk_rgbitmap {
+	struct xfs_scrub	*sc;
+
+	struct xchk_rtbitmap	rtb;
+};
+
 #ifdef CONFIG_XFS_ONLINE_REPAIR
 int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
 #else
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c9acc10209ddb..a6b7b57fc1df7 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -465,6 +465,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.has	= xfs_has_rtgroups,
 		.repair = xrep_rgsuperblock,
 	},
+	[XFS_SCRUB_TYPE_RGBITMAP] = {	/* realtime group bitmap */
+		.type	= ST_RTGROUP,
+		.setup	= xchk_setup_rgbitmap,
+		.scrub	= xchk_rgbitmap,
+		.has	= xfs_has_rtgroups,
+		.repair = xrep_notsupported,
+	},
 };
 
 static int
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 26d2731eddb99..76eca41a8995a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -277,10 +277,12 @@ int xchk_metapath(struct xfs_scrub *sc);
 int xchk_rtbitmap(struct xfs_scrub *sc);
 int xchk_rtsummary(struct xfs_scrub *sc);
 int xchk_rgsuperblock(struct xfs_scrub *sc);
+int xchk_rgbitmap(struct xfs_scrub *sc);
 #else
 # define xchk_rtbitmap		xchk_nothing
 # define xchk_rtsummary		xchk_nothing
 # define xchk_rgsuperblock	xchk_nothing
+# define xchk_rgbitmap		xchk_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_quota(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index c3f8ac37e5e03..4bdff9a19dd6c 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -82,6 +82,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_DIRTREE]	= "dirtree",
 	[XFS_SCRUB_TYPE_METAPATH]	= "metapath",
 	[XFS_SCRUB_TYPE_RGSUPER]	= "rgsuper",
+	[XFS_SCRUB_TYPE_RGBITMAP]	= "rgbitmap",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 0bcd93bed07d6..dd809042a6041 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -84,6 +84,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGBITMAP);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -117,7 +118,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
 	{ XFS_SCRUB_TYPE_DIRTREE,	"dirtree" }, \
 	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }, \
 	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }, \
-	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }
+	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }, \
+	{ XFS_SCRUB_TYPE_RGBITMAP,	"rgbitmap" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/24] xfs: enable realtime group feature
  2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 21:21   ` [PATCH 23/24] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
@ 2023-12-31 21:21   ` Darrick J. Wong
  23 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:21 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h |    1 +
 1 file changed, 1 insertion(+)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 59ba13db53e7a..87476c6bb6c64 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -418,6 +418,7 @@ xfs_sb_has_ro_compat_feature(
 		 XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
 		 XFS_SB_FEAT_INCOMPAT_NREXT64| \
 		 XFS_SB_FEAT_INCOMPAT_PARENT | \
+		 XFS_SB_FEAT_INCOMPAT_RTGROUPS | \
 		 XFS_SB_FEAT_INCOMPAT_METADIR)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] xfs: enable FITRIM on the realtime device
  2023-12-31 19:35 ` [PATCHSET v2.0 05/15] xfs: enable FITRIM for the realtime section Darrick J. Wong
@ 2023-12-31 21:22   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:22 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement FITRIM for the realtime device by pretending that it's
"space" immediately after the data device.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_discard.c |  225 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trace.h   |   29 ++++++
 2 files changed, 248 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d235f60c166c6..6febfd6caa500 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -19,6 +19,7 @@
 #include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * Notes on an efficient, low latency fstrim algorithm
@@ -514,6 +515,197 @@ xfs_trim_extents(
 
 }
 
+#ifdef CONFIG_XFS_RT
+struct xfs_trim_rtdev {
+	/* list of rt extents to free */
+	struct list_head	extent_list;
+
+	/* pointer to count of blocks trimmed */
+	uint64_t		*blocks_trimmed;
+
+	/* minimum length that caller allows us to trim */
+	xfs_rtblock_t		minlen_fsb;
+
+	/* restart point for the rtbitmap walk */
+	xfs_rtxnum_t		restart_rtx;
+
+	/* stopping point for the current rtbitmap walk */
+	xfs_rtxnum_t		stop_rtx;
+};
+
+struct xfs_rtx_busy {
+	struct list_head	list;
+	xfs_rtblock_t		bno;
+	xfs_rtblock_t		length;
+};
+
+static void
+xfs_discard_free_rtdev_extents(
+	struct xfs_trim_rtdev	*tr)
+{
+	struct xfs_rtx_busy	*busyp, *n;
+
+	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
+		list_del_init(&busyp->list);
+		kfree(busyp);
+	}
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+static int
+xfs_discard_rtdev_extents(
+	struct xfs_mount	*mp,
+	struct xfs_trim_rtdev	*tr)
+{
+	struct block_device	*bdev = xfs_buftarg_bdev(mp->m_rtdev_targp);
+	struct xfs_rtx_busy	*busyp;
+	struct bio		*bio = NULL;
+	struct blk_plug		plug;
+	int			error = 0;
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, &tr->extent_list, list) {
+		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
+
+		error = __blkdev_issue_discard(bdev,
+				XFS_FSB_TO_BB(mp, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for rtextent [0x%llx,%llu], error %d",
+				 (unsigned long long)busyp->bno,
+				 (unsigned long long)busyp->length,
+				 error);
+			break;
+		}
+	}
+	xfs_discard_free_rtdev_extents(tr);
+
+	if (bio)
+		submit_bio(bio);
+	blk_finish_plug(&plug);
+
+	return error;
+}
+
+static int
+xfs_trim_gather_rtextent(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	const struct xfs_rtalloc_rec	*rec,
+	void				*priv)
+{
+	struct xfs_trim_rtdev		*tr = priv;
+	struct xfs_rtx_busy		*busyp;
+	xfs_rtblock_t			rbno, rlen;
+
+	if (rec->ar_startext > tr->stop_rtx) {
+		/*
+		 * If we've scanned a large number of rtbitmap blocks, update
+		 * the cursor to point at this extent so we restart the next
+		 * batch from this extent.
+		 */
+		tr->restart_rtx = rec->ar_startext;
+		return -ECANCELED;
+	}
+
+	rbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+	rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+
+	/* Ignore too small. */
+	if (rlen < tr->minlen_fsb) {
+		trace_xfs_discard_rttoosmall(mp, rbno, rlen);
+		return 0;
+	}
+
+	busyp = kmem_zalloc(sizeof(struct xfs_rtx_busy), 0);
+	busyp->bno = rbno;
+	busyp->length = rlen;
+	INIT_LIST_HEAD(&busyp->list);
+	list_add_tail(&busyp->list, &tr->extent_list);
+	*tr->blocks_trimmed += rlen;
+
+	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
+	return 0;
+}
+
+static int
+xfs_trim_rtdev_extents(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
+	uint64_t		*blocks_trimmed)
+{
+	struct xfs_rtalloc_rec	low = { };
+	struct xfs_rtalloc_rec	high = { };
+	struct xfs_trim_rtdev	tr = {
+		.blocks_trimmed	= blocks_trimmed,
+		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
+	};
+	xfs_daddr_t		rtdev_daddr;
+	int			error;
+
+	INIT_LIST_HEAD(&tr.extent_list);
+
+	/* Shift the start and end downwards to match the rt device. */
+	rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+	if (start > rtdev_daddr)
+		start -= rtdev_daddr;
+	else
+		start = 0;
+
+	if (end <= rtdev_daddr)
+		return 0;
+	end -= rtdev_daddr;
+
+	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1)
+		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1;
+
+	/* Convert the rt blocks to rt extents */
+	low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
+	high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
+
+	/*
+	 * Walk the free ranges between low and high.  The query_range function
+	 * trims the extents returned.
+	 */
+	do {
+		tr.stop_rtx = low.ar_startext +
+					xfs_rtbitmap_rtx_per_rbmblock(mp);
+		xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
+		error = xfs_rtalloc_query_range(mp, NULL, &low, &high,
+				xfs_trim_gather_rtextent, &tr);
+		xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+
+		if (error == -ECANCELED)
+			error = 0;
+		if (error) {
+			xfs_discard_free_rtdev_extents(&tr);
+			break;
+		}
+
+		if (list_empty(&tr.extent_list))
+			break;
+
+		error = xfs_discard_rtdev_extents(mp, &tr);
+		if (error)
+			break;
+
+		low.ar_startext = tr.restart_rtx;
+	} while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext);
+
+	return error;
+}
+#else
+# define xfs_trim_rtdev_extents(m,s,e,n,b)	(-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
 /*
  * trim a range of the filesystem.
  *
@@ -522,6 +714,9 @@ xfs_trim_extents(
  * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
  * is a linear address range. Hence we need to use DADDR based conversions and
  * comparisons for determining the correct offset and regions to trim.
+ *
+ * The realtime device is mapped into the FITRIM "address space" immediately
+ * after the data device.
  */
 int
 xfs_ioc_trim(
@@ -530,9 +725,12 @@ xfs_ioc_trim(
 {
 	struct xfs_perag	*pag;
 	struct block_device	*bdev = xfs_buftarg_bdev(mp->m_ddev_targp);
+	struct block_device	*rt_bdev = NULL;
 	unsigned int		granularity = bdev_discard_granularity(bdev);
 	struct fstrim_range	range;
+	xfs_rfsblock_t		max_blocks;
 	xfs_daddr_t		start, end, minlen;
+	xfs_daddr_t		ddev_end;
 	xfs_agnumber_t		agno;
 	uint64_t		blocks_trimmed = 0;
 	int			error, last_error = 0;
@@ -542,6 +740,14 @@ xfs_ioc_trim(
 	if (!bdev_max_discard_sectors(bdev))
 		return -EOPNOTSUPP;
 
+	if (mp->m_rtdev_targp) {
+		rt_bdev = xfs_buftarg_bdev(mp->m_rtdev_targp);
+		if (!bdev_max_discard_sectors(rt_bdev))
+			return -EOPNOTSUPP;
+		granularity = max(granularity,
+				  bdev_discard_granularity(rt_bdev));
+	}
+
 	/*
 	 * We haven't recovered the log, so we cannot use our bnobt-guided
 	 * storage zapping commands.
@@ -561,7 +767,8 @@ xfs_ioc_trim(
 	 * used by the fstrim application.  In the end it really doesn't
 	 * matter as trimming blocks is an advisory interface.
 	 */
-	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
+	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
 	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
 	    range.len < mp->m_sb.sb_blocksize)
 		return -EINVAL;
@@ -569,12 +776,11 @@ xfs_ioc_trim(
 	start = BTOBB(range.start);
 	end = start + BTOBBT(range.len) - 1;
 
-	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
-		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;
-
 	agno = xfs_daddr_to_agno(mp, start);
-	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
-		error = xfs_trim_extents(pag, start, end, minlen,
+	ddev_end = min_t(xfs_daddr_t, end,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
+	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, ddev_end), pag) {
+		error = xfs_trim_extents(pag, start, ddev_end, minlen,
 					  &blocks_trimmed);
 		if (error)
 			last_error = error;
@@ -585,6 +791,13 @@ xfs_ioc_trim(
 		}
 	}
 
+	if (rt_bdev) {
+		error = xfs_trim_rtdev_extents(mp, start, end, minlen,
+				&blocks_trimmed);
+		if (error)
+			last_error = error;
+	}
+
 	if (last_error)
 		return last_error;
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d23566e841cba..063b1ee79de2f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2506,6 +2506,35 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
 DEFINE_DISCARD_EVENT(xfs_discard_exclude);
 DEFINE_DISCARD_EVENT(xfs_discard_busy);
 
+DECLARE_EVENT_CLASS(xfs_rtdiscard_class,
+	TP_PROTO(struct xfs_mount *mp,
+		 xfs_rtblock_t rtbno, xfs_rtblock_t len),
+	TP_ARGS(mp, rtbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rtblock_t, rtbno)
+		__field(xfs_rtblock_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_rtdev_targp->bt_dev;
+		__entry->rtbno = rtbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d rtbno 0x%llx rtbcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rtbno,
+		  __entry->len)
+)
+
+#define DEFINE_RTDISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_rtdiscard_class, name, \
+	TP_PROTO(struct xfs_mount *mp, \
+		 xfs_rtblock_t rtbno, xfs_rtblock_t len), \
+	TP_ARGS(mp, rtbno, len))
+DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent);
+DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall);
+DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax);
+
 /* btree cursor events */
 TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
@ 2023-12-31 21:22   ` Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
                     ` (12 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:22 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Replace all the shouty bmap btree and bmap disk root macros with actual
functions, and fix a type handling error in the xattr code that the
macros previously didn't care about.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_attr_leaf.c  |    8 +-
 fs/xfs/libxfs/xfs_bmap.c       |   40 ++++----
 fs/xfs/libxfs/xfs_bmap_btree.c |   18 ++--
 fs/xfs/libxfs/xfs_bmap_btree.h |  204 +++++++++++++++++++++++++++-------------
 fs/xfs/libxfs/xfs_inode_fork.c |   30 +++---
 fs/xfs/libxfs/xfs_trans_resv.c |    2 
 fs/xfs/scrub/bmap_repair.c     |    2 
 fs/xfs/scrub/inode_repair.c    |   12 +-
 fs/xfs/xfs_xchgrange.c         |    4 -
 9 files changed, 198 insertions(+), 122 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2168747aaa2dc..fee156d72fbac 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -672,7 +672,7 @@ xfs_attr_shortform_bytesfit(
 		 */
 		if (!dp->i_forkoff && dp->i_df.if_bytes >
 		    xfs_default_attroffset(dp))
-			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+			dsize = xfs_bmdr_space_calc(MINDBTPTRS);
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		/*
@@ -686,7 +686,7 @@ xfs_attr_shortform_bytesfit(
 				return 0;
 			return dp->i_forkoff;
 		}
-		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+		dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot);
 		break;
 	}
 
@@ -694,11 +694,11 @@ xfs_attr_shortform_bytesfit(
 	 * A data fork btree root must have space for at least
 	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
 	 */
-	minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+	minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS));
 	minforkoff = roundup(minforkoff, 8) >> 3;
 
 	/* attr fork btree root can have at least this many key/ptr pairs */
-	maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
 	maxforkoff = maxforkoff >> 3;	/* rounded down */
 
 	if (offset >= maxforkoff)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c10ed52433979..da89a76512776 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -79,9 +79,9 @@ xfs_bmap_compute_maxlevels(
 	maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
 				whichfork);
 	if (whichfork == XFS_DATA_FORK)
-		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+		sz = xfs_bmdr_space_calc(MINDBTPTRS);
 	else
-		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+		sz = xfs_bmdr_space_calc(MINABTPTRS);
 
 	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
@@ -102,8 +102,8 @@ xfs_bmap_compute_attr_offset(
 	struct xfs_mount	*mp)
 {
 	if (mp->m_sb.sb_inodesize == 256)
-		return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+		return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
+	return xfs_bmdr_space_calc(6 * MINABTPTRS);
 }
 
 STATIC int				/* error */
@@ -276,7 +276,7 @@ xfs_check_block(
 	prevp = NULL;
 	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
 		dmxr = mp->m_bmap_dmxr[0];
-		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+		keyp = xfs_bmbt_key_addr(mp, block, i);
 
 		if (prevp) {
 			ASSERT(be64_to_cpu(prevp->br_startoff) <
@@ -288,15 +288,15 @@ xfs_check_block(
 		 * Compare the block numbers to see if there are dups.
 		 */
 		if (root)
-			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+			pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz);
 		else
-			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+			pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr);
 
 		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
 			if (root)
-				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+				thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz);
 			else
-				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+				thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr);
 			if (*thispa == *pp) {
 				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
 					__func__, j, i,
@@ -351,7 +351,7 @@ xfs_bmap_check_leaf_extents(
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
 	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 
 	ASSERT(bno != NULLFSBLOCK);
@@ -386,7 +386,7 @@ xfs_bmap_check_leaf_extents(
 		 */
 
 		xfs_check_block(block, mp, 0, 0);
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
 		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
 			xfs_btree_mark_sick(cur);
@@ -426,14 +426,14 @@ xfs_bmap_check_leaf_extents(
 		 * conform with the first entry in this one.
 		 */
 
-		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		ep = xfs_bmbt_rec_addr(mp, block, 1);
 		if (i) {
 			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
 			       xfs_bmbt_disk_get_blockcount(&last) <=
 			       xfs_bmbt_disk_get_startoff(ep));
 		}
 		for (j = 1; j < num_recs; j++) {
-			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			nextp = xfs_bmbt_rec_addr(mp, block, j + 1);
 			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
 			       xfs_bmbt_disk_get_blockcount(ep) <=
 			       xfs_bmbt_disk_get_startoff(nextp));
@@ -568,7 +568,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
 	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
 
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
 #ifdef DEBUG
 	if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) {
@@ -696,7 +696,7 @@ xfs_bmap_extents_to_btree(
 	for_each_xfs_iext(ifp, &icur, &rec) {
 		if (isnullstartblock(rec.br_startblock))
 			continue;
-		arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+		arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt);
 		xfs_bmbt_disk_set_all(arp, &rec);
 		cnt++;
 	}
@@ -706,10 +706,10 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the root key and pointer.
 	 */
-	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp = xfs_bmbt_key_addr(mp, block, 1);
+	arp = xfs_bmbt_rec_addr(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+	pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
 						be16_to_cpu(block->bb_level)));
 	*pp = cpu_to_be64(args.fsbno);
 
@@ -878,7 +878,7 @@ xfs_bmap_add_attrfork_btree(
 
 	mp = ip->i_mount;
 
-	if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
+	if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip))
 		*flags |= XFS_ILOG_DBROOT;
 	else {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -1145,7 +1145,7 @@ xfs_iread_bmbt_block(
 	}
 
 	/* Copy records into the incore cache. */
-	frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+	frp = xfs_bmbt_rec_addr(mp, block, 1);
 	for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
 		struct xfs_bmbt_irec	new;
 		xfs_failaddr_t		fa;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fd5fb8abf4448..da63d64f185c8 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -49,10 +49,10 @@ xfs_bmdr_to_bmbt(
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
 	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	fkp = xfs_bmdr_key_addr(dblock, 1);
+	tkp = xfs_bmbt_key_addr(mp, rblock, 1);
+	fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
+	tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -152,10 +152,10 @@ xfs_bmbt_to_bmdr(
 	dblock->bb_level = rblock->bb_level;
 	dblock->bb_numrecs = rblock->bb_numrecs;
 	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	fkp = xfs_bmbt_key_addr(mp, rblock, 1);
+	tkp = xfs_bmdr_key_addr(dblock, 1);
+	fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+	tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -672,7 +672,7 @@ xfs_bmbt_maxrecs(
 	int			blocklen,
 	int			leaf)
 {
-	blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+	blocklen -= xfs_bmbt_block_len(mp);
 	return xfs_bmbt_block_maxrecs(blocklen, leaf);
 }
 
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 151b8491f60ee..62fbc4f7c2c40 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -13,70 +13,6 @@ struct xfs_inode;
 struct xfs_trans;
 struct xbtree_ifakeroot;
 
-/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_BMBT_BLOCK_LEN(mp) \
-	(xfs_has_crc(((mp))) ? \
-		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
-
-#define XFS_BMBT_REC_ADDR(mp, block, index) \
-	((xfs_bmbt_rec_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
-
-#define XFS_BMBT_KEY_ADDR(mp, block, index) \
-	((xfs_bmbt_key_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
-
-#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
-	((xfs_bmbt_ptr_t *) \
-		((char *)(block) + \
-		 XFS_BMBT_BLOCK_LEN(mp) + \
-		 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
-		 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
-
-#define XFS_BMDR_REC_ADDR(block, index) \
-	((xfs_bmdr_rec_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-	         ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
-
-#define XFS_BMDR_KEY_ADDR(block, index) \
-	((xfs_bmdr_key_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-		 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
-
-#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
-	((xfs_bmdr_ptr_t *) \
-		((char *)(block) + \
-		 sizeof(struct xfs_bmdr_block) + \
-		 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
-		 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
-
-/*
- * These are to be used when we know the size of the block and
- * we don't have a cursor.
- */
-#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
-	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-
-#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
-	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
-	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-
-#define XFS_BMAP_BROOT_SPACE(mp, bb) \
-	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
-#define XFS_BMDR_SPACE_CALC(nrecs) \
-	(int)(sizeof(xfs_bmdr_block_t) + \
-	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BMDR_SPACE(bb) \
-	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
-
 /*
  * Maximum number of bmap btree levels.
  */
@@ -120,4 +56,144 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
 int __init xfs_bmbt_init_cur_cache(void);
 void xfs_bmbt_destroy_cur_cache(void);
 
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+static inline size_t
+xfs_bmbt_block_len(struct xfs_mount *mp)
+{
+	return xfs_has_crc(mp) ?
+			XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN;
+}
+
+/* Addresses of key, pointers, and records within an incore bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmbt_rec_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmbt_key_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key *));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmbt_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/* Addresses of key, pointers, and records within an ondisk bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmdr_rec_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmdr_key_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmdr_ptr_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		i,
+	unsigned int		sz)
+{
+	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_bmap_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		nrecs)
+{
+	return xfs_bmbt_block_len(mp) + \
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_bmap_broot_space(
+	struct xfs_mount	*mp,
+	struct xfs_bmdr_block	*bb)
+{
+	return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_bmdr_space_calc(unsigned int nrecs)
+{
+	return sizeof(struct xfs_bmdr_block) +
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
+{
+	return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
+}
+
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7e571fcfe9d1a..62cd25ed4c59d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -181,7 +181,7 @@ xfs_iformat_btree(
 
 	ifp = xfs_ifork_ptr(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+	size = xfs_bmap_broot_space(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
 	level = be16_to_cpu(dfp->bb_level);
 
@@ -194,7 +194,7 @@ xfs_iformat_btree(
 	 */
 	if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
 		     nrecs == 0 ||
-		     XFS_BMDR_SPACE_CALC(nrecs) >
+		     xfs_bmdr_space_calc(nrecs) >
 					XFS_DFORK_SIZE(dip, mp, whichfork) ||
 		     ifp->if_nextents > ip->i_nblocks) ||
 		     level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
@@ -405,7 +405,7 @@ xfs_iroot_realloc(
 		 * allocate it now and get out.
 		 */
 		if (ifp->if_broot_bytes == 0) {
-			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
 			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
@@ -419,15 +419,15 @@ xfs_iroot_realloc(
 		 */
 		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
 		new_max = cur_max + rec_diff;
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     (int)new_size);
 		ifp->if_broot_bytes = (int)new_size;
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			xfs_inode_fork_size(ip, whichfork));
 		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
 		return;
@@ -443,7 +443,7 @@ xfs_iroot_realloc(
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	if (new_max > 0)
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	else
 		new_size = 0;
 	if (new_size > 0) {
@@ -452,7 +452,7 @@ xfs_iroot_realloc(
 		 * First copy over the btree block header.
 		 */
 		memcpy(new_broot, ifp->if_broot,
-			XFS_BMBT_BLOCK_LEN(ip->i_mount));
+			xfs_bmbt_block_len(ip->i_mount));
 	} else {
 		new_broot = NULL;
 	}
@@ -464,16 +464,16 @@ xfs_iroot_realloc(
 		/*
 		 * First copy the records.
 		 */
-		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+		op = (char *)xfs_bmbt_rec_addr(mp, ifp->if_broot, 1);
+		np = (char *)xfs_bmbt_rec_addr(mp, new_broot, 1);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
 
 		/*
 		 * Then copy the pointers.
 		 */
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+		np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
 						     (int)new_size);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
 	}
@@ -481,7 +481,7 @@ xfs_iroot_realloc(
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
 	if (ifp->if_broot)
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			xfs_inode_fork_size(ip, whichfork));
 	return;
 }
@@ -654,7 +654,7 @@ xfs_iflush_fork(
 		if ((iip->ili_fields & brootflag[whichfork]) &&
 		    (ifp->if_broot_bytes > 0)) {
 			ASSERT(ifp->if_broot != NULL);
-			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
 			        xfs_inode_fork_size(ip, whichfork));
 			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
 				(xfs_bmdr_block_t *)cp,
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 0840d5000bf97..83922f5e54aed 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -131,7 +131,7 @@ xfs_calc_inode_res(
 		(4 * sizeof(struct xlog_op_header) +
 		 sizeof(struct xfs_inode_log_format) +
 		 mp->m_sb.sb_inodesize +
-		 2 * XFS_BMBT_BLOCK_LEN(mp));
+		 2 * xfs_bmbt_block_len(mp));
 }
 
 /*
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index a4bb89fdd5106..f005c21770204 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -480,7 +480,7 @@ xrep_bmap_iroot_size(
 {
 	ASSERT(level > 0);
 
-	return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
 }
 
 /* Update the inode counters. */
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 2c4209054f41b..30be423b3716e 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -808,7 +808,7 @@ xrep_dinode_bad_bmbt_fork(
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
 	level = be16_to_cpu(dfp->bb_level);
 
-	if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+	if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size)
 		return true;
 	if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
 		return true;
@@ -820,12 +820,12 @@ xrep_dinode_bad_bmbt_fork(
 		xfs_fileoff_t		fileoff;
 		xfs_fsblock_t		fsbno;
 
-		fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+		fkp = xfs_bmdr_key_addr(dfp, i);
 		fileoff = be64_to_cpu(fkp->br_startoff);
 		if (!xfs_verify_fileoff(sc->mp, fileoff))
 			return true;
 
-		fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+		fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr);
 		fsbno = be64_to_cpu(*fpp);
 		if (!xfs_verify_fsbno(sc->mp, fsbno))
 			return true;
@@ -1083,7 +1083,7 @@ xrep_dinode_ensure_forkoff(
 	struct xfs_bmdr_block	*bmdr;
 	struct xfs_scrub	*sc = ri->sc;
 	xfs_extnum_t		attr_extents, data_extents;
-	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+	size_t			bmdr_minsz = xfs_bmdr_space_calc(1);
 	unsigned int		lit_sz = XFS_LITINO(sc->mp);
 	unsigned int		afork_min, dfork_min;
 
@@ -1135,7 +1135,7 @@ xrep_dinode_ensure_forkoff(
 	case XFS_DINODE_FMT_BTREE:
 		/* Must have space for btree header and key/pointers. */
 		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
-		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+		afork_min = xfs_bmap_broot_space(sc->mp, bmdr);
 		break;
 	default:
 		/* We should never see any other formats. */
@@ -1185,7 +1185,7 @@ xrep_dinode_ensure_forkoff(
 	case XFS_DINODE_FMT_BTREE:
 		/* Must have space for btree header and key/pointers. */
 		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
-		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+		dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
 		break;
 	default:
 		dfork_min = 0;
diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c
index d805678e946c1..e8d44e4de5a81 100644
--- a/fs/xfs/xfs_xchgrange.c
+++ b/fs/xfs/xfs_xchgrange.c
@@ -566,7 +566,7 @@ xfs_swap_extents_check_format(
 	 */
 	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
 		if (xfs_inode_has_attr_fork(ip) &&
-		    XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
+		    xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
 			return -EINVAL;
 		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
 			return -EINVAL;
@@ -575,7 +575,7 @@ xfs_swap_extents_check_format(
 	/* Reciprocal target->temp btree format checks */
 	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
 		if (xfs_inode_has_attr_fork(tip) &&
-		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
+		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
 			return -EINVAL;
 		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
 			return -EINVAL;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
@ 2023-12-31 21:22   ` Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
                     ` (11 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:22 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor the code that allocates and freese the incore inode fork btree
roots.  This will help us disentangle some of the weird logic when we're
creating and tearing down inode-based btrees.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c |   53 ++++++++++++++++++++++++++++------------
 fs/xfs/libxfs/xfs_inode_fork.h |    3 ++
 2 files changed, 40 insertions(+), 16 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 62cd25ed4c59d..9256499589408 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -207,8 +207,7 @@ xfs_iformat_btree(
 		return -EFSCORRUPTED;
 	}
 
-	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_NOFS);
+	xfs_iroot_alloc(ip, whichfork, size);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -358,6 +357,32 @@ xfs_iformat_attr_fork(
 	return error;
 }
 
+/* Allocate a new incore ifork btree root. */
+void
+xfs_iroot_alloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	size_t			bytes)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	ifp->if_broot = kmem_alloc(bytes, KM_NOFS);
+	ifp->if_broot_bytes = bytes;
+}
+
+/* Free all the memory and state associated with an incore ifork btree root. */
+void
+xfs_iroot_free(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	ifp->if_broot_bytes = 0;
+	kmem_free(ifp->if_broot);
+	ifp->if_broot = NULL;
+}
+
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -406,8 +431,7 @@ xfs_iroot_realloc(
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
-			ifp->if_broot_bytes = (int)new_size;
+			xfs_iroot_alloc(ip, whichfork, new_size);
 			return;
 		}
 
@@ -446,17 +470,15 @@ xfs_iroot_realloc(
 		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	else
 		new_size = 0;
-	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_NOFS);
-		/*
-		 * First copy over the btree block header.
-		 */
-		memcpy(new_broot, ifp->if_broot,
-			xfs_bmbt_block_len(ip->i_mount));
-	} else {
-		new_broot = NULL;
+	if (new_size == 0) {
+		xfs_iroot_free(ip, whichfork);
+		return;
 	}
 
+	/* First copy over the btree block header. */
+	new_broot = kmem_alloc(new_size, KM_NOFS);
+	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
+
 	/*
 	 * Only copy the records and pointers if there are any.
 	 */
@@ -480,9 +502,8 @@ xfs_iroot_realloc(
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
-	if (ifp->if_broot)
-		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-			xfs_inode_fork_size(ip, whichfork));
+	ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
+		xfs_inode_fork_size(ip, whichfork));
 	return;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index ebeb925be09d9..18ea2d27777a2 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -172,6 +172,9 @@ void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
 void		xfs_idestroy_fork(struct xfs_ifork *ifp);
 void		xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
 				int whichfork);
+void		xfs_iroot_alloc(struct xfs_inode *ip, int whichfork,
+				size_t bytes);
+void		xfs_iroot_free(struct xfs_inode *ip, int whichfork);
 void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/14] xfs: refactor creation of bmap btree roots
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
  2023-12-31 21:22   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
@ 2023-12-31 21:22   ` Darrick J. Wong
  2023-12-31 21:23   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
                     ` (10 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:22 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we've created inode fork helpers to allocate and free btree
roots, create a new bmap btree helper to create a new bmbt root, and
refactor the extents <-> btree conversion functions to use our new
helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c       |   17 ++++-------------
 fs/xfs/libxfs/xfs_bmap_btree.c |   16 ++++++++++++++++
 fs/xfs/libxfs/xfs_bmap_btree.h |    2 ++
 3 files changed, 22 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index da89a76512776..bfc93cf0bd470 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -597,7 +597,7 @@ xfs_bmap_btree_to_extents(
 	xfs_trans_binval(tp, cbp);
 	if (cur->bc_levels[0].bp == cbp)
 		cur->bc_levels[0].bp = NULL;
-	xfs_iroot_realloc(ip, -1, whichfork);
+	xfs_iroot_free(ip, whichfork);
 	ASSERT(ifp->if_broot == NULL);
 	ifp->if_format = XFS_DINODE_FMT_EXTENTS;
 	*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -637,20 +637,10 @@ xfs_bmap_extents_to_btree(
 	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
 
-	/*
-	 * Make space in the inode incore. This needs to be undone if we fail
-	 * to expand the root.
-	 */
-	xfs_iroot_realloc(ip, 1, whichfork);
-
-	/*
-	 * Fill in the root.
-	 */
-	block = ifp->if_broot;
-	xfs_btree_init_block(mp, block, &xfs_bmbt_ops, 1, 1, ip->i_ino);
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
+	xfs_bmbt_iroot_alloc(ip, whichfork);
 	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 	cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
 	/*
@@ -706,6 +696,7 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the root key and pointer.
 	 */
+	block = ifp->if_broot;
 	kp = xfs_bmbt_key_addr(mp, block, 1);
 	arp = xfs_bmbt_rec_addr(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
@@ -727,7 +718,7 @@ xfs_bmap_extents_to_btree(
 out_unreserve_dquot:
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 out_root_realloc:
-	xfs_iroot_realloc(ip, -1, whichfork);
+	xfs_iroot_free(ip, whichfork);
 	ifp->if_format = XFS_DINODE_FMT_EXTENTS;
 	ASSERT(ifp->if_broot == NULL);
 	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index da63d64f185c8..1bdb942c87e22 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -780,3 +780,19 @@ xfs_bmbt_destroy_cur_cache(void)
 	kmem_cache_destroy(xfs_bmbt_cur_cache);
 	xfs_bmbt_cur_cache = NULL;
 }
+
+/* Create an incore bmbt btree root block. */
+void
+xfs_bmbt_iroot_alloc(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+
+	xfs_iroot_alloc(ip, whichfork,
+			xfs_bmap_broot_space_calc(ip->i_mount, 1));
+
+	/* Fill in the root. */
+	xfs_btree_init_block(ip->i_mount, ifp->if_broot, &xfs_bmbt_ops, 1, 1,
+			ip->i_ino);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 62fbc4f7c2c40..3fe9c4f7f1a0b 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -196,4 +196,6 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
 	return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
 }
 
+void xfs_bmbt_iroot_alloc(struct xfs_inode *ip, int whichfork);
+
 #endif	/* __XFS_BMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:22   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
@ 2023-12-31 21:23   ` Darrick J. Wong
  2023-12-31 21:23   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
                     ` (9 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:23 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

While refactoring code, I noticed that when xfs_iroot_realloc tries to
shrink a bmbt root block, it allocates a smaller new block and then
copies "records" and pointers to the new block.  However, bmbt root
blocks cannot ever be leaves, which means that it's not technically
correct to copy records.  We /should/ be copying keys.

Note that this has never resulted in actual memory corruption because
sizeof(bmbt_rec) == (sizeof(bmbt_key) + sizeof(bmbt_ptr)).  However,
this will no longer be true when we start adding realtime rmap stuff,
so fix this now.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9256499589408..08f3d003d5383 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -480,15 +480,15 @@ xfs_iroot_realloc(
 	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
 
 	/*
-	 * Only copy the records and pointers if there are any.
+	 * Only copy the keys and pointers if there are any.
 	 */
 	if (new_max > 0) {
 		/*
-		 * First copy the records.
+		 * First copy the keys.
 		 */
-		op = (char *)xfs_bmbt_rec_addr(mp, ifp->if_broot, 1);
-		np = (char *)xfs_bmbt_rec_addr(mp, new_broot, 1);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+		op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
+		np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
 
 		/*
 		 * Then copy the pointers.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:23   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
@ 2023-12-31 21:23   ` Darrick J. Wong
  2023-12-31 21:23   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
                     ` (8 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:23 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we change the size of the memory buffer holding an inode fork
btree root block, we have to copy the contents over.  Refactor all this
into a single function that handles both, in preparation for making
xfs_iroot_realloc more generic.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c |   99 +++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 42 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 08f3d003d5383..c84def822dd18 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -383,6 +383,50 @@ xfs_iroot_free(
 	ifp->if_broot = NULL;
 }
 
+/* Move the bmap btree root from one incore buffer to another. */
+static void
+xfs_ifork_move_broot(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs) {
+		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys that come after it.
+	 */
+	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
+
+	/* Now copy the keys, which come right after the header. */
+	if (numrecs) {
+		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
+		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
+	}
+}
+
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -409,12 +453,11 @@ xfs_iroot_realloc(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	int			cur_max;
-	struct xfs_ifork	*ifp;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*new_broot;
 	int			new_max;
 	size_t			new_size;
-	char			*np;
-	char			*op;
+	size_t			old_size = ifp->if_broot_bytes;
 
 	/*
 	 * Handle the degenerate case quietly.
@@ -423,13 +466,12 @@ xfs_iroot_realloc(
 		return;
 	}
 
-	ifp = xfs_ifork_ptr(ip, whichfork);
 	if (rec_diff > 0) {
 		/*
 		 * If there wasn't any memory allocated before, just
 		 * allocate it now and get out.
 		 */
-		if (ifp->if_broot_bytes == 0) {
+		if (old_size == 0) {
 			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
 			xfs_iroot_alloc(ip, whichfork, new_size);
 			return;
@@ -438,22 +480,16 @@ xfs_iroot_realloc(
 		/*
 		 * If there is already an existing if_broot, then we need
 		 * to realloc() it and shift the pointers to their new
-		 * location.  The records don't change location because
-		 * they are kept butted up against the btree block header.
+		 * location.
 		 */
-		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+		cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 		new_max = cur_max + rec_diff;
 		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
-		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     (int)new_size);
-		ifp->if_broot_bytes = (int)new_size;
-		ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-			xfs_inode_fork_size(ip, whichfork));
-		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+		ifp->if_broot_bytes = new_size;
+		xfs_ifork_move_broot(ip, whichfork, ifp->if_broot, new_size,
+				ifp->if_broot, old_size, cur_max);
 		return;
 	}
 
@@ -462,8 +498,8 @@ xfs_iroot_realloc(
 	 * if_broot buffer.  It must already exist.  If we go to zero
 	 * records, just get rid of the root and clear the status bit.
 	 */
-	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+	ASSERT((ifp->if_broot != NULL) && (old_size > 0));
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	if (new_max > 0)
@@ -475,35 +511,14 @@ xfs_iroot_realloc(
 		return;
 	}
 
-	/* First copy over the btree block header. */
+	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
-	memcpy(new_broot, ifp->if_broot, xfs_bmbt_block_len(ip->i_mount));
+	xfs_ifork_move_broot(ip, whichfork, new_broot, new_size, ifp->if_broot,
+			old_size, new_max);
 
-	/*
-	 * Only copy the keys and pointers if there are any.
-	 */
-	if (new_max > 0) {
-		/*
-		 * First copy the keys.
-		 */
-		op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
-		np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
-
-		/*
-		 * Then copy the pointers.
-		 */
-		op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
-						     (int)new_size);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
-	}
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
-	ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
-		xfs_inode_fork_size(ip, whichfork));
 	return;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:23   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
@ 2023-12-31 21:23   ` Darrick J. Wong
  2023-12-31 21:23   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
                     ` (7 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:23 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The bmap btree cannot ever have zero records in an incore btree block.
If the number of records drops to zero, that means we're converting the
fork to extents format and are trying to remove the tree.  This logic
won't hold for the future realtime rmap btree, so move the logic into
the bmbt code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap_btree.h |    7 +++++++
 fs/xfs/libxfs/xfs_inode_fork.c |    6 ++----
 2 files changed, 9 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 3fe9c4f7f1a0b..5a3bae94debd4 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -162,6 +162,13 @@ xfs_bmap_broot_space_calc(
 	struct xfs_mount	*mp,
 	unsigned int		nrecs)
 {
+	/*
+	 * If the bmbt root block is empty, we should be converting the fork
+	 * to extents format.  Hence, the size is zero.
+	 */
+	if (nrecs == 0)
+		return 0;
+
 	return xfs_bmbt_block_len(mp) + \
 	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
 }
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index c84def822dd18..fc23087dc2a87 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -502,10 +502,8 @@ xfs_iroot_realloc(
 	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	if (new_max > 0)
-		new_size = xfs_bmap_broot_space_calc(mp, new_max);
-	else
-		new_size = 0;
+
+	new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	if (new_size == 0) {
 		xfs_iroot_free(ip, whichfork);
 		return;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:23   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
@ 2023-12-31 21:23   ` Darrick J. Wong
  2023-12-31 21:24   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
                     ` (6 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:23 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rearrange the innards of xfs_iroot_realloc so that we can reduce
duplicated code prior to genericizing the function.  No functional
changes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c |   49 ++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 27 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index fc23087dc2a87..c2de6fbf5d128 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -447,44 +447,46 @@ xfs_ifork_move_broot(
  */
 void
 xfs_iroot_realloc(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	int			rec_diff,
 	int			whichfork)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	int			cur_max;
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*new_broot;
-	int			new_max;
 	size_t			new_size;
 	size_t			old_size = ifp->if_broot_bytes;
+	int			cur_max;
+	int			new_max;
+
+	/* Handle degenerate cases. */
+	if (rec_diff == 0)
+		return;
 
 	/*
-	 * Handle the degenerate case quietly.
+	 * If there wasn't any memory allocated before, just allocate it now
+	 * and get out.
 	 */
-	if (rec_diff == 0) {
+	if (old_size == 0) {
+		ASSERT(rec_diff > 0);
+
+		new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
+		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
+	/* Compute the new and old record count and space requirements. */
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	new_size = xfs_bmap_broot_space_calc(mp, new_max);
+
 	if (rec_diff > 0) {
-		/*
-		 * If there wasn't any memory allocated before, just
-		 * allocate it now and get out.
-		 */
-		if (old_size == 0) {
-			new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
-			xfs_iroot_alloc(ip, whichfork, new_size);
-			return;
-		}
-
 		/*
 		 * If there is already an existing if_broot, then we need
 		 * to realloc() it and shift the pointers to their new
 		 * location.
 		 */
-		cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
-		new_max = cur_max + rec_diff;
-		new_size = xfs_bmap_broot_space_calc(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
@@ -496,14 +498,8 @@ xfs_iroot_realloc(
 	/*
 	 * rec_diff is less than 0.  In this case, we are shrinking the
 	 * if_broot buffer.  It must already exist.  If we go to zero
-	 * records, just get rid of the root and clear the status bit.
+	 * bytes, just get rid of the root and clear the status bit.
 	 */
-	ASSERT((ifp->if_broot != NULL) && (old_size > 0));
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
-	new_max = cur_max + rec_diff;
-	ASSERT(new_max >= 0);
-
-	new_size = xfs_bmap_broot_space_calc(mp, new_max);
 	if (new_size == 0) {
 		xfs_iroot_free(ip, whichfork);
 		return;
@@ -516,8 +512,7 @@ xfs_iroot_realloc(
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
-	ifp->if_broot_bytes = (int)new_size;
-	return;
+	ifp->if_broot_bytes = new_size;
 }
 
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/14] xfs: standardize the btree maxrecs function parameters
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:23   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
@ 2023-12-31 21:24   ` Darrick J. Wong
  2023-12-31 21:24   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
                     ` (5 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:24 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Standardize the parameters in xfs_{alloc,bm,ino,rmap,refcount}bt_maxrecs
so that we have consistent calling conventions.  This doesn't affect the
kernel that much, but enables us to clean up userspace a bit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc_btree.c    |    6 +++---
 fs/xfs/libxfs/xfs_alloc_btree.h    |    3 ++-
 fs/xfs/libxfs/xfs_bmap.c           |    2 +-
 fs/xfs/libxfs/xfs_bmap_btree.c     |    6 +++---
 fs/xfs/libxfs/xfs_bmap_btree.h     |    5 +++--
 fs/xfs/libxfs/xfs_ialloc.c         |    4 ++--
 fs/xfs/libxfs/xfs_ialloc_btree.c   |    6 +++---
 fs/xfs/libxfs/xfs_ialloc_btree.h   |    3 ++-
 fs/xfs/libxfs/xfs_inode_fork.c     |    2 +-
 fs/xfs/libxfs/xfs_refcount_btree.c |    5 +++--
 fs/xfs/libxfs/xfs_refcount_btree.h |    3 ++-
 fs/xfs/libxfs/xfs_rmap_btree.c     |    9 +++++----
 fs/xfs/libxfs/xfs_rmap_btree.h     |    3 ++-
 fs/xfs/libxfs/xfs_sb.c             |   16 ++++++++--------
 14 files changed, 40 insertions(+), 33 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index fd769e62cc35b..928680b182761 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -611,11 +611,11 @@ xfs_allocbt_block_maxrecs(
 /*
  * Calculate number of records in an alloc btree block.
  */
-int
+unsigned int
 xfs_allocbt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
 	return xfs_allocbt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45df893ef6bb0..f61f51d0bd76d 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -53,7 +53,8 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
 struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag,
 		xfs_btnum_t btnum);
-extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index bfc93cf0bd470..2bc0b76bcfbbb 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -566,7 +566,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
 	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1);
 
 	pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1bdb942c87e22..27b11d496f809 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -666,11 +666,11 @@ xfs_bmbt_commit_staged_btree(
 /*
  * Calculate number of records in a bmap btree block.
  */
-int
+unsigned int
 xfs_bmbt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= xfs_bmbt_block_len(mp);
 	return xfs_bmbt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 5a3bae94debd4..a9ddc9b42e614 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -35,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
 
 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
-extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 
 extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 				 int whichfork, xfs_ino_t new_owner,
@@ -150,7 +151,7 @@ xfs_bmap_broot_ptr_addr(
 	unsigned int		i,
 	unsigned int		sz)
 {
-	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0));
+	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false));
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 96d22f9698a7a..7278392a8f949 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2919,8 +2919,8 @@ xfs_ialloc_setup_geometry(
 
 	/* Compute inode btree geometry. */
 	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
+	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
 	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
 	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index cdb1f99970724..4a220946d99ca 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -559,11 +559,11 @@ xfs_inobt_block_maxrecs(
 /*
  * Calculate number of records in an inobt btree block.
  */
-int
+unsigned int
 xfs_inobt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
 	return xfs_inobt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 3262c3fe5ebee..ed0f619fd3361 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -50,7 +50,8 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
 		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
 struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
 		struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
-extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 
 /* ir_holemask to inode allocation bitmap conversion */
 uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index c2de6fbf5d128..ec393840dd945 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -476,7 +476,7 @@ xfs_iroot_realloc(
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, 0);
+	cur_max = xfs_bmbt_maxrecs(mp, old_size, false);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	new_size = xfs_bmap_broot_space_calc(mp, new_max);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 06a2f062b58cb..ce42ecd3e3715 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -434,9 +434,10 @@ xfs_refcountbt_block_maxrecs(
 /*
  * Calculate the number of records in a refcount btree block.
  */
-int
+unsigned int
 xfs_refcountbt_maxrecs(
-	int			blocklen,
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
 	bool			leaf)
 {
 	blocklen -= XFS_REFCOUNT_BLOCK_LEN;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index d66b37259bedb..fe3c20d67790d 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -50,7 +50,8 @@ extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
-extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
+unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 71d32f9fee14d..71887cc23e03f 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -589,7 +589,7 @@ xfs_rmapbt_mem_verify(
 	}
 
 	return xfbtree_sblock_verify(bp,
-			xfs_rmapbt_maxrecs(xfo_to_b(1), level == 0));
+			xfs_rmapbt_maxrecs(mp, xfo_to_b(1), level == 0));
 }
 
 static void
@@ -717,10 +717,11 @@ xfs_rmapbt_block_maxrecs(
 /*
  * Calculate number of records in an rmap btree block.
  */
-int
+unsigned int
 xfs_rmapbt_maxrecs(
-	int			blocklen,
-	int			leaf)
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_RMAP_BLOCK_LEN;
 	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 5d0454fd05299..415fad8dad73e 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -48,7 +48,8 @@ struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
 		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
 void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
-int xfs_rmapbt_maxrecs(int blocklen, int leaf);
+unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8298334f4ee8d..8178a8e8097ff 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1110,23 +1110,23 @@ xfs_sb_mount_common(
 	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
 	mp->m_rgblkmask = mask64_if_power2(sbp->sb_rgblocks);
 
-	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
-	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
 	mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
 
-	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
-	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
 	mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 
-	mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
-	mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
+	mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
-	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
-	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
+	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
 	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/14] xfs: generalize the btree root reallocation function
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:24   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
@ 2023-12-31 21:24   ` Darrick J. Wong
  2023-12-31 21:24   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
                     ` (4 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:24 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for storing realtime rmap btree roots in an inode fork,
make xfs_iroot_realloc take an ops structure that takes care of all the
btree-specific geometry pieces.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap_btree.c |   51 +++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.c      |   22 +++++++----
 fs/xfs/libxfs/xfs_btree.h      |    3 +
 fs/xfs/libxfs/xfs_inode_fork.c |   82 ++++++++--------------------------------
 fs/xfs/libxfs/xfs_inode_fork.h |   23 +++++++++++
 5 files changed, 107 insertions(+), 74 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 27b11d496f809..51ffe59fa4485 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -513,6 +513,56 @@ xfs_bmbt_keys_contiguous(
 				 be64_to_cpu(key2->bmbt.br_startoff));
 }
 
+/* Move the bmap btree root from one incore buffer to another. */
+static void
+xfs_bmbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs) {
+		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys that come after it.
+	 */
+	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
+
+	/* Now copy the keys, which come right after the header. */
+	if (numrecs) {
+		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
+		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_bmbt_iroot_ops = {
+	.maxrecs		= xfs_bmbt_maxrecs,
+	.size			= xfs_bmap_broot_space_calc,
+	.move			= xfs_bmbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_bmbt_ops = {
 	.rec_len		= sizeof(xfs_bmbt_rec_t),
 	.key_len		= sizeof(xfs_bmbt_key_t),
@@ -536,6 +586,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
 	.keys_contiguous	= xfs_bmbt_keys_contiguous,
+	.iroot_ops		= &xfs_bmbt_iroot_ops,
 };
 
 static struct xfs_btree_cur *
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 4c8e9dd25b739..e6df7608ce564 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3071,6 +3071,16 @@ xfs_btree_split(
 #define xfs_btree_split	__xfs_btree_split
 #endif /* __KERNEL__ */
 
+static inline void
+xfs_btree_iroot_realloc(
+	struct xfs_btree_cur		*cur,
+	int				rec_diff)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	xfs_iroot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork,
+			cur->bc_ops->iroot_ops, rec_diff);
+}
 
 /*
  * Copy the old inode root contents into a real block and make the
@@ -3155,9 +3165,7 @@ xfs_btree_new_iroot(
 
 	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
 
-	xfs_iroot_realloc(cur->bc_ino.ip,
-			  1 - xfs_btree_get_numrecs(cblock),
-			  cur->bc_ino.whichfork);
+	xfs_btree_iroot_realloc(cur, 1 - xfs_btree_get_numrecs(cblock));
 
 	xfs_btree_setbuf(cur, level, cbp);
 
@@ -3327,7 +3335,7 @@ xfs_btree_make_block_unfull(
 
 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
 			/* A root block that can be made bigger. */
-			xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+			xfs_btree_iroot_realloc(cur, 1);
 			*stat = 1;
 		} else {
 			/* A root block that needs replacing */
@@ -3735,8 +3743,7 @@ xfs_btree_kill_iroot(
 
 	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
 	if (index) {
-		xfs_iroot_realloc(cur->bc_ino.ip, index,
-				  cur->bc_ino.whichfork);
+		xfs_btree_iroot_realloc(cur, index);
 		block = ifp->if_broot;
 	}
 
@@ -3933,8 +3940,7 @@ xfs_btree_delrec(
 	 */
 	if (level == cur->bc_nlevels - 1) {
 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-			xfs_iroot_realloc(cur->bc_ino.ip, -1,
-					  cur->bc_ino.whichfork);
+			xfs_btree_iroot_realloc(cur, -1);
 
 			error = xfs_btree_kill_iroot(cur);
 			if (error)
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 339b5561e5b04..7872fc1739b84 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -205,6 +205,9 @@ struct xfs_btree_ops {
 			       const union xfs_btree_key *key1,
 			       const union xfs_btree_key *key2,
 			       const union xfs_btree_key *mask);
+
+	/* Functions for manipulating the btree root block. */
+	const struct xfs_ifork_broot_ops *iroot_ops;
 };
 
 /*
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index ec393840dd945..e342be9911774 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -383,50 +383,6 @@ xfs_iroot_free(
 	ifp->if_broot = NULL;
 }
 
-/* Move the bmap btree root from one incore buffer to another. */
-static void
-xfs_ifork_move_broot(
-	struct xfs_inode	*ip,
-	int			whichfork,
-	struct xfs_btree_block	*dst_broot,
-	size_t			dst_bytes,
-	struct xfs_btree_block	*src_broot,
-	size_t			src_bytes,
-	unsigned int		numrecs)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	void			*dptr;
-	void			*sptr;
-
-	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
-
-	/*
-	 * We always have to move the pointers because they are not butted
-	 * against the btree block header.
-	 */
-	if (numrecs) {
-		sptr = xfs_bmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
-		dptr = xfs_bmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
-		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
-	}
-
-	if (src_broot == dst_broot)
-		return;
-
-	/*
-	 * If the root is being totally relocated, we have to migrate the block
-	 * header and the keys that come after it.
-	 */
-	memcpy(dst_broot, src_broot, xfs_bmbt_block_len(mp));
-
-	/* Now copy the keys, which come right after the header. */
-	if (numrecs) {
-		sptr = xfs_bmbt_key_addr(mp, src_broot, 1);
-		dptr = xfs_bmbt_key_addr(mp, dst_broot, 1);
-		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_bmbt_key));
-	}
-}
-
 /*
  * Reallocate the space for if_broot based on the number of records
  * being added or deleted as indicated in rec_diff.  Move the records
@@ -440,24 +396,21 @@ xfs_ifork_move_broot(
  * if we are adding records, one will be allocated.  The caller must also
  * not request that the number of records go below zero, although
  * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- *	 requested for the if_broot array.
  */
 void
 xfs_iroot_realloc(
-	struct xfs_inode	*ip,
-	int			rec_diff,
-	int			whichfork)
+	struct xfs_inode		*ip,
+	int				whichfork,
+	const struct xfs_ifork_broot_ops *ops,
+	int				rec_diff)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
-	struct xfs_btree_block	*new_broot;
-	size_t			new_size;
-	size_t			old_size = ifp->if_broot_bytes;
-	int			cur_max;
-	int			new_max;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, whichfork);
+	struct xfs_btree_block		*new_broot;
+	size_t				new_size;
+	size_t				old_size = ifp->if_broot_bytes;
+	int				cur_max;
+	int				new_max;
 
 	/* Handle degenerate cases. */
 	if (rec_diff == 0)
@@ -470,16 +423,16 @@ xfs_iroot_realloc(
 	if (old_size == 0) {
 		ASSERT(rec_diff > 0);
 
-		new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
+		new_size = ops->size(mp, rec_diff);
 		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = xfs_bmbt_maxrecs(mp, old_size, false);
+	cur_max = ops->maxrecs(mp, old_size, false);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	new_size = xfs_bmap_broot_space_calc(mp, new_max);
+	new_size = ops->size(mp, new_max);
 
 	if (rec_diff > 0) {
 		/*
@@ -490,7 +443,7 @@ xfs_iroot_realloc(
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
-		xfs_ifork_move_broot(ip, whichfork, ifp->if_broot, new_size,
+		ops->move(ip, whichfork, ifp->if_broot, new_size,
 				ifp->if_broot, old_size, cur_max);
 		return;
 	}
@@ -507,15 +460,14 @@ xfs_iroot_realloc(
 
 	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
-	xfs_ifork_move_broot(ip, whichfork, new_broot, new_size, ifp->if_broot,
-			old_size, new_max);
+	ops->move(ip, whichfork, new_broot, new_size, ifp->if_broot,
+			ifp->if_broot_bytes, new_max);
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = new_size;
 }
 
-
 /*
  * This is called when the amount of space needed for if_data
  * is increased or decreased.  The change in size is indicated by
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 18ea2d27777a2..1ac9a7a8b5f5e 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -175,7 +175,6 @@ void		xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
 void		xfs_iroot_alloc(struct xfs_inode *ip, int whichfork,
 				size_t bytes);
 void		xfs_iroot_free(struct xfs_inode *ip, int whichfork);
-void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
 				  int);
@@ -274,4 +273,26 @@ static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
 	return smp_load_acquire(&ifp->if_needextents) != 0;
 }
 
+struct xfs_ifork_broot_ops {
+	/* Calculate the number of records/keys in the incore btree block. */
+	unsigned int (*maxrecs)(struct xfs_mount *mp, unsigned int blocksize,
+			bool leaf);
+
+	/* Calculate the bytes required for the incore btree root block. */
+	size_t (*size)(struct xfs_mount *mp, unsigned int nrecs);
+
+	/*
+	 * Move an incore btree root from one buffer to another.  Note that
+	 * src_broot and dst_broot could be the same or they could be totally
+	 * separate memory regions.
+	 */
+	void (*move)(struct xfs_inode *ip, int whichfork,
+			struct xfs_btree_block *dst_broot, size_t dst_bytes,
+			struct xfs_btree_block *src_broot, size_t src_bytes,
+			unsigned int numrecs);
+};
+
+void xfs_iroot_realloc(struct xfs_inode *ip, int whichfork,
+		const struct xfs_ifork_broot_ops *ops, int rec_diff);
+
 #endif	/* __XFS_INODE_FORK_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:24   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
@ 2023-12-31 21:24   ` Darrick J. Wong
  2023-12-31 21:24   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
                     ` (3 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:24 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add some logic to xfs_iroot_realloc so that we can handle leaf records
in the btree root block correctly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap_btree.c |    4 +++-
 fs/xfs/libxfs/xfs_bmap_btree.h |    5 ++++-
 fs/xfs/libxfs/xfs_inode_fork.c |   12 +++++++-----
 fs/xfs/libxfs/xfs_inode_fork.h |    5 +++--
 fs/xfs/scrub/bmap_repair.c     |    2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 51ffe59fa4485..0a5020c8e0f5e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -522,6 +522,7 @@ xfs_bmbt_broot_move(
 	size_t			dst_bytes,
 	struct xfs_btree_block	*src_broot,
 	size_t			src_bytes,
+	unsigned int		level,
 	unsigned int		numrecs)
 {
 	struct xfs_mount	*mp = ip->i_mount;
@@ -529,6 +530,7 @@ xfs_bmbt_broot_move(
 	void			*sptr;
 
 	ASSERT(xfs_bmap_bmdr_space(src_broot) <= xfs_inode_fork_size(ip, whichfork));
+	ASSERT(level > 0);
 
 	/*
 	 * We always have to move the pointers because they are not butted
@@ -841,7 +843,7 @@ xfs_bmbt_iroot_alloc(
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
 	xfs_iroot_alloc(ip, whichfork,
-			xfs_bmap_broot_space_calc(ip->i_mount, 1));
+			xfs_bmap_broot_space_calc(ip->i_mount, 1, 1));
 
 	/* Fill in the root. */
 	xfs_btree_init_block(ip->i_mount, ifp->if_broot, &xfs_bmbt_ops, 1, 1,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index a9ddc9b42e614..d20321bfe2f60 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -161,8 +161,11 @@ xfs_bmap_broot_ptr_addr(
 static inline size_t
 xfs_bmap_broot_space_calc(
 	struct xfs_mount	*mp,
+	unsigned int		level,
 	unsigned int		nrecs)
 {
+	ASSERT(level > 0);
+
 	/*
 	 * If the bmbt root block is empty, we should be converting the fork
 	 * to extents format.  Hence, the size is zero.
@@ -183,7 +186,7 @@ xfs_bmap_broot_space(
 	struct xfs_mount	*mp,
 	struct xfs_bmdr_block	*bb)
 {
-	return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+	return xfs_bmap_broot_space_calc(mp, 1, be16_to_cpu(bb->bb_numrecs));
 }
 
 /* Compute the space required for the ondisk root block. */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index e342be9911774..16543bb873a81 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -409,6 +409,7 @@ xfs_iroot_realloc(
 	struct xfs_btree_block		*new_broot;
 	size_t				new_size;
 	size_t				old_size = ifp->if_broot_bytes;
+	unsigned int			level;
 	int				cur_max;
 	int				new_max;
 
@@ -423,16 +424,17 @@ xfs_iroot_realloc(
 	if (old_size == 0) {
 		ASSERT(rec_diff > 0);
 
-		new_size = ops->size(mp, rec_diff);
+		new_size = ops->size(mp, 0, rec_diff);
 		xfs_iroot_alloc(ip, whichfork, new_size);
 		return;
 	}
 
 	/* Compute the new and old record count and space requirements. */
-	cur_max = ops->maxrecs(mp, old_size, false);
+	level = be16_to_cpu(ifp->if_broot->bb_level);
+	cur_max = ops->maxrecs(mp, old_size, level == 0);
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
-	new_size = ops->size(mp, new_max);
+	new_size = ops->size(mp, level, new_max);
 
 	if (rec_diff > 0) {
 		/*
@@ -444,7 +446,7 @@ xfs_iroot_realloc(
 					 GFP_NOFS | __GFP_NOFAIL);
 		ifp->if_broot_bytes = new_size;
 		ops->move(ip, whichfork, ifp->if_broot, new_size,
-				ifp->if_broot, old_size, cur_max);
+				ifp->if_broot, old_size, level, cur_max);
 		return;
 	}
 
@@ -461,7 +463,7 @@ xfs_iroot_realloc(
 	/* Reallocate the btree root and move the contents. */
 	new_broot = kmem_alloc(new_size, KM_NOFS);
 	ops->move(ip, whichfork, new_broot, new_size, ifp->if_broot,
-			ifp->if_broot_bytes, new_max);
+			ifp->if_broot_bytes, level, new_max);
 
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 1ac9a7a8b5f5e..9a0136f82738d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -279,7 +279,8 @@ struct xfs_ifork_broot_ops {
 			bool leaf);
 
 	/* Calculate the bytes required for the incore btree root block. */
-	size_t (*size)(struct xfs_mount *mp, unsigned int nrecs);
+	size_t (*size)(struct xfs_mount *mp, unsigned int level,
+			unsigned int nrecs);
 
 	/*
 	 * Move an incore btree root from one buffer to another.  Note that
@@ -289,7 +290,7 @@ struct xfs_ifork_broot_ops {
 	void (*move)(struct xfs_inode *ip, int whichfork,
 			struct xfs_btree_block *dst_broot, size_t dst_bytes,
 			struct xfs_btree_block *src_broot, size_t src_bytes,
-			unsigned int numrecs);
+			unsigned int level, unsigned int numrecs);
 };
 
 void xfs_iroot_realloc(struct xfs_inode *ip, int whichfork,
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index f005c21770204..8a6dc7ff2f79e 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -480,7 +480,7 @@ xrep_bmap_iroot_size(
 {
 	ASSERT(level > 0);
 
-	return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
+	return xfs_bmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
 }
 
 /* Update the inode counters. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:24   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
@ 2023-12-31 21:24   ` Darrick J. Wong
  2023-12-31 21:25   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
                     ` (2 subsequent siblings)
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:24 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for allowing records in an inode btree root, hoist the
code that copies keyptrs from an existing node root into a child block
to a separate function.  Note that the new function explicitly computes
the keys of the new child block and stores that in the root block; while
the bmap btree could rely on leaving the key alone, realtime rmap needs
to set the new high key.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c |  113 +++++++++++++++++++++++++++++----------------
 1 file changed, 74 insertions(+), 39 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index e6df7608ce564..8a721191c1900 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3082,6 +3082,77 @@ xfs_btree_iroot_realloc(
 			cur->bc_ops->iroot_ops, rec_diff);
 }
 
+/*
+ * Move the keys and pointers from a root block to a separate block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_promote_node_iroot(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	int			level,
+	struct xfs_buf		*cbp,
+	union xfs_btree_ptr	*cptr,
+	struct xfs_btree_block	*cblock)
+{
+	union xfs_btree_key	*ckp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*cpp;
+	union xfs_btree_ptr	*pp;
+	int			i;
+	int			error;
+	int			numrecs = xfs_btree_get_numrecs(block);
+
+	/*
+	 * Increase tree height, adjusting the root block level to match.
+	 * We cannot change the root btree node size until we've copied the
+	 * block contents to the new child block.
+	 */
+	be16_add_cpu(&block->bb_level, 1);
+	cur->bc_nlevels++;
+	cur->bc_levels[level + 1].ptr = 1;
+
+	/*
+	 * Adjust the root btree record count, then copy the keys from the old
+	 * root to the new child block.
+	 */
+	xfs_btree_set_numrecs(block, 1);
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, ckp, kp, numrecs);
+
+	/* Check the pointers and copy them to the new child block. */
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+	for (i = 0; i < numrecs; i++) {
+		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+		if (error)
+			return error;
+	}
+	xfs_btree_copy_ptrs(cur, cpp, pp, numrecs);
+
+	/*
+	 * Set the first keyptr to point to the new child block, then shrink
+	 * the memory buffer for the root block.
+	 */
+	error = xfs_btree_debug_check_ptr(cur, cptr, 0, level);
+	if (error)
+		return error;
+	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+	xfs_btree_get_keys(cur, cblock, kp);
+	xfs_btree_iroot_realloc(cur, 1 - numrecs);
+
+	/* Attach the new block to the cursor and log it. */
+	xfs_btree_setbuf(cur, level, cbp);
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_keys(cur, cbp, 1, numrecs);
+	xfs_btree_log_ptrs(cur, cbp, 1, numrecs);
+	return 0;
+}
+
 /*
  * Copy the old inode root contents into a real block and make the
  * broot point to it.
@@ -3095,14 +3166,10 @@ xfs_btree_new_iroot(
 	struct xfs_buf		*cbp;		/* buffer for cblock */
 	struct xfs_btree_block	*block;		/* btree block */
 	struct xfs_btree_block	*cblock;	/* child btree block */
-	union xfs_btree_key	*ckp;		/* child key pointer */
-	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
-	union xfs_btree_key	*kp;		/* pointer to btree key */
-	union xfs_btree_ptr	*pp;		/* pointer to block addr */
+	union xfs_btree_ptr	*pp;
 	union xfs_btree_ptr	nptr;		/* new block addr */
 	int			level;		/* btree level */
 	int			error;		/* error return code */
-	int			i;		/* loop counter */
 
 	XFS_BTREE_STATS_INC(cur, newroot);
 
@@ -3140,43 +3207,11 @@ xfs_btree_new_iroot(
 			cblock->bb_u.s.bb_blkno = bno;
 	}
 
-	be16_add_cpu(&block->bb_level, 1);
-	xfs_btree_set_numrecs(block, 1);
-	cur->bc_nlevels++;
-	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
-	cur->bc_levels[level + 1].ptr = 1;
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
-		if (error)
-			goto error0;
-	}
-
-	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
-	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
+	error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr,
+			cblock);
 	if (error)
 		goto error0;
 
-	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
-	xfs_btree_iroot_realloc(cur, 1 - xfs_btree_get_numrecs(cblock));
-
-	xfs_btree_setbuf(cur, level, cbp);
-
-	/*
-	 * Do all this logging at the end so that
-	 * the root is at the right level.
-	 */
-	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
-	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
 	*logflags |=
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
 	*stat = 1;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:24   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
@ 2023-12-31 21:25   ` Darrick J. Wong
  2023-12-31 21:25   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
  2023-12-31 21:25   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:25 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In preparation for allowing records in an inode btree root, hoist the
code that copies keyptrs from an existing node child into the root block
to a separate function.  Remove some unnecessary conditionals and clean
up a few function calls in the new function.  Note that this change
reorders the ->free_block call with respect to the change in bc_nlevels
to make it easier to support inode root leaf blocks in the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c |   94 +++++++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 34 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 8a721191c1900..f0e4cf23ecdf2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3707,6 +3707,63 @@ xfs_btree_insert(
 	return error;
 }
 
+/*
+ * Move the keyptrs from a child node block to the root block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_demote_node_child(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*cblock,
+	int			level,
+	int			numrecs)
+{
+	struct xfs_btree_block	*block;
+	union xfs_btree_key	*ckp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*cpp;
+	union xfs_btree_ptr	*pp;
+	int			i;
+	int			error;
+	int			diff;
+
+	/*
+	 * Adjust the root btree node size and the record count to match the
+	 * doomed child so that we can copy the keyptrs ahead of changing the
+	 * tree shape.
+	 */
+	diff = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+	xfs_btree_iroot_realloc(cur, diff);
+	block = xfs_btree_get_iroot(cur);
+
+	xfs_btree_set_numrecs(block, numrecs);
+	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+	/* Copy keys from the doomed block. */
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+	/* Copy pointers from the doomed block. */
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+	for (i = 0; i < numrecs; i++) {
+		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+		if (error)
+			return error;
+	}
+	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+	/* Decrease tree height, adjusting the root block level to match. */
+	cur->bc_levels[level - 1].bp = NULL;
+	be16_add_cpu(&block->bb_level, -1);
+	cur->bc_nlevels--;
+	return 0;
+}
+
 /*
  * Try to merge a non-leaf block back into the inode root.
  *
@@ -3719,24 +3776,16 @@ STATIC int
 xfs_btree_kill_iroot(
 	struct xfs_btree_cur	*cur)
 {
-	int			whichfork = cur->bc_ino.whichfork;
 	struct xfs_inode	*ip = cur->bc_ino.ip;
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*block;
 	struct xfs_btree_block	*cblock;
-	union xfs_btree_key	*kp;
-	union xfs_btree_key	*ckp;
-	union xfs_btree_ptr	*pp;
-	union xfs_btree_ptr	*cpp;
 	struct xfs_buf		*cbp;
 	int			level;
-	int			index;
 	int			numrecs;
 	int			error;
 #ifdef DEBUG
 	union xfs_btree_ptr	ptr;
 #endif
-	int			i;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 	ASSERT(cur->bc_nlevels > 1);
@@ -3776,39 +3825,16 @@ xfs_btree_kill_iroot(
 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 #endif
 
-	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
-	if (index) {
-		xfs_btree_iroot_realloc(cur, index);
-		block = ifp->if_broot;
-	}
-
-	be16_add_cpu(&block->bb_numrecs, index);
-	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
-	pp = xfs_btree_ptr_addr(cur, 1, block);
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-
-	for (i = 0; i < numrecs; i++) {
-		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
-		if (error)
-			return error;
-	}
-
-	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+	error = xfs_btree_demote_node_child(cur, cblock, level, numrecs);
+	if (error)
+		return error;
 
 	error = xfs_btree_free_block(cur, cbp);
 	if (error)
 		return error;
 
-	cur->bc_levels[level - 1].bp = NULL;
-	be16_add_cpu(&block->bb_level, -1);
 	xfs_trans_log_inode(cur->bc_tp, ip,
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
-	cur->bc_nlevels--;
 out0:
 	return 0;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/14] xfs: support storing records in the inode core root
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:25   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
@ 2023-12-31 21:25   ` Darrick J. Wong
  2023-12-31 21:25   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:25 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add the necessary flags and code so that we can support storing leaf
records in the inode root block of a btree.  This hasn't been necessary
before, but the realtime rmapbt will need to be able to do this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c         |  150 ++++++++++++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_btree.h         |    1 
 fs/xfs/libxfs/xfs_btree_staging.c |    4 +
 3 files changed, 141 insertions(+), 14 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f0e4cf23ecdf2..76320ab728467 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -267,6 +267,11 @@ xfs_btree_check_block(
 	int			level,	/* level of the btree block */
 	struct xfs_buf		*bp)	/* buffer containing block, if any */
 {
+	/* Don't check the inode-core root. */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1)
+		return 0;
+
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 		return xfs_btree_check_lblock(cur, block, level, bp);
 	else
@@ -1547,12 +1552,16 @@ xfs_btree_log_recs(
 	int			first,
 	int			last)
 {
+	if (!bp) {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+				xfs_ilog_fbroot(cur->bc_ino.whichfork));
+		return;
+	}
 
 	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 	xfs_trans_log_buf(cur->bc_tp, bp,
 			  xfs_btree_rec_offset(cur, first),
 			  xfs_btree_rec_offset(cur, last + 1) - 1);
-
 }
 
 /*
@@ -3082,6 +3091,64 @@ xfs_btree_iroot_realloc(
 			cur->bc_ops->iroot_ops, rec_diff);
 }
 
+/*
+ * Move the records from a root leaf block to a separate block.
+ *
+ * Trickery here: The amount of memory that we need per record for the incore
+ * root block changes when we convert a leaf block to an internal block.
+ * Therefore, we copy leaf records into the new btree block (cblock) before
+ * freeing the incore root block and changing the tree height.
+ *
+ * Once we've changed the tree height, we allocate a new incore root block
+ * (which will now be an internal root block) and populate it with a pointer to
+ * cblock and the relevant keys.
+ */
+STATIC void
+xfs_btree_promote_leaf_iroot(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	struct xfs_buf		*cbp,
+	union xfs_btree_ptr	*cptr,
+	struct xfs_btree_block	*cblock)
+{
+	union xfs_btree_rec	*rp;
+	union xfs_btree_rec	*crp;
+	union xfs_btree_key	*kp;
+	union xfs_btree_ptr	*pp;
+	size_t			size;
+	int			numrecs = xfs_btree_get_numrecs(block);
+
+	/* Copy the records from the leaf root into the new child block. */
+	rp = xfs_btree_rec_addr(cur, 1, block);
+	crp = xfs_btree_rec_addr(cur, 1, cblock);
+	xfs_btree_copy_recs(cur, crp, rp, numrecs);
+
+	/* Zap the old root and change the tree height. */
+	xfs_iroot_free(cur->bc_ino.ip, cur->bc_ino.whichfork);
+	cur->bc_nlevels++;
+	cur->bc_levels[1].ptr = 1;
+
+	/*
+	 * Allocate a new internal root block buffer and reinitialize it to
+	 * point to a single new child.
+	 */
+	size = cur->bc_ops->iroot_ops->size(cur->bc_mp, cur->bc_nlevels - 1, 1);
+	xfs_iroot_alloc(cur->bc_ino.ip, cur->bc_ino.whichfork, size);
+	block = xfs_btree_get_iroot(cur);
+	xfs_btree_init_block(cur->bc_mp, block, cur->bc_ops,
+			cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino);
+
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	kp = xfs_btree_key_addr(cur, 1, block);
+	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+	xfs_btree_get_keys(cur, cblock, kp);
+
+	/* Attach the new block to the cursor and log it. */
+	xfs_btree_setbuf(cur, 0, cbp);
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, cbp, 1, numrecs);
+}
+
 /*
  * Move the keys and pointers from a root block to a separate block.
  *
@@ -3166,7 +3233,7 @@ xfs_btree_new_iroot(
 	struct xfs_buf		*cbp;		/* buffer for cblock */
 	struct xfs_btree_block	*block;		/* btree block */
 	struct xfs_btree_block	*cblock;	/* child btree block */
-	union xfs_btree_ptr	*pp;
+	union xfs_btree_ptr	aptr;
 	union xfs_btree_ptr	nptr;		/* new block addr */
 	int			level;		/* btree level */
 	int			error;		/* error return code */
@@ -3178,10 +3245,15 @@ xfs_btree_new_iroot(
 	level = cur->bc_nlevels - 1;
 
 	block = xfs_btree_get_iroot(cur);
-	pp = xfs_btree_ptr_addr(cur, 1, block);
+	ASSERT(level > 0 || (cur->bc_flags & XFS_BTREE_IROOT_RECORDS));
+	if (level > 0)
+		aptr = *xfs_btree_ptr_addr(cur, 1, block);
+	else
+		aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp,
+				cur->bc_ino.ip->i_ino));
 
 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
+	error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat);
 	if (error)
 		goto error0;
 	if (*stat == 0)
@@ -3207,10 +3279,14 @@ xfs_btree_new_iroot(
 			cblock->bb_u.s.bb_blkno = bno;
 	}
 
-	error = xfs_btree_promote_node_iroot(cur, block, level, cbp, &nptr,
-			cblock);
-	if (error)
-		goto error0;
+	if (level > 0) {
+		error = xfs_btree_promote_node_iroot(cur, block, level, cbp,
+				&nptr, cblock);
+		if (error)
+			goto error0;
+	} else {
+		xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock);
+	}
 
 	*logflags |=
 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
@@ -3707,6 +3783,45 @@ xfs_btree_insert(
 	return error;
 }
 
+/*
+ * Move the records from a child leaf block to the root block.
+ *
+ * Trickery here: The amount of memory we need per record for the incore root
+ * block changes when we convert a leaf block to an internal block.  Therefore,
+ * we free the incore root block, change the tree height, allocate a new incore
+ * root, and copy the records from the doomed block into the new root.
+ */
+STATIC void
+xfs_btree_demote_leaf_child(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*cblock,
+	int			numrecs)
+{
+	union xfs_btree_rec	*rp;
+	union xfs_btree_rec	*crp;
+	struct xfs_btree_block	*block;
+	size_t			size;
+
+	/* Zap the old root and change the tree height. */
+	xfs_iroot_free(cur->bc_ino.ip, cur->bc_ino.whichfork);
+	cur->bc_levels[0].bp = NULL;
+	cur->bc_nlevels--;
+
+	/*
+	 * Allocate a new internal root block buffer and reinitialize it with
+	 * the leaf records in the child.
+	 */
+	size = cur->bc_ops->iroot_ops->size(cur->bc_mp, 0, numrecs);
+	xfs_iroot_alloc(cur->bc_ino.ip, cur->bc_ino.whichfork, size);
+	block = xfs_btree_get_iroot(cur);
+	xfs_btree_init_block(cur->bc_mp, block, cur->bc_ops, 0, numrecs,
+			cur->bc_ino.ip->i_ino);
+
+	rp = xfs_btree_rec_addr(cur, 1, block);
+	crp = xfs_btree_rec_addr(cur, 1, cblock);
+	xfs_btree_copy_recs(cur, rp, crp, numrecs);
+}
+
 /*
  * Move the keyptrs from a child node block to the root block.
  *
@@ -3788,14 +3903,19 @@ xfs_btree_kill_iroot(
 #endif
 
 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-	ASSERT(cur->bc_nlevels > 1);
+	ASSERT((cur->bc_flags & XFS_BTREE_IROOT_RECORDS) ||
+	       cur->bc_nlevels > 1);
 
 	/*
 	 * Don't deal with the root block needs to be a leaf case.
 	 * We're just going to turn the thing back into extents anyway.
 	 */
 	level = cur->bc_nlevels - 1;
-	if (level == 1)
+	if (level == 1 && !(cur->bc_flags & XFS_BTREE_IROOT_RECORDS))
+		goto out0;
+
+	/* If we're already a leaf, jump out. */
+	if (level == 0)
 		goto out0;
 
 	/*
@@ -3825,9 +3945,13 @@ xfs_btree_kill_iroot(
 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 #endif
 
-	error = xfs_btree_demote_node_child(cur, cblock, level, numrecs);
-	if (error)
-		return error;
+	if (level > 1) {
+		error = xfs_btree_demote_node_child(cur, cblock, level,
+				numrecs);
+		if (error)
+			return error;
+	} else
+		xfs_btree_demote_leaf_child(cur, cblock, numrecs);
 
 	error = xfs_btree_free_block(cur, cbp);
 	if (error)
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 7872fc1739b84..bb6c2feecea7d 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -337,6 +337,7 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
  * is dynamically allocated and must be freed when the cursor is deleted.
  */
 #define XFS_BTREE_STAGING		(1<<5)
+#define XFS_BTREE_IROOT_RECORDS		(1<<6)	/* iroot can store records */
 
 /* btree stored in memory; not compatible with ROOT_IN_INODE */
 #ifdef CONFIG_XFS_BTREE_IN_XFILE
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index 8f186ada630ba..85294235386b5 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -710,7 +710,9 @@ xfs_btree_bload_compute_geometry(
 			 *
 			 * Note that bmap btrees forbid records in the root.
 			 */
-			if (level != 0 && nr_this_level <= avg_per_block) {
+			if ((level != 0 ||
+			     (cur->bc_flags & XFS_BTREE_IROOT_RECORDS)) &&
+			    nr_this_level <= avg_per_block) {
 				nr_blocks++;
 				break;
 			}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block
  2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:25   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
@ 2023-12-31 21:25   ` Darrick J. Wong
  13 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:25 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In commit 2c813ad66a72, I partially fixed a bug wherein xfs_btree_insrec
would erroneously try to update the parent's key for a block that had
been split if we decided to insert the new record into the new block.
The solution was to detect this situation and update the in-core key
value that we pass up to the caller so that the caller will (eventually)
add the new block to the parent level of the tree with the correct key.

However, I missed a subtlety about the way inode-rooted btrees work.  If
the full block was a maximally sized inode root block, we'll solve that
fullness by moving the root block's records to a new block, resizing the
root block, and updating the root to point to the new block.  We don't
pass a pointer to the new block to the caller because that work has
already been done.  The new record will /always/ land in the new block,
so in this case we need to use xfs_btree_update_keys to update the keys.

This bug can theoretically manifest itself in the very rare case that we
split a bmbt root block and the new record lands in the very first slot
of the new block, though I've never managed to trigger it in practice.
However, it is very easy to reproduce by running generic/522 with the
realtime rmapbt patchset if rtinherit=1.

Fixes: 2c813ad66a72 ("xfs: support btrees with overlapping intervals for keys")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c |   29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 76320ab728467..ddbe2d55a8077 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -3656,14 +3656,31 @@ xfs_btree_insrec(
 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
 
 	/*
-	 * If we just inserted into a new tree block, we have to
-	 * recalculate nkey here because nkey is out of date.
+	 * Update btree keys to reflect the newly added record or keyptr.
+	 * There are three cases here to be aware of.  Normally, all we have to
+	 * do is walk towards the root, updating keys as necessary.
 	 *
-	 * Otherwise we're just updating an existing block (having shoved
-	 * some records into the new tree block), so use the regular key
-	 * update mechanism.
+	 * If the caller had us target a full block for the insertion, we dealt
+	 * with that by calling the _make_block_unfull function.  If the
+	 * "make unfull" function splits the block, it'll hand us back the key
+	 * and pointer of the new block.  We haven't yet added the new block to
+	 * the next level up, so if we decide to add the new record to the new
+	 * block (bp->b_bn != old_bn), we have to update the caller's pointer
+	 * so that the caller adds the new block with the correct key.
+	 *
+	 * However, there is a third possibility-- if the selected block is the
+	 * root block of an inode-rooted btree and cannot be expanded further,
+	 * the "make unfull" function moves the root block contents to a new
+	 * block and updates the root block to point to the new block.  In this
+	 * case, no block pointer is passed back because the block has already
+	 * been added to the btree.  In this case, we need to use the regular
+	 * key update function, just like the first case.  This is critical for
+	 * overlapping btrees, because the high key must be updated to reflect
+	 * the entire tree, not just the subtree accessible through the first
+	 * child of the root (which is now two levels down from the root).
 	 */
-	if (bp && xfs_buf_daddr(bp) != old_bn) {
+	if (!xfs_btree_ptr_is_null(cur, &nptr) &&
+	    bp && xfs_buf_daddr(bp) != old_bn) {
 		xfs_btree_get_keys(cur, block, lkey);
 	} else if (xfs_btree_needs_key_update(cur, optr)) {
 		error = xfs_btree_update_keys(cur, level);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature
  2023-12-31 19:36 ` [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata Darrick J. Wong
@ 2023-12-31 21:26   ` Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:26 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

It's not possible to fail at increasing fdblocks, so get rid of all the
error returns here.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ag.c      |    4 +---
 fs/xfs/libxfs/xfs_ag_resv.c |   22 +++++-----------------
 fs/xfs/libxfs/xfs_ag_resv.h |    2 +-
 fs/xfs/scrub/repair.c       |    5 +----
 fs/xfs/xfs_fsops.c          |   24 ++++++------------------
 fs/xfs/xfs_fsops.h          |    2 +-
 fs/xfs/xfs_super.c          |    6 +-----
 fs/xfs/xfs_trace.h          |    1 -
 8 files changed, 16 insertions(+), 50 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 77309a1ce12fb..136e02bb1c9eb 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -944,9 +944,7 @@ xfs_ag_shrink_space(
 	 * Disable perag reservations so it doesn't cause the allocation request
 	 * to fail. We'll reestablish reservation before we return.
 	 */
-	error = xfs_ag_resv_free(pag);
-	if (error)
-		return error;
+	xfs_ag_resv_free(pag);
 
 	/* internal log shouldn't also show up in the free space btrees */
 	error = xfs_alloc_vextent_exact_bno(&args,
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index da1057bd0e606..c9594254304b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -126,14 +126,13 @@ xfs_ag_resv_needed(
 }
 
 /* Clean out a reservation */
-static int
+static void
 __xfs_ag_resv_free(
 	struct xfs_perag		*pag,
 	enum xfs_ag_resv_type		type)
 {
 	struct xfs_ag_resv		*resv;
 	xfs_extlen_t			oldresv;
-	int				error;
 
 	trace_xfs_ag_resv_free(pag, type, 0);
 
@@ -149,30 +148,19 @@ __xfs_ag_resv_free(
 		oldresv = resv->ar_orig_reserved;
 	else
 		oldresv = resv->ar_reserved;
-	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
 	resv->ar_reserved = 0;
 	resv->ar_asked = 0;
 	resv->ar_orig_reserved = 0;
-
-	if (error)
-		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
-				error, _RET_IP_);
-	return error;
 }
 
 /* Free a per-AG reservation. */
-int
+void
 xfs_ag_resv_free(
 	struct xfs_perag		*pag)
 {
-	int				error;
-	int				err2;
-
-	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
-	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
-	if (err2 && !error)
-		error = err2;
-	return error;
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 }
 
 static int
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index b74b210008ea7..ff20ed93de772 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -6,7 +6,7 @@
 #ifndef __XFS_AG_RESV_H__
 #define	__XFS_AG_RESV_H__
 
-int xfs_ag_resv_free(struct xfs_perag *pag);
+void xfs_ag_resv_free(struct xfs_perag *pag);
 int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
 
 bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 12cbc14b24810..24d99891a634c 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -968,9 +968,7 @@ xrep_reset_perag_resv(
 	ASSERT(sc->tp);
 
 	sc->flags &= ~XREP_RESET_PERAG_RESV;
-	error = xfs_ag_resv_free(sc->sa.pag);
-	if (error)
-		goto out;
+	xfs_ag_resv_free(sc->sa.pag);
 	error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
 	if (error == -ENOSPC) {
 		xfs_err(sc->mp,
@@ -979,7 +977,6 @@ xrep_reset_perag_resv(
 		error = 0;
 	}
 
-out:
 	return error;
 }
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84b311bd185af..bd37a7ef28178 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -214,11 +214,10 @@ xfs_growfs_data_private(
 			struct xfs_perag	*pag;
 
 			pag = xfs_perag_get(mp, id.agno);
-			error = xfs_ag_resv_free(pag);
+			xfs_ag_resv_free(pag);
 			xfs_perag_put(pag);
-			if (error)
-				return error;
 		}
+
 		/*
 		 * Reserve AG metadata blocks. ENOSPC here does not mean there
 		 * was a growfs failure, just that there still isn't space for
@@ -601,24 +600,13 @@ xfs_fs_reserve_ag_blocks(
 /*
  * Free space reserved for per-AG metadata.
  */
-int
+void
 xfs_fs_unreserve_ag_blocks(
 	struct xfs_mount	*mp)
 {
-	xfs_agnumber_t		agno;
 	struct xfs_perag	*pag;
-	int			error = 0;
-	int			err2;
+	xfs_agnumber_t		agno;
 
-	for_each_perag(mp, agno, pag) {
-		err2 = xfs_ag_resv_free(pag);
-		if (err2 && !error)
-			error = err2;
-	}
-
-	if (error)
-		xfs_warn(mp,
-	"Error %d freeing per-AG metadata reserve pool.", error);
-
-	return error;
+	for_each_perag(mp, agno, pag)
+		xfs_ag_resv_free(pag);
 }
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 2cffe51a31e8b..dba17c404e7d1 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -14,6 +14,6 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
 extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
 
 extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+extern void xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
 
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f3a5e194c535b..4a3119a48ef08 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1900,11 +1900,7 @@ xfs_remount_ro(
 	xfs_inodegc_stop(mp);
 
 	/* Free the per-AG metadata reservation pool. */
-	error = xfs_fs_unreserve_ag_blocks(mp);
-	if (error) {
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		return error;
-	}
+	xfs_fs_unreserve_ag_blocks(mp);
 
 	/*
 	 * Before we sync the metadata, we need to free up the reserve block
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 063b1ee79de2f..87c3fe0f078be 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3149,7 +3149,6 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
 DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
 DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
 
-DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 
 /* refcount tracepoint classes */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device
  2023-12-31 19:36 ` [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
@ 2023-12-31 21:26   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:26 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new space reservation scheme so that btree metadata for the
realtime volume can reserve space in the data device to avoid space
underruns.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ag_resv.c  |    3 +
 fs/xfs/libxfs/xfs_errortag.h |    4 +
 fs/xfs/libxfs/xfs_imeta.c    |  191 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_imeta.h    |   11 ++
 fs/xfs/libxfs/xfs_types.h    |    7 ++
 fs/xfs/xfs_error.c           |    3 +
 fs/xfs/xfs_fsops.c           |   17 ++++
 fs/xfs/xfs_inode.h           |    3 +
 fs/xfs/xfs_mount.c           |   10 ++
 fs/xfs/xfs_mount.h           |    1 
 fs/xfs/xfs_rtalloc.c         |   23 +++++
 fs/xfs/xfs_rtalloc.h         |    5 +
 fs/xfs/xfs_trace.h           |   45 ++++++++++
 13 files changed, 322 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index c9594254304b2..f775b92b4aacd 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -113,6 +113,7 @@ xfs_ag_resv_needed(
 	case XFS_AG_RESV_RMAPBT:
 		len -= xfs_perag_resv(pag, type)->ar_reserved;
 		break;
+	case XFS_AG_RESV_IMETA:
 	case XFS_AG_RESV_NONE:
 		/* empty */
 		break;
@@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent(
 
 	switch (type) {
 	case XFS_AG_RESV_AGFL:
+	case XFS_AG_RESV_IMETA:
 		return;
 	case XFS_AG_RESV_METADATA:
 	case XFS_AG_RESV_RMAPBT:
@@ -389,6 +391,7 @@ xfs_ag_resv_free_extent(
 
 	switch (type) {
 	case XFS_AG_RESV_AGFL:
+	case XFS_AG_RESV_IMETA:
 		return;
 	case XFS_AG_RESV_METADATA:
 	case XFS_AG_RESV_RMAPBT:
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 263d62a8d70f8..f359df69d6b52 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -64,7 +64,8 @@
 #define XFS_ERRTAG_WB_DELAY_MS				42
 #define XFS_ERRTAG_WRITE_DELAY_MS			43
 #define XFS_ERRTAG_SWAPEXT_FINISH_ONE			44
-#define XFS_ERRTAG_MAX					45
+#define XFS_ERRTAG_IMETA_RESV_CRITICAL			45
+#define XFS_ERRTAG_MAX					46
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@
 #define XFS_RANDOM_WB_DELAY_MS				3000
 #define XFS_RANDOM_WRITE_DELAY_MS			3000
 #define XFS_RANDOM_SWAPEXT_FINISH_ONE			1
+#define XFS_RANDOM_IMETA_RESV_CRITICAL			4
 
 #endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 57be23f89d864..8c9eef0eebda1 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -27,6 +27,10 @@
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
 
 /*
  * Metadata File Management
@@ -1075,3 +1079,190 @@ xfs_imeta_free_path(
 	kfree(path->im_path);
 	kfree(path);
 }
+
+/*
+ * Is the amount of space that could be allocated towards a given metadata
+ * file at or beneath a certain threshold?
+ */
+static inline bool
+xfs_imeta_resv_can_cover(
+	struct xfs_inode	*ip,
+	int64_t			rhs)
+{
+	/*
+	 * The amount of space that can be allocated to this metadata file is
+	 * the remaining reservation for the particular metadata file + the
+	 * global free block count.  Take care of the first case to avoid
+	 * touching the per-cpu counter.
+	 */
+	if (ip->i_delayed_blks >= rhs)
+		return true;
+
+	/*
+	 * There aren't enough blocks left in the inode's reservation, but it
+	 * isn't critical unless there also isn't enough free space.
+	 */
+	return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+			rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks?  For now we'll define that
+ * as the number of blocks we can get our hands on being less than 10% of what
+ * we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_imeta_resv_critical(
+	struct xfs_inode	*ip)
+{
+	uint64_t		asked_low_water;
+
+	if (!ip)
+		return false;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	trace_xfs_imeta_resv_critical(ip, 0);
+
+	if (!xfs_imeta_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+		return true;
+
+	asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+	if (!xfs_imeta_resv_can_cover(ip, asked_low_water))
+		return true;
+
+	return XFS_TEST_ERROR(false, ip->i_mount,
+			XFS_ERRTAG_IMETA_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_imeta_resv_alloc_extent(
+	struct xfs_inode	*ip,
+	struct xfs_alloc_arg	*args)
+{
+	int64_t			len = args->len;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	ASSERT(args->resv == XFS_AG_RESV_IMETA);
+
+	trace_xfs_imeta_resv_alloc_extent(ip, args->len);
+
+	/*
+	 * Allocate the blocks from the metadata inode's block reservation
+	 * and update the ondisk sb counter.
+	 */
+	if (ip->i_delayed_blks > 0) {
+		int64_t		from_resv;
+
+		from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+		ip->i_delayed_blks -= from_resv;
+		xfs_mod_delalloc(ip->i_mount, -from_resv);
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+				-from_resv);
+		len -= from_resv;
+	}
+
+	/*
+	 * Any allocation in excess of the reservation requires in-core and
+	 * on-disk fdblocks updates.
+	 */
+	if (len)
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, -len);
+
+	ip->i_nblocks += args->len;
+	xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_imeta_resv_free_extent(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_filblks_t		len)
+{
+	int64_t			to_resv;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	trace_xfs_imeta_resv_free_extent(ip, len);
+
+	ip->i_nblocks -= len;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	/*
+	 * Add the freed blocks back into the inode's delalloc reservation
+	 * until it reaches the maximum size.  Update the ondisk fdblocks only.
+	 */
+	to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+	if (to_resv > 0) {
+		to_resv = min_t(int64_t, to_resv, len);
+		ip->i_delayed_blks += to_resv;
+		xfs_mod_delalloc(ip->i_mount, to_resv);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+		len -= to_resv;
+	}
+
+	/*
+	 * Everything else goes back to the filesystem, so update the in-core
+	 * and on-disk counters.
+	 */
+	if (len)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Release a metadata file's space reservation. */
+void
+xfs_imeta_resv_free_inode(
+	struct xfs_inode	*ip)
+{
+	if (!ip)
+		return;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	trace_xfs_imeta_resv_free(ip, 0);
+
+	xfs_mod_delalloc(ip->i_mount, -ip->i_delayed_blks);
+	xfs_mod_fdblocks(ip->i_mount, ip->i_delayed_blks, true);
+	ip->i_delayed_blks = 0;
+	ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation. */
+int
+xfs_imeta_resv_init_inode(
+	struct xfs_inode	*ip,
+	xfs_filblks_t		ask)
+{
+	xfs_filblks_t		hidden_space;
+	xfs_filblks_t		used;
+	int			error;
+
+	if (!ip || ip->i_meta_resv_asked > 0)
+		return 0;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+
+	/*
+	 * Space taken by all other metadata btrees are accounted on-disk as
+	 * used space.  We therefore only hide the space that is reserved but
+	 * not used by the trees.
+	 */
+	used = ip->i_nblocks;
+	if (used > ask)
+		ask = used;
+	hidden_space = ask - used;
+
+	error = xfs_mod_fdblocks(ip->i_mount, -(int64_t)hidden_space, true);
+	if (error) {
+		trace_xfs_imeta_resv_init_error(ip, error, _RET_IP_);
+		return error;
+	}
+
+	xfs_mod_delalloc(ip->i_mount, hidden_space);
+	ip->i_delayed_blks = hidden_space;
+	ip->i_meta_resv_asked = ask;
+
+	trace_xfs_imeta_resv_init(ip, ask);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index 3b5953efc013c..f6dda8e5af006 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -102,6 +102,17 @@ unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
 
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_imeta_resv_critical(struct xfs_inode *ip);
+void xfs_imeta_resv_alloc_extent(struct xfs_inode *ip,
+		struct xfs_alloc_arg *args);
+void xfs_imeta_resv_free_extent(struct xfs_inode *ip, struct xfs_trans *tp,
+		xfs_filblks_t len);
+void xfs_imeta_resv_free_inode(struct xfs_inode *ip);
+int xfs_imeta_resv_init_inode(struct xfs_inode *ip, xfs_filblks_t ask);
+
 /* Must be implemented by the libxfs client */
 int xfs_imeta_iget(struct xfs_trans *tp, xfs_ino_t ino, unsigned char ftype,
 		struct xfs_inode **ipp);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 195471c438599..ad2ce83874f9f 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -221,6 +221,13 @@ enum xfs_ag_resv_type {
 	 * altering fdblocks.  If you think you need this you're wrong.
 	 */
 	XFS_AG_RESV_IGNORE,
+
+	/*
+	 * This allocation activity is being done on behalf of a metadata file.
+	 * These files maintain their own permanent space reservations and are
+	 * required to adjust fdblocks using the xfs_imeta_resv_* helpers.
+	 */
+	XFS_AG_RESV_IMETA,
 };
 
 /* Results of scanning a btree keyspace to check occupancy. */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c3792ab41c271..35702905a0fe1 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -63,6 +63,7 @@ static unsigned int xfs_errortag_random_default[] = {
 	XFS_RANDOM_WB_DELAY_MS,
 	XFS_RANDOM_WRITE_DELAY_MS,
 	XFS_RANDOM_SWAPEXT_FINISH_ONE,
+	XFS_RANDOM_IMETA_RESV_CRITICAL,
 };
 
 struct xfs_errortag_attr {
@@ -181,6 +182,7 @@ XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node,	XFS_ERRTAG_ATTR_LEAF_TO_NODE);
 XFS_ERRORTAG_ATTR_RW(wb_delay_ms,	XFS_ERRTAG_WB_DELAY_MS);
 XFS_ERRORTAG_ATTR_RW(write_delay_ms,	XFS_ERRTAG_WRITE_DELAY_MS);
 XFS_ERRORTAG_ATTR_RW(swapext_finish_one, XFS_ERRTAG_SWAPEXT_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(imeta_resv_critical, XFS_ERRTAG_IMETA_RESV_CRITICAL);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -227,6 +229,7 @@ static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
 	XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
 	XFS_ERRORTAG_ATTR_LIST(swapext_finish_one),
+	XFS_ERRORTAG_ATTR_LIST(imeta_resv_critical),
 	NULL,
 };
 ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index bd37a7ef28178..99f9f5c8d9b6e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -22,6 +22,7 @@
 #include "xfs_ag_resv.h"
 #include "xfs_trace.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
 
 /*
  * Write new AG headers to disk. Non-transactional, but need to be
@@ -592,6 +593,19 @@ xfs_fs_reserve_ag_blocks(
 		xfs_warn(mp,
 	"Error %d reserving per-AG metadata reserve pool.", error);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return error;
+	}
+
+	if (xfs_has_realtime(mp)) {
+		err2 = xfs_rt_resv_init(mp);
+		if (err2 && err2 != -ENOSPC) {
+			xfs_warn(mp,
+		"Error %d reserving realtime metadata reserve pool.", err2);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+
+		if (err2 && !error)
+			error = err2;
 	}
 
 	return error;
@@ -607,6 +621,9 @@ xfs_fs_unreserve_ag_blocks(
 	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 
+	if (xfs_has_realtime(mp))
+		xfs_rt_resv_free(mp);
+
 	for_each_perag(mp, agno, pag)
 		xfs_ag_resv_free(pag);
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e33b270c6b508..6013a97d02c5d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -55,6 +55,9 @@ typedef struct xfs_inode {
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
 	uint64_t		i_delayed_blks;	/* count of delay alloc blks */
+	/* Space that has been set aside to root a btree in this file. */
+	uint64_t		i_meta_resv_asked;
+
 	xfs_fsize_t		i_disk_size;	/* number of bytes in file */
 	xfs_rfsblock_t		i_nblocks;	/* # of direct & btree blocks */
 	prid_t			i_projid;	/* owner's project id */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ba6c77e7e265d..c774ac73bdec5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -659,6 +659,15 @@ xfs_agbtree_compute_maxlevels(
 	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
 }
 
+/* Compute maximum possible height for realtime btree types for this fs. */
+static inline void
+xfs_rtbtree_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	/* This will be filled in later. */
+	mp->m_rtbtree_maxlevels = 0;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -731,6 +740,7 @@ xfs_mountfs(
 	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
+	xfs_rtbtree_compute_maxlevels(mp);
 
 	/*
 	 * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a086b96b0a513..7ef8d5f706883 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
 	uint			m_refc_maxlevels; /* max refcount btree level */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
+	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6479bb04657ec..49528f901d047 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1310,6 +1310,14 @@ xfs_growfs_rt(
 		goto out_free;
 
 	error = xfs_rtgroup_update_secondary_sbs(mp);
+	if (error)
+		goto out_free;
+
+	/* Reset the rt metadata btree space reservations. */
+	xfs_rt_resv_free(mp);
+	error = xfs_rt_resv_init(mp);
+	if (error == -ENOSPC)
+		error = 0;
 
 out_free:
 	/*
@@ -1544,6 +1552,21 @@ xfs_rtalloc_reinit_frextents(
 	return 0;
 }
 
+/* Free space reservations for rt metadata inodes. */
+void
+xfs_rt_resv_free(
+	struct xfs_mount	*mp)
+{
+}
+
+/* Reserve space for rt metadata inodes' space expansion. */
+int
+xfs_rt_resv_init(
+	struct xfs_mount	*mp)
+{
+	return 0;
+}
+
 static inline int
 __xfs_rt_iget(
 	struct xfs_trans	*tp,
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b982b97cf073f..c01ca192646a9 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -54,6 +54,9 @@ int					/* error */
 xfs_rtmount_inodes(
 	struct xfs_mount	*mp);	/* file system mount structure */
 
+void xfs_rt_resv_free(struct xfs_mount *mp);
+int xfs_rt_resv_init(struct xfs_mount *mp);
+
 /*
  * Pick an extent for allocation at the start of a new realtime file.
  * Use the sequence number stored in the atime field of the bitmap inode.
@@ -96,6 +99,8 @@ xfs_rtmount_init(
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
 # define xfs_rtunmount_inodes(m)
+# define xfs_rt_resv_free(mp)				((void)0)
+# define xfs_rt_resv_init(mp)				(0)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 87c3fe0f078be..db5d2b20b36fc 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -5282,6 +5282,51 @@ DEFINE_EVENT(xfs_imeta_dir_class, name, \
 	TP_ARGS(dp, name, ino))
 DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_lookup);
 
+/* metadata inode space reservations */
+
+DECLARE_EVENT_CLASS(xfs_imeta_resv_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
+	TP_ARGS(ip, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned long long, freeblks)
+		__field(unsigned long long, reserved)
+		__field(unsigned long long, asked)
+		__field(unsigned long long, used)
+		__field(unsigned long long, len)
+	),
+	TP_fast_assign(
+		struct xfs_mount *mp = ip->i_mount;
+
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
+		__entry->reserved = ip->i_delayed_blks;
+		__entry->asked = ip->i_meta_resv_asked;
+		__entry->used = ip->i_nblocks;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->freeblks,
+		  __entry->reserved,
+		  __entry->asked,
+		  __entry->used,
+		  __entry->len)
+)
+#define DEFINE_IMETA_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_resv_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
+	TP_ARGS(ip, len))
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_init);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_alloc_extent);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical);
+DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/9] xfs: clean up extent free log intent item tracepoint callsites
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
@ 2023-12-31 21:26   ` Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 2/9] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:26 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore EFI structure to the tracepoints instead of open-coding
the argument passing.  This cleans up the call sites a bit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc.c |    7 +++----
 fs/xfs/xfs_extfree_item.c |    6 ++----
 fs/xfs/xfs_trace.h        |   33 +++++++++++++++------------------
 3 files changed, 20 insertions(+), 26 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ac31b62e70177..0d8cddb1b41ce 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2573,7 +2573,7 @@ xfs_defer_agfl_block(
 	xefi->xefi_owner = oinfo->oi_owner;
 	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
 
-	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+	trace_xfs_agfl_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
@@ -2635,9 +2635,8 @@ xfs_defer_extent_free(
 	} else {
 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
 	}
-	trace_xfs_bmap_free_defer(mp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+
+	trace_xfs_extent_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
 	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 1d1185fca6a58..190d6a69e40b0 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -464,8 +464,7 @@ xfs_extent_free_finish_item(
 	if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
 		oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
 
-	trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
-			agbno, xefi->xefi_blockcount);
+	trace_xfs_extent_free_deferred(mp, xefi);
 
 	/*
 	 * If we need a new transaction to make progress, the caller will log a
@@ -542,8 +541,7 @@ xfs_agfl_free_finish_item(
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
 	oinfo.oi_owner = xefi->xefi_owner;
 
-	trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno,
-			xefi->xefi_blockcount);
+	trace_xfs_agfl_free_deferred(mp, xefi);
 
 	error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
 	if (!error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index db5d2b20b36fc..aafc0735e20ac 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -88,6 +88,7 @@ struct xfs_parent_name_irec;
 struct xfs_attrlist_cursor_kern;
 struct xfs_imeta_update;
 struct xfs_rtgroup;
+struct xfs_extent_free_item;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -2762,41 +2763,37 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
 
 DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
-	TP_ARGS(mp, agno, type, agbno, len),
+	TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free),
+	TP_ARGS(mp, free),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, type)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
+		__field(unsigned int, flags)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
-		__entry->type = type;
-		__entry->agbno = agbno;
-		__entry->len = len;
+		__entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+		__entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+		__entry->len = free->xefi_blockcount;
+		__entry->flags = free->xefi_flags;
 	),
-	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
 		  __entry->agno,
 		  __entry->agbno,
-		  __entry->len)
+		  __entry->len,
+		  __entry->flags)
 );
 #define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
 DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 int type, \
-		 xfs_agblock_t bno, \
-		 xfs_extlen_t len), \
-	TP_ARGS(mp, agno, type, bno, len))
-DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+	TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \
+	TP_ARGS(mp, free))
 DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
 DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred);
 
 DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
 	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/9] xfs: convert "skip_discard" to a proper flags bitset
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 1/9] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-31 21:26   ` Darrick J. Wong
  2023-12-31 21:27   ` [PATCH 3/9] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:26 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the boolean to skip discard on free into a proper flags field so
that we can add more flags in the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ag.c             |    2 +-
 fs/xfs/libxfs/xfs_alloc.c          |   13 +++++++------
 fs/xfs/libxfs/xfs_alloc.h          |    9 +++++++--
 fs/xfs/libxfs/xfs_bmap.c           |   12 ++++++++----
 fs/xfs/libxfs/xfs_bmap_btree.c     |    2 +-
 fs/xfs/libxfs/xfs_ialloc.c         |    5 ++---
 fs/xfs/libxfs/xfs_ialloc_btree.c   |    2 +-
 fs/xfs/libxfs/xfs_refcount.c       |    6 +++---
 fs/xfs/libxfs/xfs_refcount_btree.c |    2 +-
 fs/xfs/scrub/newbt.c               |    5 +++--
 fs/xfs/scrub/reap.c                |    7 ++++---
 fs/xfs/xfs_reflink.c               |    2 +-
 12 files changed, 39 insertions(+), 28 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 136e02bb1c9eb..35deb474a7cf0 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -980,7 +980,7 @@ xfs_ag_shrink_space(
 			goto resv_err;
 
 		err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
-				XFS_AG_RESV_NONE, true);
+				XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD);
 		if (err2)
 			goto resv_err;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 0d8cddb1b41ce..1ea88d4675f5f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2591,7 +2591,7 @@ xfs_defer_extent_free(
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
-	bool				skip_discard,
+	unsigned int			free_flags,
 	struct xfs_defer_pending	**dfpp)
 {
 	struct xfs_extent_free_item	*xefi;
@@ -2611,6 +2611,7 @@ xfs_defer_extent_free(
 	ASSERT(len < mp->m_sb.sb_agblocks);
 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
 #endif
+	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
 	ASSERT(xfs_extfree_item_cache != NULL);
 	ASSERT(type != XFS_AG_RESV_AGFL);
 
@@ -2622,7 +2623,7 @@ xfs_defer_extent_free(
 	xefi->xefi_startblock = bno;
 	xefi->xefi_blockcount = (xfs_extlen_t)len;
 	xefi->xefi_agresv = type;
-	if (skip_discard)
+	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
 	if (oinfo) {
 		ASSERT(oinfo->oi_offset == 0);
@@ -2650,11 +2651,11 @@ xfs_free_extent_later(
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
-	bool				skip_discard)
+	unsigned int			free_flags)
 {
 	struct xfs_defer_pending	*dontcare = NULL;
 
-	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+	return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
 			&dontcare);
 }
 
@@ -2679,13 +2680,13 @@ xfs_free_extent_later(
 int
 xfs_alloc_schedule_autoreap(
 	const struct xfs_alloc_arg	*args,
-	bool				skip_discard,
+	unsigned int			free_flags,
 	struct xfs_alloc_autoreap	*aarp)
 {
 	int				error;
 
 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
-			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
+			&args->oinfo, args->resv, free_flags, &aarp->dfp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0b956f8b9d5a7..2da543fb90ecd 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -233,7 +233,12 @@ xfs_buf_to_agfl_bno(
 
 int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 		xfs_filblks_t len, const struct xfs_owner_info *oinfo,
-		enum xfs_ag_resv_type type, bool skip_discard);
+		enum xfs_ag_resv_type type, unsigned int free_flags);
+
+/* Don't issue a discard for the blocks freed. */
+#define XFS_FREE_EXTENT_SKIP_DISCARD	(1U << 0)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD)
 
 /*
  * List of extents to be free "later".
@@ -262,7 +267,7 @@ struct xfs_alloc_autoreap {
 };
 
 int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
-		bool skip_discard, struct xfs_alloc_autoreap *aarp);
+		unsigned int free_flags, struct xfs_alloc_autoreap *aarp);
 void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
 		struct xfs_alloc_autoreap *aarp);
 void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2bc0b76bcfbbb..ce9c247525a3d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -588,7 +588,7 @@ xfs_bmap_btree_to_extents(
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
 	error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
-			XFS_AG_RESV_NONE, false);
+			XFS_AG_RESV_NONE, 0);
 	if (error)
 		return error;
 
@@ -5266,11 +5266,15 @@ xfs_bmap_del_extent_real(
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
+			unsigned int	efi_flags = 0;
+
+			if ((bflags & XFS_BMAPI_NODISCARD) ||
+			    del->br_state == XFS_EXT_UNWRITTEN)
+				efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+
 			error = xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,
-					XFS_AG_RESV_NONE,
-					((bflags & XFS_BMAPI_NODISCARD) ||
-					del->br_state == XFS_EXT_UNWRITTEN));
+					XFS_AG_RESV_NONE, efi_flags);
 			if (error)
 				return error;
 		}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 0a5020c8e0f5e..a6de8b7528aa1 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -271,7 +271,7 @@ xfs_bmbt_free_block(
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
 	error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
-			XFS_AG_RESV_NONE, false);
+			XFS_AG_RESV_NONE, 0);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 7278392a8f949..46c0bb67e4c47 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1965,7 +1965,7 @@ xfs_difree_inode_chunk(
 		return xfs_free_extent_later(tp,
 				XFS_AGB_TO_FSB(mp, agno, sagbno),
 				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
-				XFS_AG_RESV_NONE, false);
+				XFS_AG_RESV_NONE, 0);
 	}
 
 	/* holemask is only 16-bits (fits in an unsigned long) */
@@ -2011,8 +2011,7 @@ xfs_difree_inode_chunk(
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
 		error = xfs_free_extent_later(tp,
 				XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
-				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
-				false);
+				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
 		if (error)
 			return error;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 4a220946d99ca..bb6d4eb6698ca 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -161,7 +161,7 @@ __xfs_inobt_free_block(
 	xfs_inobt_mod_blockcount(cur, -1);
 	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
 	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
-			&XFS_RMAP_OINFO_INOBT, resv, false);
+			&XFS_RMAP_OINFO_INOBT, resv, 0);
 }
 
 STATIC int
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 90a4526fef929..e30c4cdfaa392 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1173,7 +1173,7 @@ xfs_refcount_adjust_extents(
 						tmp.rc_startblock);
 				error = xfs_free_extent_later(cur->bc_tp, fsbno,
 						  tmp.rc_blockcount, NULL,
-						  XFS_AG_RESV_NONE, false);
+						  XFS_AG_RESV_NONE, 0);
 				if (error)
 					goto out_error;
 			}
@@ -1237,7 +1237,7 @@ xfs_refcount_adjust_extents(
 					ext.rc_startblock);
 			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					ext.rc_blockcount, NULL,
-					XFS_AG_RESV_NONE, false);
+					XFS_AG_RESV_NONE, 0);
 			if (error)
 				goto out_error;
 		}
@@ -2022,7 +2022,7 @@ xfs_refcount_recover_cow_leftovers(
 		/* Free the block. */
 		error = xfs_free_extent_later(tp, fsb,
 				rr->rr_rrec.rc_blockcount, NULL,
-				XFS_AG_RESV_NONE, false);
+				XFS_AG_RESV_NONE, 0);
 		if (error)
 			goto out_trans;
 
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index ce42ecd3e3715..c2e20854f9bc2 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -108,7 +108,7 @@ xfs_refcountbt_free_block(
 	be32_add_cpu(&agf->agf_refcount_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
 	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
-			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
+			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0);
 }
 
 STATIC int
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index ad2da235d21bb..8375c8b9752a7 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -160,7 +160,8 @@ xrep_newbt_add_blocks(
 	if (args->tp) {
 		ASSERT(xnr->oinfo.oi_offset == 0);
 
-		error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+		error = xfs_alloc_schedule_autoreap(args,
+				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
 		if (error)
 			goto out_pag;
 	}
@@ -414,7 +415,7 @@ xrep_newbt_free_extent(
 	 */
 	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
 	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
-			xnr->resv, true);
+			xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 01ceaa4efa16b..37eb61906be18 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -451,7 +451,7 @@ xreap_agextent_iter(
 
 		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
 		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
-				rs->resv, true);
+				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
 		if (error)
 			return error;
 
@@ -477,7 +477,7 @@ xreap_agextent_iter(
 	 * system with large EFIs.
 	 */
 	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
-			rs->resv, true);
+			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
 	if (error)
 		return error;
 
@@ -943,7 +943,8 @@ xrep_reap_bmapi_iter(
 	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
 			-(int64_t)imap->br_blockcount);
 	return xfs_free_extent_later(sc->tp, imap->br_startblock,
-			imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
+			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
+			XFS_FREE_EXTENT_SKIP_DISCARD);
 }
 
 /*
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 69dfd7e430fe7..7e9273cc16e14 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -619,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
 
 			error = xfs_free_extent_later(*tpp, del.br_startblock,
 					del.br_blockcount, NULL,
-					XFS_AG_RESV_NONE, false);
+					XFS_AG_RESV_NONE, 0);
 			if (error)
 				break;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/9] xfs: pass the fsbno to xfs_perag_intent_get
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 1/9] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
  2023-12-31 21:26   ` [PATCH 2/9] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
@ 2023-12-31 21:27   ` Darrick J. Wong
  2023-12-31 21:27   ` [PATCH 4/9] xfs: add a xefi_entry helper Darrick J. Wong
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:27 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

All callers of xfs_perag_intent_get have a fsbno and need boilerplate
code to turn that into an agno.  Just pass the fsbno to
xfs_perag_intent_get and look up the agno there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_bmap_item.c     |    6 +-----
 fs/xfs/xfs_drain.c         |    8 ++++----
 fs/xfs/xfs_drain.h         |    5 +++--
 fs/xfs/xfs_extfree_item.c  |    5 +----
 fs/xfs/xfs_refcount_item.c |    5 +----
 fs/xfs/xfs_rmap_item.c     |    5 +----
 6 files changed, 11 insertions(+), 23 deletions(-)


diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 234296a37b269..e50276ceceae9 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -325,8 +325,6 @@ xfs_bmap_update_get_group(
 	struct xfs_mount	*mp,
 	struct xfs_bmap_intent	*bi)
 {
-	xfs_agnumber_t		agno;
-
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
 		if (xfs_has_rtgroups(mp)) {
 			xfs_rgnumber_t	rgno;
@@ -340,8 +338,6 @@ xfs_bmap_update_get_group(
 		return;
 	}
 
-	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
-
 	/*
 	 * Bump the intent count on behalf of the deferred rmap and refcount
 	 * intent items that that we can queue when we finish this bmap work.
@@ -349,7 +345,7 @@ xfs_bmap_update_get_group(
 	 * intent drops the intent count, ensuring that the intent count
 	 * remains nonzero across the transaction roll.
 	 */
-	bi->bi_pag = xfs_perag_intent_get(mp, agno);
+	bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock);
 }
 
 /* Add this deferred BUI to the transaction. */
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
index 005a66be44a25..7bdb9688c0f5e 100644
--- a/fs/xfs/xfs_drain.c
+++ b/fs/xfs/xfs_drain.c
@@ -94,17 +94,17 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
 }
 
 /*
- * Get a passive reference to an AG and declare an intent to update its
- * metadata.
+ * Get a passive reference to the AG that contains a fsbno and declare an intent
+ * to update its metadata.
  */
 struct xfs_perag *
 xfs_perag_intent_get(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
+	xfs_fsblock_t		fsbno)
 {
 	struct xfs_perag	*pag;
 
-	pag = xfs_perag_get(mp, agno);
+	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno));
 	if (!pag)
 		return NULL;
 
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
index 50a5772a8296c..775164f54ea6d 100644
--- a/fs/xfs/xfs_drain.h
+++ b/fs/xfs/xfs_drain.h
@@ -62,7 +62,7 @@ void xfs_drain_wait_enable(void);
  * until the item is finished or cancelled.
  */
 struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
-		xfs_agnumber_t agno);
+		xfs_fsblock_t fsbno);
 void xfs_perag_intent_put(struct xfs_perag *pag);
 
 void xfs_perag_intent_hold(struct xfs_perag *pag);
@@ -76,7 +76,8 @@ struct xfs_defer_drain { /* empty */ };
 #define xfs_defer_drain_free(dr)		((void)0)
 #define xfs_defer_drain_init(dr)		((void)0)
 
-#define xfs_perag_intent_get(mp, agno)		xfs_perag_get((mp), (agno))
+#define xfs_perag_intent_get(mp, fsbno) \
+	xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno))
 #define xfs_perag_intent_put(pag)		xfs_perag_put(pag)
 
 static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { }
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 190d6a69e40b0..a29105583de96 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -424,10 +424,7 @@ xfs_extent_free_get_group(
 	struct xfs_mount		*mp,
 	struct xfs_extent_free_item	*xefi)
 {
-	xfs_agnumber_t			agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
-	xefi->xefi_pag = xfs_perag_intent_get(mp, agno);
+	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
 }
 
 /* Release a passive AG ref after some freeing work. */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 20ad8086da60b..ff1fdb759a8d2 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -330,10 +330,7 @@ xfs_refcount_update_get_group(
 	struct xfs_mount		*mp,
 	struct xfs_refcount_intent	*ri)
 {
-	xfs_agnumber_t			agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock);
-	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
 }
 
 /* Release a passive AG ref after finishing refcounting work. */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 79ad0087aecaf..183544ac74c98 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -355,10 +355,7 @@ xfs_rmap_update_get_group(
 	struct xfs_mount	*mp,
 	struct xfs_rmap_intent	*ri)
 {
-	xfs_agnumber_t		agno;
-
-	agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
-	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
 }
 
 /* Release a passive AG ref after finishing rmapping work. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/9] xfs: add a xefi_entry helper
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:27   ` [PATCH 3/9] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
@ 2023-12-31 21:27   ` Darrick J. Wong
  2023-12-31 21:27   ` [PATCH 5/9] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:27 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Add a helper to translate from the item list head to the
xfs_extent_free_item structure and use it so shorten assignments
and avoid the need for extra local variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_extfree_item.c |   22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)


diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a29105583de96..d86ec1d5e468a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -303,6 +303,11 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
 	.iop_intent	= xfs_efd_item_intent,
 };
 
+static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_extent_free_item, xefi_list);
+}
+
 /*
  * Fill the EFD with all extents from the EFI when we need to roll the
  * transaction and continue with a new EFI.
@@ -338,11 +343,8 @@ xfs_extent_free_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_extent_free_item	*ra;
-	struct xfs_extent_free_item	*rb;
-
-	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
-	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+	struct xfs_extent_free_item	*ra = xefi_entry(a);
+	struct xfs_extent_free_item	*rb = xefi_entry(b);
 
 	return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
 }
@@ -444,7 +446,7 @@ xfs_extent_free_finish_item(
 	struct xfs_btree_cur		**state)
 {
 	struct xfs_owner_info		oinfo = { };
-	struct xfs_extent_free_item	*xefi;
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 	struct xfs_efd_log_item		*efdp = EFD_ITEM(done);
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_extent		*extp;
@@ -452,7 +454,6 @@ xfs_extent_free_finish_item(
 	xfs_agblock_t			agbno;
 	int				error = 0;
 
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
 
 	oinfo.oi_owner = xefi->xefi_owner;
@@ -504,9 +505,7 @@ STATIC void
 xfs_extent_free_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_extent_free_item	*xefi;
-
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
 	xfs_extent_free_put_group(xefi);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
@@ -526,14 +525,13 @@ xfs_agfl_free_finish_item(
 	struct xfs_owner_info		oinfo = { };
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_efd_log_item		*efdp = EFD_ITEM(done);
-	struct xfs_extent_free_item	*xefi;
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 	struct xfs_extent		*extp;
 	struct xfs_buf			*agbp;
 	int				error;
 	xfs_agblock_t			agbno;
 	uint				next_extent;
 
-	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 	ASSERT(xefi->xefi_blockcount == 1);
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
 	oinfo.oi_owner = xefi->xefi_owner;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/9] xfs: reuse xfs_extent_free_cancel_item
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:27   ` [PATCH 4/9] xfs: add a xefi_entry helper Darrick J. Wong
@ 2023-12-31 21:27   ` Darrick J. Wong
  2023-12-31 21:27   ` [PATCH 6/9] xfs: factor out a xfs_efd_add_extent helper Darrick J. Wong
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:27 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Reuse xfs_extent_free_cancel_item to put the AG/RTG and free the item in
a few places that currently open code the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_extfree_item.c |   28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)


diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d86ec1d5e468a..9d141b9876572 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -437,6 +437,17 @@ xfs_extent_free_put_group(
 	xfs_perag_intent_put(xefi->xefi_pag);
 }
 
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+
+	xfs_extent_free_put_group(xefi);
+	kmem_cache_free(xfs_extfree_item_cache, xefi);
+}
+
 /* Process a free extent. */
 STATIC int
 xfs_extent_free_finish_item(
@@ -487,8 +498,7 @@ xfs_extent_free_finish_item(
 	extp->ext_len = xefi->xefi_blockcount;
 	efdp->efd_next_extent++;
 
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	xfs_extent_free_cancel_item(item);
 	return error;
 }
 
@@ -500,17 +510,6 @@ xfs_extent_free_abort_intent(
 	xfs_efi_release(EFI_ITEM(intent));
 }
 
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_extent_free_item	*xefi = xefi_entry(item);
-
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
-}
-
 /*
  * AGFL blocks are accounted differently in the reserve pools and are not
  * inserted into the busy extent list.
@@ -550,8 +549,7 @@ xfs_agfl_free_finish_item(
 	extp->ext_len = xefi->xefi_blockcount;
 	efdp->efd_next_extent++;
 
-	xfs_extent_free_put_group(xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	xfs_extent_free_cancel_item(&xefi->xefi_list);
 	return error;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/9] xfs: factor out a xfs_efd_add_extent helper
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:27   ` [PATCH 5/9] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
@ 2023-12-31 21:27   ` Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 7/9] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:27 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Factor out a helper to add an extent to and EFD instead of duplicating
the logic in two places.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_extfree_item.c |   37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)


diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 9d141b9876572..be932390bd1f7 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -336,6 +336,22 @@ xfs_efd_from_efi(
 	efdp->efd_next_extent = efip->efi_format.efi_nextents;
 }
 
+static void
+xfs_efd_add_extent(
+	struct xfs_efd_log_item		*efdp,
+	struct xfs_extent_free_item	*xefi)
+{
+	struct xfs_extent		*extp;
+
+	ASSERT(efdp->efd_next_extent < efdp->efd_format.efd_nextents);
+
+	extp = &efdp->efd_format.efd_extents[efdp->efd_next_extent];
+	extp->ext_start = xefi->xefi_startblock;
+	extp->ext_len = xefi->xefi_blockcount;
+
+	efdp->efd_next_extent++;
+}
+
 /* Sort bmap items by AG. */
 static int
 xfs_extent_free_diff_items(
@@ -460,8 +476,6 @@ xfs_extent_free_finish_item(
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 	struct xfs_efd_log_item		*efdp = EFD_ITEM(done);
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_extent		*extp;
-	uint				next_extent;
 	xfs_agblock_t			agbno;
 	int				error = 0;
 
@@ -490,14 +504,7 @@ xfs_extent_free_finish_item(
 		return error;
 	}
 
-	/* Add the work we finished to the EFD, even though nobody uses that */
-	next_extent = efdp->efd_next_extent;
-	ASSERT(next_extent < efdp->efd_format.efd_nextents);
-	extp = &(efdp->efd_format.efd_extents[next_extent]);
-	extp->ext_start = xefi->xefi_startblock;
-	extp->ext_len = xefi->xefi_blockcount;
-	efdp->efd_next_extent++;
-
+	xfs_efd_add_extent(efdp, xefi);
 	xfs_extent_free_cancel_item(item);
 	return error;
 }
@@ -525,11 +532,9 @@ xfs_agfl_free_finish_item(
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_efd_log_item		*efdp = EFD_ITEM(done);
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
-	struct xfs_extent		*extp;
 	struct xfs_buf			*agbp;
 	int				error;
 	xfs_agblock_t			agbno;
-	uint				next_extent;
 
 	ASSERT(xefi->xefi_blockcount == 1);
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
@@ -542,13 +547,7 @@ xfs_agfl_free_finish_item(
 		error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
 				agbno, agbp, &oinfo);
 
-	next_extent = efdp->efd_next_extent;
-	ASSERT(next_extent < efdp->efd_format.efd_nextents);
-	extp = &(efdp->efd_format.efd_extents[next_extent]);
-	extp->ext_start = xefi->xefi_startblock;
-	extp->ext_len = xefi->xefi_blockcount;
-	efdp->efd_next_extent++;
-
+	xfs_efd_add_extent(efdp, xefi);
 	xfs_extent_free_cancel_item(&xefi->xefi_list);
 	return error;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/9] xfs: remove duplicate asserts in xfs_defer_extent_free
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:27   ` [PATCH 6/9] xfs: factor out a xfs_efd_add_extent helper Darrick J. Wong
@ 2023-12-31 21:28   ` Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 8/9] xfs: remove xfs_defer_agfl_block Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 9/9] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:28 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

The bno/len verification is already done by the calls to
xfs_verify_rtbext / xfs_verify_fsbext, and reporting a corruption error
seem like the better handling than tripping an assert anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc.c |   13 -------------
 1 file changed, 13 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 1ea88d4675f5f..ca4df5e0b2a31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2596,23 +2596,10 @@ xfs_defer_extent_free(
 {
 	struct xfs_extent_free_item	*xefi;
 	struct xfs_mount		*mp = tp->t_mountp;
-#ifdef DEBUG
-	xfs_agnumber_t			agno;
-	xfs_agblock_t			agbno;
 
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(len > 0);
 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 	ASSERT(!isnullstartblock(bno));
-	agno = XFS_FSB_TO_AGNO(mp, bno);
-	agbno = XFS_FSB_TO_AGBNO(mp, bno);
-	ASSERT(agno < mp->m_sb.sb_agcount);
-	ASSERT(agbno < mp->m_sb.sb_agblocks);
-	ASSERT(len < mp->m_sb.sb_agblocks);
-	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
 	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
-	ASSERT(xfs_extfree_item_cache != NULL);
 	ASSERT(type != XFS_AG_RESV_AGFL);
 
 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/9] xfs: remove xfs_defer_agfl_block
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:28   ` [PATCH 7/9] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
@ 2023-12-31 21:28   ` Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 9/9] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:28 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

xfs_free_extent_later can handle the extra AGFL special casing with
very little extra logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc.c |   67 +++++++++++++++------------------------------
 1 file changed, 22 insertions(+), 45 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ca4df5e0b2a31..0227985676042 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2538,48 +2538,6 @@ xfs_agfl_reset(
 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
 }
 
-/*
- * Defer an AGFL block free. This is effectively equivalent to
- * xfs_free_extent_later() with some special handling particular to AGFL blocks.
- *
- * Deferring AGFL frees helps prevent log reservation overruns due to too many
- * allocation operations in a transaction. AGFL frees are prone to this problem
- * because for one they are always freed one at a time. Further, an immediate
- * AGFL block free can cause a btree join and require another block free before
- * the real allocation can proceed. Deferring the free disconnects freeing up
- * the AGFL slot from freeing the block.
- */
-static int
-xfs_defer_agfl_block(
-	struct xfs_trans		*tp,
-	xfs_agnumber_t			agno,
-	xfs_agblock_t			agbno,
-	struct xfs_owner_info		*oinfo)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_extent_free_item	*xefi;
-	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
-
-	ASSERT(xfs_extfree_item_cache != NULL);
-	ASSERT(oinfo != NULL);
-
-	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
-		return -EFSCORRUPTED;
-
-	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
-			       GFP_KERNEL | __GFP_NOFAIL);
-	xefi->xefi_startblock = fsbno;
-	xefi->xefi_blockcount = 1;
-	xefi->xefi_owner = oinfo->oi_owner;
-	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
-
-	trace_xfs_agfl_free_defer(mp, xefi);
-
-	xfs_extent_free_get_group(mp, xefi);
-	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
-	return 0;
-}
-
 /*
  * Add the extent to the list of extents to be free at transaction end.
  * The list is maintained sorted (by block number).
@@ -2627,7 +2585,13 @@ xfs_defer_extent_free(
 	trace_xfs_extent_free_defer(mp, xefi);
 
 	xfs_extent_free_get_group(mp, xefi);
-	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
+
+	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_agfl_free_defer_type);
+	else
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_extent_free_defer_type);
 	return 0;
 }
 
@@ -2885,8 +2849,21 @@ xfs_alloc_fix_freelist(
 		if (error)
 			goto out_agbp_relse;
 
-		/* defer agfl frees */
-		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		/*
+		 * Defer the AGFL block free.
+		 *
+		 * This helps to prevent log reservation overruns due to too
+		 * many allocation operations in a transaction. AGFL frees are
+		 * prone to this problem because for one they are always freed
+		 * one at a time.  Further, an immediate AGFL block free can
+		 * cause a btree join and require another block free before the
+		 * real allocation can proceed.
+		 * Deferring the free disconnects freeing up the AGFL slot from
+		 * freeing the block.
+		 */
+		error = xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, args->agno, bno), 1,
+				&targs.oinfo, XFS_AG_RESV_AGFL, 0);
 		if (error)
 			goto out_agbp_relse;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 9/9] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c
  2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:28   ` [PATCH 8/9] xfs: remove xfs_defer_agfl_block Darrick J. Wong
@ 2023-12-31 21:28   ` Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:28 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_extent_free_item deferred work
data to a transaction live with the EFI log item code.  This means that
the allocator code no longer has to know about the inner workings of the
EFI log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc.c |   12 ++----------
 fs/xfs/libxfs/xfs_alloc.h |    3 ---
 fs/xfs/xfs_extfree_item.c |   31 +++++++++++++++++--------------
 fs/xfs/xfs_extfree_item.h |    6 ++++++
 4 files changed, 25 insertions(+), 27 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 0227985676042..5d63711ad1aac 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -27,6 +27,7 @@
 #include "xfs_ag_resv.h"
 #include "xfs_bmap.h"
 #include "xfs_health.h"
+#include "xfs_extfree_item.h"
 
 struct kmem_cache	*xfs_extfree_item_cache;
 
@@ -2582,16 +2583,7 @@ xfs_defer_extent_free(
 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
 	}
 
-	trace_xfs_extent_free_defer(mp, xefi);
-
-	xfs_extent_free_get_group(mp, xefi);
-
-	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
-		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
-				&xfs_agfl_free_defer_type);
-	else
-		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
-				&xfs_extent_free_defer_type);
+	xfs_extent_free_defer_add(tp, xefi, dfpp);
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2da543fb90ecd..0ed71a31fe7ce 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -254,9 +254,6 @@ struct xfs_extent_free_item {
 	enum xfs_ag_resv_type	xefi_agresv;
 };
 
-void xfs_extent_free_get_group(struct xfs_mount *mp,
-		struct xfs_extent_free_item *xefi);
-
 #define XFS_EFI_SKIP_DISCARD	(1U << 0) /* don't issue discard */
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index be932390bd1f7..e8569bf26819c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -436,21 +436,24 @@ xfs_extent_free_create_done(
 	return &efdp->efd_item;
 }
 
-/* Take a passive ref to the AG containing the space we're freeing. */
+/* Add this deferred EFI to the transaction. */
 void
-xfs_extent_free_get_group(
-	struct xfs_mount		*mp,
-	struct xfs_extent_free_item	*xefi)
+xfs_extent_free_defer_add(
+	struct xfs_trans		*tp,
+	struct xfs_extent_free_item	*xefi,
+	struct xfs_defer_pending	**dfpp)
 {
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	trace_xfs_extent_free_defer(mp, xefi);
+
 	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
-}
-
-/* Release a passive AG ref after some freeing work. */
-static inline void
-xfs_extent_free_put_group(
-	struct xfs_extent_free_item	*xefi)
-{
-	xfs_perag_intent_put(xefi->xefi_pag);
+	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_agfl_free_defer_type);
+	else
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_extent_free_defer_type);
 }
 
 /* Cancel a free extent. */
@@ -460,7 +463,7 @@ xfs_extent_free_cancel_item(
 {
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
-	xfs_extent_free_put_group(xefi);
+	xfs_perag_intent_put(xefi->xefi_pag);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 }
 
@@ -575,7 +578,7 @@ xfs_efi_recover_work(
 	xefi->xefi_blockcount = extp->ext_len;
 	xefi->xefi_agresv = XFS_AG_RESV_NONE;
 	xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
-	xfs_extent_free_get_group(mp, xefi);
+	xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
 
 	xfs_defer_add_item(dfp, &xefi->xefi_list);
 }
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index da6a5afa607cf..41b7c43060799 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -88,4 +88,10 @@ xfs_efd_log_item_sizeof(
 extern struct kmem_cache	*xfs_efi_cache;
 extern struct kmem_cache	*xfs_efd_cache;
 
+struct xfs_extent_free_item;
+
+void xfs_extent_free_defer_add(struct xfs_trans *tp,
+		struct xfs_extent_free_item *xefi,
+		struct xfs_defer_pending **dfpp);
+
 #endif	/* __XFS_EXTFREE_ITEM_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] xfs: support logging EFIs for realtime extents
  2023-12-31 19:36 ` [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt Darrick J. Wong
@ 2023-12-31 21:28   ` Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 2/2] xfs: support error injection when freeing rt extents Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:28 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the EFI mechanism how to free realtime extents.  We're going to
need this to enforce proper ordering of operations when we enable
realtime rmap.

Declare a new log intent item type (XFS_LI_EFI_RT) and a separate defer
ops for rt extents.  This keeps the ondisk artifacts and processing code
completely separate between the rt and non-rt cases.  Hopefully this
will make it easier to debug filesystem problems.

Previous versions of this patch accomplished this by setting the high
bit in each rt EFI extent.  This was found to be less transparent by
reviewers.

[Contains a bug fix and cleanups from hch]

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_alloc.c       |   16 ++
 fs/xfs/libxfs/xfs_alloc.h       |   17 ++
 fs/xfs/libxfs/xfs_defer.c       |    6 +
 fs/xfs/libxfs/xfs_defer.h       |    1 
 fs/xfs/libxfs/xfs_log_format.h  |    6 +
 fs/xfs/libxfs/xfs_log_recover.h |    2 
 fs/xfs/xfs_extfree_item.c       |  282 ++++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_log_recover.c        |    2 
 8 files changed, 306 insertions(+), 26 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 5d63711ad1aac..cf56784aabbbc 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2559,10 +2559,18 @@ xfs_defer_extent_free(
 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 	ASSERT(!isnullstartblock(bno));
 	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
-	ASSERT(type != XFS_AG_RESV_AGFL);
 
-	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
-		return -EFSCORRUPTED;
+	if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+		if (type != XFS_AG_RESV_NONE) {
+			ASSERT(type == XFS_AG_RESV_NONE);
+			return -EFSCORRUPTED;
+		}
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	} else {
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	}
 
 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
 			       GFP_KERNEL | __GFP_NOFAIL);
@@ -2571,6 +2579,8 @@ xfs_defer_extent_free(
 	xefi->xefi_agresv = type;
 	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+	if (free_flags & XFS_FREE_EXTENT_REALTIME)
+		xefi->xefi_flags |= XFS_EFI_REALTIME;
 	if (oinfo) {
 		ASSERT(oinfo->oi_offset == 0);
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0ed71a31fe7ce..130026e981ea2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -238,7 +238,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 /* Don't issue a discard for the blocks freed. */
 #define XFS_FREE_EXTENT_SKIP_DISCARD	(1U << 0)
 
-#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD)
+/* Free blocks on the realtime device. */
+#define XFS_FREE_EXTENT_REALTIME	(1U << 1)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS	(XFS_FREE_EXTENT_SKIP_DISCARD | \
+					 XFS_FREE_EXTENT_REALTIME)
 
 /*
  * List of extents to be free "later".
@@ -249,7 +253,10 @@ struct xfs_extent_free_item {
 	uint64_t		xefi_owner;
 	xfs_fsblock_t		xefi_startblock;/* starting fs block number */
 	xfs_extlen_t		xefi_blockcount;/* number of blocks in extent */
-	struct xfs_perag	*xefi_pag;
+	union {
+		struct xfs_perag	*xefi_pag;
+		struct xfs_rtgroup	*xefi_rtg;
+	};
 	unsigned int		xefi_flags;
 	enum xfs_ag_resv_type	xefi_agresv;
 };
@@ -258,6 +265,12 @@ struct xfs_extent_free_item {
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
 #define XFS_EFI_CANCELLED	(1U << 3) /* dont actually free the space */
+#define XFS_EFI_REALTIME	(1U << 4) /* freeing realtime extent */
+
+static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi)
+{
+	return xefi->xefi_flags & XFS_EFI_REALTIME;
+}
 
 struct xfs_alloc_autoreap {
 	struct xfs_defer_pending	*dfp;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 8788f9f3f19ec..cd28b96b49ea9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -845,6 +845,12 @@ xfs_defer_add(
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 
+	if (!ops->finish_item) {
+		ASSERT(ops->finish_item != NULL);
+		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+		return NULL;
+	}
+
 	dfp = xfs_defer_find_last(tp, ops);
 	if (!dfp || !xfs_defer_can_append(dfp, ops))
 		dfp = xfs_defer_alloc(tp, ops);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index c9a1fe3fe363e..b4e1c386768c9 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -71,6 +71,7 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_attr_defer_type;
 extern const struct xfs_defer_op_type xfs_swapext_defer_type;
 
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index bded03634e53d..1f5fe4a588eca 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -248,6 +248,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
 #define	XFS_LI_SXI		0x1248  /* extent swap intent */
 #define	XFS_LI_SXD		0x1249  /* extent swap done */
+#define	XFS_LI_EFI_RT		0x124a	/* realtime extent free intent */
+#define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -267,7 +269,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
 	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
 	{ XFS_LI_SXI,		"XFS_LI_SXI" }, \
-	{ XFS_LI_SXD,		"XFS_LI_SXD" }
+	{ XFS_LI_SXD,		"XFS_LI_SXD" }, \
+	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
+	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }
 
 /*
  * Inode Log Item Format definitions.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 891221b0b83aa..811c37026d251 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -77,6 +77,8 @@ extern const struct xlog_recover_item_ops xlog_attri_item_ops;
 extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
 extern const struct xlog_recover_item_ops xlog_sxi_item_ops;
 extern const struct xlog_recover_item_ops xlog_sxd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
 
 /*
  * Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e8569bf26819c..51a363d85978f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -25,6 +25,10 @@
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_efi_cache;
 struct kmem_cache	*xfs_efd_cache;
@@ -95,16 +99,15 @@ xfs_efi_item_format(
 
 	ASSERT(atomic_read(&efip->efi_next_extent) ==
 				efip->efi_format.efi_nextents);
+	ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT);
 
-	efip->efi_format.efi_type = XFS_LI_EFI;
+	efip->efi_format.efi_type = lip->li_type;
 	efip->efi_format.efi_size = 1;
 
-	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
-			&efip->efi_format,
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format,
 			xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
 }
 
-
 /*
  * The unpin operation is the last place an EFI is manipulated in the log. It is
  * either inserted in the AIL or aborted in the event of a log I/O error. In
@@ -140,12 +143,14 @@ xfs_efi_item_release(
 STATIC struct xfs_efi_log_item *
 xfs_efi_init(
 	struct xfs_mount	*mp,
+	unsigned short		item_type,
 	uint			nextents)
-
 {
 	struct xfs_efi_log_item	*efip;
 
+	ASSERT(item_type == XFS_LI_EFI || item_type == XFS_LI_EFI_RT);
 	ASSERT(nextents > 0);
+
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
 				GFP_KERNEL | __GFP_NOFAIL);
@@ -154,7 +159,7 @@ xfs_efi_init(
 					 GFP_KERNEL | __GFP_NOFAIL);
 	}
 
-	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+	xfs_log_item_init(mp, &efip->efi_item, item_type, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (uintptr_t)(void *)efip;
 	atomic_set(&efip->efi_next_extent, 0);
@@ -264,12 +269,12 @@ xfs_efd_item_format(
 	struct xfs_log_iovec	*vecp = NULL;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+	ASSERT(lip->li_type == XFS_LI_EFD || lip->li_type == XFS_LI_EFD_RT);
 
-	efdp->efd_format.efd_type = XFS_LI_EFD;
+	efdp->efd_format.efd_type = lip->li_type;
 	efdp->efd_format.efd_size = 1;
 
-	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
-			&efdp->efd_format,
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format,
 			xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
 }
 
@@ -308,6 +313,14 @@ static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e)
 	return list_entry(e, struct xfs_extent_free_item, xefi_list);
 }
 
+static inline bool
+xfs_efi_item_isrt(const struct xfs_log_item *lip)
+{
+	ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT);
+
+	return lip->li_type == XFS_LI_EFI_RT;
+}
+
 /*
  * Fill the EFD with all extents from the EFI when we need to roll the
  * transaction and continue with a new EFI.
@@ -395,11 +408,12 @@ xfs_extent_free_create_intent(
 	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_efi_log_item		*efip = xfs_efi_init(mp, count);
+	struct xfs_efi_log_item		*efip;
 	struct xfs_extent_free_item	*xefi;
 
 	ASSERT(count > 0);
 
+	efip = xfs_efi_init(mp, XFS_LI_EFI, count);
 	if (sort)
 		list_sort(mp, items, xfs_extent_free_diff_items);
 	list_for_each_entry(xefi, items, xefi_list)
@@ -407,6 +421,12 @@ xfs_extent_free_create_intent(
 	return &efip->efi_item;
 }
 
+static inline unsigned short
+xfs_efd_type_from_efi(const struct xfs_efi_log_item *efip)
+{
+	return xfs_efi_item_isrt(&efip->efi_item) ?  XFS_LI_EFD_RT : XFS_LI_EFD;
+}
+
 /* Get an EFD so we can process all the free extents. */
 static struct xfs_log_item *
 xfs_extent_free_create_done(
@@ -427,8 +447,8 @@ xfs_extent_free_create_done(
 					GFP_KERNEL | __GFP_NOFAIL);
 	}
 
-	xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
-			  &xfs_efd_item_ops);
+	xfs_log_item_init(tp->t_mountp, &efdp->efd_item,
+			xfs_efd_type_from_efi(efip), &xfs_efd_item_ops);
 	efdp->efd_efip = efip;
 	efdp->efd_format.efd_nextents = count;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
@@ -447,6 +467,17 @@ xfs_extent_free_defer_add(
 
 	trace_xfs_extent_free_defer(mp, xefi);
 
+	if (xfs_efi_is_realtime(xefi)) {
+		xfs_rgnumber_t		rgno;
+
+		rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
+		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+
+		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+				&xfs_rtextent_free_defer_type);
+		return;
+	}
+
 	xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
 	if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
 		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
@@ -559,8 +590,12 @@ xfs_agfl_free_finish_item(
 static inline bool
 xfs_efi_validate_ext(
 	struct xfs_mount		*mp,
+	bool				isrt,
 	struct xfs_extent		*extp)
 {
+	if (isrt)
+		return xfs_verify_rtbext(mp, extp->ext_start, extp->ext_len);
+
 	return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
 }
 
@@ -568,6 +603,7 @@ static inline void
 xfs_efi_recover_work(
 	struct xfs_mount		*mp,
 	struct xfs_defer_pending	*dfp,
+	bool				isrt,
 	struct xfs_extent		*extp)
 {
 	struct xfs_extent_free_item	*xefi;
@@ -578,7 +614,15 @@ xfs_efi_recover_work(
 	xefi->xefi_blockcount = extp->ext_len;
 	xefi->xefi_agresv = XFS_AG_RESV_NONE;
 	xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
-	xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
+	if (isrt) {
+		xfs_rgnumber_t		rgno;
+
+		xefi->xefi_flags |= XFS_EFI_REALTIME;
+		rgno = xfs_rtb_to_rgno(mp, extp->ext_start);
+		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+	} else {
+		xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
+	}
 
 	xfs_defer_add_item(dfp, &xefi->xefi_list);
 }
@@ -599,14 +643,15 @@ xfs_extent_free_recover_work(
 	struct xfs_trans		*tp;
 	int				i;
 	int				error = 0;
+	bool				isrt = xfs_efi_item_isrt(lip);
 
 	/*
-	 * First check the validity of the extents described by the
-	 * EFI.  If any are bad, then assume that all are bad and
-	 * just toss the EFI.
+	 * First check the validity of the extents described by the EFI.  If
+	 * any are bad, then assume that all are bad and just toss the EFI.
+	 * Mixing RT and non-RT extents in the same EFI item is not allowed.
 	 */
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
-		if (!xfs_efi_validate_ext(mp,
+		if (!xfs_efi_validate_ext(mp, isrt,
 					&efip->efi_format.efi_extents[i])) {
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					&efip->efi_format,
@@ -614,7 +659,8 @@ xfs_extent_free_recover_work(
 			return -EFSCORRUPTED;
 		}
 
-		xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]);
+		xfs_efi_recover_work(mp, dfp, isrt,
+				&efip->efi_format.efi_extents[i]);
 	}
 
 	resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -652,10 +698,12 @@ xfs_extent_free_relog_intent(
 	count = EFI_ITEM(intent)->efi_format.efi_nextents;
 	extp = EFI_ITEM(intent)->efi_format.efi_extents;
 
+	ASSERT(intent->li_type == XFS_LI_EFI || intent->li_type == XFS_LI_EFI_RT);
+
 	efdp->efd_next_extent = count;
 	memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
 
-	efip = xfs_efi_init(tp->t_mountp, count);
+	efip = xfs_efi_init(tp->t_mountp, intent->li_type, count);
 	memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
 	atomic_set(&efip->efi_next_extent, count);
 
@@ -687,6 +735,107 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
 	.relog_intent	= xfs_extent_free_relog_intent,
 };
 
+#ifdef CONFIG_XFS_RT
+/* Sort realtime efi items by rtgroup. */
+static int
+xfs_rtextent_free_diff_items(
+	void				*priv,
+	const struct list_head		*a,
+	const struct list_head		*b)
+{
+	struct xfs_extent_free_item	*ra = xefi_entry(a);
+	struct xfs_extent_free_item	*rb = xefi_entry(b);
+
+	return ra->xefi_rtg->rtg_rgno - rb->xefi_rtg->rtg_rgno;
+}
+
+/* Create a realtime extent freeing */
+static struct xfs_log_item *
+xfs_rtextent_free_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_efi_log_item		*efip;
+	struct xfs_extent_free_item	*xefi;
+
+	ASSERT(count > 0);
+
+	efip = xfs_efi_init(mp, XFS_LI_EFI_RT, count);
+	if (sort)
+		list_sort(mp, items, xfs_rtextent_free_diff_items);
+	list_for_each_entry(xefi, items, xefi_list)
+		xfs_extent_free_log_item(tp, efip, xefi);
+	return &efip->efi_item;
+}
+
+/* Cancel a realtime extent freeing. */
+STATIC void
+xfs_rtextent_free_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+
+	xfs_rtgroup_put(xefi->xefi_rtg);
+	kmem_cache_free(xfs_extfree_item_cache, xefi);
+}
+
+/* Process a free realtime extent. */
+STATIC int
+xfs_rtextent_free_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_extent_free_item	*xefi = xefi_entry(item);
+	struct xfs_efd_log_item		*efdp = EFD_ITEM(done);
+	int				error = 0;
+
+	/*
+	 * Lock the rt bitmap if we've any realtime extents to free and we
+	 * haven't locked the rt inodes yet.
+	 */
+	if (*state == NULL) {
+		xfs_rtbitmap_lock(tp, mp);
+		*state = (struct xfs_btree_cur *)1;
+	}
+
+	trace_xfs_extent_free_deferred(mp, xefi);
+
+	if (!(xefi->xefi_flags & XFS_EFI_CANCELLED))
+		error = xfs_rtfree_blocks(tp, xefi->xefi_startblock,
+					xefi->xefi_blockcount);
+	if (error == -EAGAIN) {
+		xfs_efd_from_efi(efdp);
+		return error;
+	}
+
+	xfs_efd_add_extent(efdp, xefi);
+	xfs_rtextent_free_cancel_item(item);
+	return error;
+}
+
+const struct xfs_defer_op_type xfs_rtextent_free_defer_type = {
+	.name		= "rtextent_free",
+	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
+	.create_intent	= xfs_rtextent_free_create_intent,
+	.abort_intent	= xfs_extent_free_abort_intent,
+	.create_done	= xfs_extent_free_create_done,
+	.finish_item	= xfs_rtextent_free_finish_item,
+	.cancel_item	= xfs_rtextent_free_cancel_item,
+	.recover_work	= xfs_extent_free_recover_work,
+	.relog_intent	= xfs_extent_free_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtextent_free_defer_type = {
+	.name		= "rtextent_free",
+};
+#endif /* CONFIG_XFS_RT */
+
 STATIC bool
 xfs_efi_item_match(
 	struct xfs_log_item	*lip,
@@ -731,7 +880,7 @@ xlog_recover_efi_commit_pass2(
 		return -EFSCORRUPTED;
 	}
 
-	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+	efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents);
 	error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
 	if (error) {
 		xfs_efi_item_free(efip);
@@ -749,6 +898,58 @@ const struct xlog_recover_item_ops xlog_efi_item_ops = {
 	.commit_pass2		= xlog_recover_efi_commit_pass2,
 };
 
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtefi_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_efi_log_item		*efip;
+	struct xfs_efi_log_format	*efi_formatp;
+	int				error;
+
+	efi_formatp = item->ri_buf[0].i_addr;
+
+	if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents);
+	error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+	if (error) {
+		xfs_efi_item_free(efip);
+		return error;
+	}
+	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+
+	xlog_recover_intent_item(log, &efip->efi_item, lsn,
+			&xfs_rtextent_free_defer_type);
+	return 0;
+}
+#else
+STATIC int
+xlog_recover_rtefi_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+			item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+	return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtefi_item_ops = {
+	.item_type		= XFS_LI_EFI_RT,
+	.commit_pass2		= xlog_recover_rtefi_commit_pass2,
+};
+
 /*
  * This routine is called when an EFD format structure is found in a committed
  * transaction in the log. Its purpose is to cancel the corresponding EFI if it
@@ -791,3 +992,44 @@ const struct xlog_recover_item_ops xlog_efd_item_ops = {
 	.item_type		= XFS_LI_EFD,
 	.commit_pass2		= xlog_recover_efd_commit_pass2,
 };
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtefd_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_efd_log_format	*efd_formatp;
+	int				buflen = item->ri_buf[0].i_len;
+
+	efd_formatp = item->ri_buf[0].i_addr;
+
+	if (buflen < sizeof(struct xfs_efd_log_format)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+				efd_formatp, buflen);
+		return -EFSCORRUPTED;
+	}
+
+	if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+						efd_formatp->efd_nextents) &&
+	    item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+						efd_formatp->efd_nextents)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+				efd_formatp, buflen);
+		return -EFSCORRUPTED;
+	}
+
+	xlog_recover_release_intent(log, XFS_LI_EFI_RT,
+			efd_formatp->efd_efi_id);
+	return 0;
+}
+#else
+# define xlog_recover_rtefd_commit_pass2	xlog_recover_rtefi_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtefd_item_ops = {
+	.item_type		= XFS_LI_EFD_RT,
+	.commit_pass2		= xlog_recover_rtefd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5e9562f37dc89..0aeca77d511d0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1793,6 +1793,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
 	&xlog_attrd_item_ops,
 	&xlog_sxi_item_ops,
 	&xlog_sxd_item_ops,
+	&xlog_rtefi_item_ops,
+	&xlog_rtefd_item_ops,
 };
 
 static const struct xlog_recover_item_ops *


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] xfs: support error injection when freeing rt extents
  2023-12-31 19:36 ` [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt Darrick J. Wong
  2023-12-31 21:28   ` [PATCH 1/2] xfs: support logging EFIs for realtime extents Darrick J. Wong
@ 2023-12-31 21:29   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:29 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

A handful of fstests expect to be able to test what happens when extent
free intents fail to actually free the extent.  Now that we're
supporting EFIs for realtime extents, add to xfs_rtfree_extent the same
injection point that exists in the regular extent freeing code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtbitmap.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 0ef14157e8157..16471ad8365d4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -20,6 +20,7 @@
 #include "xfs_health.h"
 #include "xfs_log.h"
 #include "xfs_buf_item.h"
+#include "xfs_errortag.h"
 
 /*
  * Realtime allocator bitmap functions shared with userspace.
@@ -1040,6 +1041,9 @@ xfs_rtfree_extent(
 	ASSERT(mp->m_rbmip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+		return -EIO;
+
 	error = xfs_rtcheck_alloc_range(&args, start, len);
 	if (error)
 		return error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/10] xfs: attach rtgroup objects to btree cursors
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
@ 2023-12-31 21:29   ` Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 02/10] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:29 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it so that we can attach realtime group objects to btree cursors.
This will be crucial for enabling rmap btrees in realtime groups.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c |    4 ++++
 fs/xfs/libxfs/xfs_btree.h |    2 ++
 2 files changed, 6 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ddbe2d55a8077..5b4a2c6a3f2ac 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -31,6 +31,7 @@
 #include "scrub/xfile.h"
 #include "scrub/xfbtree.h"
 #include "xfs_btree_mem.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Btree magic numbers.
@@ -476,6 +477,9 @@ xfs_btree_del_cursor(
 	       xfs_is_shutdown(cur->bc_mp) || error != 0);
 	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 		kmem_free(cur->bc_ops);
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    !(cur->bc_flags & XFS_BTREE_IN_XFILE) && cur->bc_ino.rtg)
+		xfs_rtgroup_put(cur->bc_ino.rtg);
 	if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
 	    !(cur->bc_flags & XFS_BTREE_IN_XFILE) && cur->bc_ag.pag)
 		xfs_perag_put(cur->bc_ag.pag);
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index bb6c2feecea7d..ce0bc5dfffe1c 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -12,6 +12,7 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_ifork;
 struct xfs_perag;
+struct xfs_rtgroup;
 
 /*
  * Generic key, ptr and record wrapper structures.
@@ -247,6 +248,7 @@ struct xfs_btree_cur_ag {
 /* Btree-in-inode cursor information */
 struct xfs_btree_cur_ino {
 	struct xfs_inode		*ip;
+	struct xfs_rtgroup		*rtg;	/* if realtime metadata */
 	struct xbtree_ifakeroot		*ifake;	/* for staging cursor */
 	int				allocated;
 	short				forksize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/10] xfs: give rmap btree cursor error tracepoints their own class
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 01/10] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
@ 2023-12-31 21:29   ` Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 03/10] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:29 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new tracepoint class for btree-related errors, then convert all
the rmap tracepoints to use it.  Also fix the one tracepoint that was
abusing the old class by making it a separate tracepoint.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |   33 +++++---------
 fs/xfs/xfs_trace.h       |  106 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 99 insertions(+), 40 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 539200e4b2516..813c2f77c9b3d 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -111,8 +111,7 @@ xfs_rmap_update(
 			xfs_rmap_irec_offset_pack(irec));
 	error = xfs_btree_update(cur, &rec);
 	if (error)
-		trace_xfs_rmap_update_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_update_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -155,8 +154,7 @@ xfs_rmap_insert(
 	}
 done:
 	if (error)
-		trace_xfs_rmap_insert_error(rcur->bc_mp,
-				rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_insert_error(rcur, error, _RET_IP_);
 	return error;
 }
 
@@ -194,8 +192,7 @@ xfs_rmap_delete(
 	}
 done:
 	if (error)
-		trace_xfs_rmap_delete_error(rcur->bc_mp,
-				rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_delete_error(rcur, error, _RET_IP_);
 	return error;
 }
 
@@ -816,8 +813,7 @@ xfs_rmap_unmap(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
-				error, _RET_IP_);
+		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1139,8 +1135,7 @@ xfs_rmap_map(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
-				error, _RET_IP_);
+		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1335,8 +1330,7 @@ xfs_rmap_convert(
 	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
 		state &= ~RMAP_RIGHT_CONTIG;
 
-	trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
-			_RET_IP_);
+	trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
 
 	/* reset the cursor back to PREV */
 	error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
@@ -1689,8 +1683,7 @@ xfs_rmap_convert(
 			unwritten, oinfo);
 done:
 	if (error)
-		trace_xfs_rmap_convert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1813,8 +1806,7 @@ xfs_rmap_convert_shared(
 	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
 		state &= ~RMAP_RIGHT_CONTIG;
 
-	trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
-			_RET_IP_);
+	trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
@@ -2116,8 +2108,7 @@ xfs_rmap_convert_shared(
 			unwritten, oinfo);
 done:
 	if (error)
-		trace_xfs_rmap_convert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -2316,8 +2307,7 @@ xfs_rmap_unmap_shared(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_unmap_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -2477,8 +2467,7 @@ xfs_rmap_map_shared(
 			unwritten, oinfo);
 out_error:
 	if (error)
-		trace_xfs_rmap_map_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aafc0735e20ac..a7c752023f662 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2878,46 +2878,87 @@ DEFINE_EVENT(xfs_rmap_class, name, \
 		 const struct xfs_owner_info *oinfo), \
 	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
 
-/* simple AG-based error/%ip tracepoint class */
-DECLARE_EVENT_CLASS(xfs_ag_error_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+/* btree cursor error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_btree_error_class,
+	TP_PROTO(struct xfs_btree_cur *cur, int error,
 		 unsigned long caller_ip),
-	TP_ARGS(mp, agno, error, caller_ip),
+	TP_ARGS(cur, error, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
 		__field(int, error)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		if (cur->bc_flags & XFS_BTREE_IN_XFILE) {
+			__entry->agno = 0;
+			__entry->ino = 0;
+		} else if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+			__entry->agno = 0;
+			__entry->ino = cur->bc_ino.ip->i_ino;
+		} else {
+			__entry->agno = cur->bc_ag.pag->pag_agno;
+			__entry->ino = 0;
+		}
 		__entry->error = error;
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
+	TP_printk("dev %d:%d agno 0x%x ino 0x%llx error %d caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
+		  __entry->ino,
 		  __entry->error,
 		  (char *)__entry->caller_ip)
 );
 
-#define DEFINE_AG_ERROR_EVENT(name) \
-DEFINE_EVENT(xfs_ag_error_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+#define DEFINE_BTREE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_btree_error_class, name, \
+	TP_PROTO(struct xfs_btree_cur *cur, int error, \
 		 unsigned long caller_ip), \
-	TP_ARGS(mp, agno, error, caller_ip))
+	TP_ARGS(cur, error, caller_ip))
 
 DEFINE_RMAP_EVENT(xfs_rmap_unmap);
 DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_unmap_error);
 DEFINE_RMAP_EVENT(xfs_rmap_map);
 DEFINE_RMAP_EVENT(xfs_rmap_map_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_map_error);
 DEFINE_RMAP_EVENT(xfs_rmap_convert);
 DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_convert_error);
+
+TRACE_EVENT(xfs_rmap_convert_state,
+	TP_PROTO(struct xfs_btree_cur *cur, int state,
+		 unsigned long caller_ip),
+	TP_ARGS(cur, state, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
+		__field(int, state)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+			__entry->agno = 0;
+			__entry->ino = cur->bc_ino.ip->i_ino;
+		} else {
+			__entry->agno = cur->bc_ag.pag->pag_agno;
+			__entry->ino = 0;
+		}
+		__entry->state = state;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->ino,
+		  __entry->state,
+		  (char *)__entry->caller_ip)
+);
 
 DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -3018,9 +3059,9 @@ DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
 DEFINE_RMAPBT_EVENT(xfs_rmap_update);
 DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
 DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_insert_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_delete_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_update_error);
 
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
@@ -3146,6 +3187,35 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
 DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
 DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
 
+/* simple AG-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_ag_error_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->error = error;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->error,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_AG_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_ag_error_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, agno, error, caller_ip))
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 
 /* refcount tracepoint classes */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/10] xfs: prepare rmap btree tracepoints for widening
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 01/10] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
  2023-12-31 21:29   ` [PATCH 02/10] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
@ 2023-12-31 21:29   ` Darrick J. Wong
  2023-12-31 21:30   ` [PATCH 04/10] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:29 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the rmap btree tracepoints for use with realtime rmap btrees by
making them take the btree cursor object as a parameter.  This will save
us a lot of trouble later on.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |  184 ++++++++++++++++++----------------------------
 fs/xfs/xfs_trace.h       |   24 +++---
 2 files changed, 85 insertions(+), 123 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 813c2f77c9b3d..b8425ae7807cc 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -100,8 +100,7 @@ xfs_rmap_update(
 	union xfs_btree_rec	rec;
 	int			error;
 
-	trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			irec->rm_startblock, irec->rm_blockcount,
+	trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount,
 			irec->rm_owner, irec->rm_offset, irec->rm_flags);
 
 	rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
@@ -127,8 +126,7 @@ xfs_rmap_insert(
 	int			i;
 	int			error;
 
-	trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
-			len, owner, offset, flags);
+	trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags);
 
 	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
 	if (error)
@@ -170,8 +168,7 @@ xfs_rmap_delete(
 	int			i;
 	int			error;
 
-	trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
-			len, owner, offset, flags);
+	trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags);
 
 	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
 	if (error)
@@ -339,8 +336,7 @@ xfs_rmap_find_left_neighbor_helper(
 {
 	struct xfs_find_left_neighbor_info	*info = priv;
 
-	trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+	trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock,
 			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
 			rec->rm_flags);
 
@@ -390,8 +386,8 @@ xfs_rmap_find_left_neighbor(
 	info.high.rm_blockcount = 0;
 	info.irec = irec;
 
-	trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
+	trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset,
+			flags);
 
 	/*
 	 * Historically, we always used the range query to walk every reverse
@@ -422,8 +418,7 @@ xfs_rmap_find_left_neighbor(
 		return error;
 
 	*stat = 1;
-	trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+	trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock,
 			irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
 			irec->rm_flags);
 	return 0;
@@ -438,8 +433,7 @@ xfs_rmap_lookup_le_range_helper(
 {
 	struct xfs_find_left_neighbor_info	*info = priv;
 
-	trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+	trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock,
 			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
 			rec->rm_flags);
 
@@ -486,8 +480,7 @@ xfs_rmap_lookup_le_range(
 	*stat = 0;
 	info.irec = irec;
 
-	trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			bno, 0, owner, offset, flags);
+	trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags);
 
 	/*
 	 * Historically, we always used the range query to walk every reverse
@@ -518,8 +511,7 @@ xfs_rmap_lookup_le_range(
 		return error;
 
 	*stat = 1;
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+	trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock,
 			irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
 			irec->rm_flags);
 	return 0;
@@ -631,8 +623,7 @@ xfs_rmap_unmap(
 			(flags & XFS_RMAP_BMBT_BLOCK);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * We should always have a left record because there's a static record
@@ -648,10 +639,9 @@ xfs_rmap_unmap(
 		goto out_error;
 	}
 
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
-			ltrec.rm_blockcount, ltrec.rm_owner,
-			ltrec.rm_offset, ltrec.rm_flags);
+	trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
+			ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset,
+			ltrec.rm_flags);
 	ltoff = ltrec.rm_offset;
 
 	/*
@@ -718,10 +708,9 @@ xfs_rmap_unmap(
 
 	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
 		/* exact match, simply remove the record from rmap tree */
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				ltrec.rm_startblock, ltrec.rm_blockcount,
-				ltrec.rm_owner, ltrec.rm_offset,
-				ltrec.rm_flags);
+		trace_xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto out_error;
@@ -797,8 +786,7 @@ xfs_rmap_unmap(
 		else
 			cur->bc_rec.r.rm_offset = offset + len;
 		cur->bc_rec.r.rm_flags = flags;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
-				cur->bc_rec.r.rm_startblock,
+		trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock,
 				cur->bc_rec.r.rm_blockcount,
 				cur->bc_rec.r.rm_owner,
 				cur->bc_rec.r.rm_offset,
@@ -809,8 +797,7 @@ xfs_rmap_unmap(
 	}
 
 out_done:
-	trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
@@ -974,8 +961,7 @@ xfs_rmap_map(
 			(flags & XFS_RMAP_BMBT_BLOCK);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
 	ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
 
 	/*
@@ -988,8 +974,7 @@ xfs_rmap_map(
 	if (error)
 		goto out_error;
 	if (have_lt) {
-		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+		trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
 				ltrec.rm_blockcount, ltrec.rm_owner,
 				ltrec.rm_offset, ltrec.rm_flags);
 
@@ -1027,10 +1012,10 @@ xfs_rmap_map(
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
-			gtrec.rm_blockcount, gtrec.rm_owner,
-			gtrec.rm_offset, gtrec.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				gtrec.rm_startblock, gtrec.rm_blockcount,
+				gtrec.rm_owner, gtrec.rm_offset,
+				gtrec.rm_flags);
 		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
 			have_gt = 0;
 	}
@@ -1067,12 +1052,9 @@ xfs_rmap_map(
 			 * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
 			 */
 			ltrec.rm_blockcount += gtrec.rm_blockcount;
-			trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-					gtrec.rm_startblock,
-					gtrec.rm_blockcount,
-					gtrec.rm_owner,
-					gtrec.rm_offset,
-					gtrec.rm_flags);
+			trace_xfs_rmap_delete(cur, gtrec.rm_startblock,
+					gtrec.rm_blockcount, gtrec.rm_owner,
+					gtrec.rm_offset, gtrec.rm_flags);
 			error = xfs_btree_delete(cur, &i);
 			if (error)
 				goto out_error;
@@ -1119,8 +1101,7 @@ xfs_rmap_map(
 		cur->bc_rec.r.rm_owner = owner;
 		cur->bc_rec.r.rm_offset = offset;
 		cur->bc_rec.r.rm_flags = flags;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			owner, offset, flags);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto out_error;
@@ -1131,8 +1112,7 @@ xfs_rmap_map(
 		}
 	}
 
-	trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
@@ -1209,8 +1189,7 @@ xfs_rmap_convert(
 			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
 	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
 	new_endoff = offset + len;
-	trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * For the initial lookup, look for an exact match or the left-adjacent
@@ -1226,10 +1205,9 @@ xfs_rmap_convert(
 		goto done;
 	}
 
-	trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
-			PREV.rm_blockcount, PREV.rm_owner,
-			PREV.rm_offset, PREV.rm_flags);
+	trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock,
+			PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset,
+			PREV.rm_flags);
 
 	ASSERT(PREV.rm_offset <= offset);
 	ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
@@ -1270,10 +1248,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
-				LEFT.rm_blockcount, LEFT.rm_owner,
-				LEFT.rm_offset, LEFT.rm_flags);
+		trace_xfs_rmap_find_left_neighbor_result(cur,
+				LEFT.rm_startblock, LEFT.rm_blockcount,
+				LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags);
 		if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
 		    LEFT.rm_offset + LEFT.rm_blockcount == offset &&
 		    xfs_rmap_is_mergeable(&LEFT, owner, newext))
@@ -1311,10 +1288,10 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
-				RIGHT.rm_blockcount, RIGHT.rm_owner,
-				RIGHT.rm_offset, RIGHT.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				RIGHT.rm_startblock, RIGHT.rm_blockcount,
+				RIGHT.rm_owner, RIGHT.rm_offset,
+				RIGHT.rm_flags);
 		if (bno + len == RIGHT.rm_startblock &&
 		    offset + len == RIGHT.rm_offset &&
 		    xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1361,10 +1338,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				RIGHT.rm_startblock, RIGHT.rm_blockcount,
-				RIGHT.rm_owner, RIGHT.rm_offset,
-				RIGHT.rm_flags);
+		trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1381,10 +1357,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				PREV.rm_startblock, PREV.rm_blockcount,
-				PREV.rm_owner, PREV.rm_offset,
-				PREV.rm_flags);
+		trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1413,10 +1388,9 @@ xfs_rmap_convert(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				PREV.rm_startblock, PREV.rm_blockcount,
-				PREV.rm_owner, PREV.rm_offset,
-				PREV.rm_flags);
+		trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1453,10 +1427,9 @@ xfs_rmap_convert(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
-				RIGHT.rm_startblock, RIGHT.rm_blockcount,
-				RIGHT.rm_owner, RIGHT.rm_offset,
-				RIGHT.rm_flags);
+		trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
 		error = xfs_btree_delete(cur, &i);
 		if (error)
 			goto done;
@@ -1534,8 +1507,7 @@ xfs_rmap_convert(
 		NEW.rm_blockcount = len;
 		NEW.rm_flags = newext;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
-				len, owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1593,8 +1565,7 @@ xfs_rmap_convert(
 		NEW.rm_blockcount = len;
 		NEW.rm_flags = newext;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
-				len, owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1625,9 +1596,8 @@ xfs_rmap_convert(
 		NEW = PREV;
 		NEW.rm_blockcount = offset - PREV.rm_offset;
 		cur->bc_rec.r = NEW;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
-				NEW.rm_startblock, NEW.rm_blockcount,
-				NEW.rm_owner, NEW.rm_offset,
+		trace_xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
 				NEW.rm_flags);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
@@ -1654,8 +1624,7 @@ xfs_rmap_convert(
 		/* new middle extent - newext */
 		cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
 		cur->bc_rec.r.rm_flags |= newext;
-		trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-				owner, offset, newext);
+		trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
 		error = xfs_btree_insert(cur, &i);
 		if (error)
 			goto done;
@@ -1679,8 +1648,7 @@ xfs_rmap_convert(
 		ASSERT(0);
 	}
 
-	trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
 done:
 	if (error)
 		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
@@ -1719,8 +1687,7 @@ xfs_rmap_convert_shared(
 			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
 	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
 	new_endoff = offset + len;
-	trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * For the initial lookup, look for and exact match or the left-adjacent
@@ -1789,10 +1756,10 @@ xfs_rmap_convert_shared(
 			error = -EFSCORRUPTED;
 			goto done;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
-				RIGHT.rm_blockcount, RIGHT.rm_owner,
-				RIGHT.rm_offset, RIGHT.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				RIGHT.rm_startblock, RIGHT.rm_blockcount,
+				RIGHT.rm_owner, RIGHT.rm_offset,
+				RIGHT.rm_flags);
 		if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
 			state |= RMAP_RIGHT_CONTIG;
 	}
@@ -2104,8 +2071,7 @@ xfs_rmap_convert_shared(
 		ASSERT(0);
 	}
 
-	trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
 done:
 	if (error)
 		trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
@@ -2146,8 +2112,7 @@ xfs_rmap_unmap_shared(
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
 
 	/*
 	 * We should always have a left record because there's a static record
@@ -2303,8 +2268,7 @@ xfs_rmap_unmap_shared(
 			goto out_error;
 	}
 
-	trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
@@ -2342,8 +2306,7 @@ xfs_rmap_map_shared(
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
 		flags |= XFS_RMAP_UNWRITTEN;
-	trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
 
 	/* Is there a left record that abuts our range? */
 	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
@@ -2368,10 +2331,10 @@ xfs_rmap_map_shared(
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
-		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
-			gtrec.rm_blockcount, gtrec.rm_owner,
-			gtrec.rm_offset, gtrec.rm_flags);
+		trace_xfs_rmap_find_right_neighbor_result(cur,
+				gtrec.rm_startblock, gtrec.rm_blockcount,
+				gtrec.rm_owner, gtrec.rm_offset,
+				gtrec.rm_flags);
 
 		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
 			have_gt = 0;
@@ -2463,8 +2426,7 @@ xfs_rmap_map_shared(
 			goto out_error;
 	}
 
-	trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
-			unwritten, oinfo);
+	trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
 out_error:
 	if (error)
 		trace_xfs_rmap_map_error(cur, error, _RET_IP_);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a7c752023f662..67c73e1d9b95c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2838,10 +2838,10 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
 
 /* rmap tracepoints */
 DECLARE_EVENT_CLASS(xfs_rmap_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+	TP_PROTO(struct xfs_btree_cur *cur,
 		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
 		 const struct xfs_owner_info *oinfo),
-	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
+	TP_ARGS(cur, agbno, len, unwritten, oinfo),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -2852,8 +2852,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
 		__field(unsigned long, flags)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
 		__entry->owner = oinfo->oi_owner;
@@ -2873,10 +2873,10 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
 );
 #define DEFINE_RMAP_EVENT(name) \
 DEFINE_EVENT(xfs_rmap_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+	TP_PROTO(struct xfs_btree_cur *cur, \
 		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
 		 const struct xfs_owner_info *oinfo), \
-	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
+	TP_ARGS(cur, agbno, len, unwritten, oinfo))
 
 /* btree cursor error/%ip tracepoint class */
 DECLARE_EVENT_CLASS(xfs_btree_error_class,
@@ -2961,10 +2961,10 @@ TRACE_EVENT(xfs_rmap_convert_state,
 );
 
 DECLARE_EVENT_CLASS(xfs_rmapbt_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+	TP_PROTO(struct xfs_btree_cur *cur,
 		 xfs_agblock_t agbno, xfs_extlen_t len,
 		 uint64_t owner, uint64_t offset, unsigned int flags),
-	TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+	TP_ARGS(cur, agbno, len, owner, offset, flags),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -2975,8 +2975,8 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 		__field(unsigned int, flags)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
 		__entry->owner = owner;
@@ -2994,10 +2994,10 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 );
 #define DEFINE_RMAPBT_EVENT(name) \
 DEFINE_EVENT(xfs_rmapbt_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+	TP_PROTO(struct xfs_btree_cur *cur, \
 		 xfs_agblock_t agbno, xfs_extlen_t len, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
-	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+	TP_ARGS(cur, agbno, len, owner, offset, flags))
 
 DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/10] xfs: clean up rmap log intent item tracepoint callsites
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:29   ` [PATCH 03/10] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
@ 2023-12-31 21:30   ` Darrick J. Wong
  2023-12-31 21:30   ` [PATCH 05/10] xfs: remove xfs_trans_set_rmap_flags Darrick J. Wong
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:30 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore rmap structure to the tracepoints instead of open-coding
the argument passing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |   22 ++++--------------
 fs/xfs/libxfs/xfs_rmap.h |   10 ++++++++
 fs/xfs/xfs_trace.c       |    1 +
 fs/xfs/xfs_trace.h       |   57 ++++++++++++++++++++++------------------------
 4 files changed, 43 insertions(+), 47 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index b8425ae7807cc..628f337a8eee1 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2576,20 +2576,15 @@ xfs_rmap_finish_one(
 	struct xfs_rmap_intent		*ri,
 	struct xfs_btree_cur		**pcur)
 {
+	struct xfs_owner_info		oinfo;
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_btree_cur		*rcur;
 	struct xfs_buf			*agbp = NULL;
-	int				error = 0;
-	struct xfs_owner_info		oinfo;
 	xfs_agblock_t			bno;
 	bool				unwritten;
+	int				error = 0;
 
-	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
-
-	trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno,
-			ri->ri_owner, ri->ri_whichfork,
-			ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
-			ri->ri_bmap.br_state);
+	trace_xfs_rmap_deferred(mp, ri);
 
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
 		return -EIO;
@@ -2664,15 +2659,6 @@ __xfs_rmap_add(
 {
 	struct xfs_rmap_intent		*ri;
 
-	trace_xfs_rmap_defer(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
-			type,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
-			owner, whichfork,
-			bmap->br_startoff,
-			bmap->br_blockcount,
-			bmap->br_state);
-
 	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
@@ -2680,6 +2666,8 @@ __xfs_rmap_add(
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
 
+	trace_xfs_rmap_defer(tp->t_mountp, ri);
+
 	xfs_rmap_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 }
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3a153b4801b46..f16b07d851d32 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -157,6 +157,16 @@ enum xfs_rmap_intent_type {
 	XFS_RMAP_FREE,
 };
 
+#define XFS_RMAP_INTENT_STRINGS \
+	{ XFS_RMAP_MAP,			"map" }, \
+	{ XFS_RMAP_MAP_SHARED,		"map_shared" }, \
+	{ XFS_RMAP_UNMAP,		"unmap" }, \
+	{ XFS_RMAP_UNMAP_SHARED,	"unmap_shared" }, \
+	{ XFS_RMAP_CONVERT,		"cvt" }, \
+	{ XFS_RMAP_CONVERT_SHARED,	"cvt_shared" }, \
+	{ XFS_RMAP_ALLOC,		"alloc" }, \
+	{ XFS_RMAP_FREE,		"free" }
+
 struct xfs_rmap_intent {
 	struct list_head			ri_list;
 	enum xfs_rmap_intent_type		ri_type;
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 21f55b0125fc4..8a990a2fa132f 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -45,6 +45,7 @@
 #include "xfs_parent.h"
 #include "xfs_imeta.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rmap.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 67c73e1d9b95c..9290a02bac99e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -89,6 +89,7 @@ struct xfs_attrlist_cursor_kern;
 struct xfs_imeta_update;
 struct xfs_rtgroup;
 struct xfs_extent_free_item;
+struct xfs_rmap_intent;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -2999,20 +3000,22 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
 	TP_ARGS(cur, agbno, len, owner, offset, flags))
 
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
 DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int op,
-		 xfs_agblock_t agbno,
-		 xfs_ino_t ino,
-		 int whichfork,
-		 xfs_fileoff_t offset,
-		 xfs_filblks_t len,
-		 xfs_exntst_t state),
-	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+	TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri),
+	TP_ARGS(mp, ri),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(unsigned long long, owner)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_ino_t, ino)
 		__field(xfs_agblock_t, agbno)
 		__field(int, whichfork)
 		__field(xfs_fileoff_t, l_loff)
@@ -3022,21 +3025,22 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
-		__entry->ino = ino;
-		__entry->agbno = agbno;
-		__entry->whichfork = whichfork;
-		__entry->l_loff = offset;
-		__entry->l_len = len;
-		__entry->l_state = state;
-		__entry->op = op;
+		__entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
+		__entry->agbno = XFS_FSB_TO_AGBNO(mp,
+					ri->ri_bmap.br_startblock);
+		__entry->owner = ri->ri_owner;
+		__entry->whichfork = ri->ri_whichfork;
+		__entry->l_loff = ri->ri_bmap.br_startoff;
+		__entry->l_len = ri->ri_bmap.br_blockcount;
+		__entry->l_state = ri->ri_bmap.br_state;
+		__entry->op = ri->ri_type;
 	),
-	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+	TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->op,
+		  __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS),
 		  __entry->agno,
 		  __entry->agbno,
-		  __entry->ino,
+		  __entry->owner,
 		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
 		  __entry->l_loff,
 		  __entry->l_len,
@@ -3044,15 +3048,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 );
 #define DEFINE_RMAP_DEFERRED_EVENT(name) \
 DEFINE_EVENT(xfs_rmap_deferred_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 int op, \
-		 xfs_agblock_t agbno, \
-		 xfs_ino_t ino, \
-		 int whichfork, \
-		 xfs_fileoff_t offset, \
-		 xfs_filblks_t len, \
-		 xfs_exntst_t state), \
-	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
+	TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), \
+	TP_ARGS(mp, ri))
 DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
 DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/10] xfs: remove xfs_trans_set_rmap_flags
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:30   ` [PATCH 04/10] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-31 21:30   ` Darrick J. Wong
  2023-12-31 21:30   ` [PATCH 06/10] xfs: add a ri_entry helper Darrick J. Wong
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:30 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Remove this single-use helper.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rmap_item.c |   79 +++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 45 deletions(-)


diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 183544ac74c98..3e67fe2133263 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -225,49 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
 	.iop_intent	= xfs_rud_item_intent,
 };
 
-/* Set the map extent flags for this reverse mapping. */
-static void
-xfs_trans_set_rmap_flags(
-	struct xfs_map_extent		*map,
-	enum xfs_rmap_intent_type	type,
-	int				whichfork,
-	xfs_exntst_t			state)
-{
-	map->me_flags = 0;
-	if (state == XFS_EXT_UNWRITTEN)
-		map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
-	if (whichfork == XFS_ATTR_FORK)
-		map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
-	switch (type) {
-	case XFS_RMAP_MAP:
-		map->me_flags |= XFS_RMAP_EXTENT_MAP;
-		break;
-	case XFS_RMAP_MAP_SHARED:
-		map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
-		break;
-	case XFS_RMAP_UNMAP:
-		map->me_flags |= XFS_RMAP_EXTENT_UNMAP;
-		break;
-	case XFS_RMAP_UNMAP_SHARED:
-		map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
-		break;
-	case XFS_RMAP_CONVERT:
-		map->me_flags |= XFS_RMAP_EXTENT_CONVERT;
-		break;
-	case XFS_RMAP_CONVERT_SHARED:
-		map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
-		break;
-	case XFS_RMAP_ALLOC:
-		map->me_flags |= XFS_RMAP_EXTENT_ALLOC;
-		break;
-	case XFS_RMAP_FREE:
-		map->me_flags |= XFS_RMAP_EXTENT_FREE;
-		break;
-	default:
-		ASSERT(0);
-	}
-}
-
 /* Sort rmap intents by AG. */
 static int
 xfs_rmap_update_diff_items(
@@ -306,8 +263,40 @@ xfs_rmap_update_log_item(
 	map->me_startblock = ri->ri_bmap.br_startblock;
 	map->me_startoff = ri->ri_bmap.br_startoff;
 	map->me_len = ri->ri_bmap.br_blockcount;
-	xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork,
-			ri->ri_bmap.br_state);
+
+	map->me_flags = 0;
+	if (ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN)
+		map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+	if (ri->ri_whichfork == XFS_ATTR_FORK)
+		map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+	switch (ri->ri_type) {
+	case XFS_RMAP_MAP:
+		map->me_flags |= XFS_RMAP_EXTENT_MAP;
+		break;
+	case XFS_RMAP_MAP_SHARED:
+		map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+		break;
+	case XFS_RMAP_UNMAP:
+		map->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+		break;
+	case XFS_RMAP_CONVERT:
+		map->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+		break;
+	case XFS_RMAP_ALLOC:
+		map->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+		break;
+	case XFS_RMAP_FREE:
+		map->me_flags |= XFS_RMAP_EXTENT_FREE;
+		break;
+	default:
+		ASSERT(0);
+	}
 }
 
 static struct xfs_log_item *


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/10] xfs: add a ri_entry helper
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:30   ` [PATCH 05/10] xfs: remove xfs_trans_set_rmap_flags Darrick J. Wong
@ 2023-12-31 21:30   ` Darrick J. Wong
  2023-12-31 21:31   ` [PATCH 07/10] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:30 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Add a helper to translate from the item list head to the
rmap_intent_item structure and use it so shorten assignments
and avoid the need for extra local variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rmap_item.c |   20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 3e67fe2133263..80433d6b2f9a3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -225,6 +225,11 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
 	.iop_intent	= xfs_rud_item_intent,
 };
 
+static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_rmap_intent, ri_list);
+}
+
 /* Sort rmap intents by AG. */
 static int
 xfs_rmap_update_diff_items(
@@ -232,11 +237,8 @@ xfs_rmap_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_rmap_intent		*ra;
-	struct xfs_rmap_intent		*rb;
-
-	ra = container_of(a, struct xfs_rmap_intent, ri_list);
-	rb = container_of(b, struct xfs_rmap_intent, ri_list);
+	struct xfs_rmap_intent		*ra = ri_entry(a);
+	struct xfs_rmap_intent		*rb = ri_entry(b);
 
 	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
@@ -363,11 +365,9 @@ xfs_rmap_update_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_rmap_intent		*ri;
+	struct xfs_rmap_intent		*ri = ri_entry(item);
 	int				error;
 
-	ri = container_of(item, struct xfs_rmap_intent, ri_list);
-
 	error = xfs_rmap_finish_one(tp, ri, state);
 
 	xfs_rmap_update_put_group(ri);
@@ -388,9 +388,7 @@ STATIC void
 xfs_rmap_update_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_rmap_intent		*ri;
-
-	ri = container_of(item, struct xfs_rmap_intent, ri_list);
+	struct xfs_rmap_intent		*ri = ri_entry(item);
 
 	xfs_rmap_update_put_group(ri);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/10] xfs: reuse xfs_rmap_update_cancel_item
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:30   ` [PATCH 06/10] xfs: add a ri_entry helper Darrick J. Wong
@ 2023-12-31 21:31   ` Darrick J. Wong
  2023-12-31 21:31   ` [PATCH 08/10] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:31 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Reuse xfs_rmap_update_cancel_item to put the AG/RTG and free the item in
a few places that currently open code the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rmap_item.c |   25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 80433d6b2f9a3..9ce11e27cb9fd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -357,6 +357,17 @@ xfs_rmap_update_put_group(
 	xfs_perag_intent_put(ri->ri_pag);
 }
 
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+
+	xfs_rmap_update_put_group(ri);
+	kmem_cache_free(xfs_rmap_intent_cache, ri);
+}
+
 /* Process a deferred rmap update. */
 STATIC int
 xfs_rmap_update_finish_item(
@@ -370,8 +381,7 @@ xfs_rmap_update_finish_item(
 
 	error = xfs_rmap_finish_one(tp, ri, state);
 
-	xfs_rmap_update_put_group(ri);
-	kmem_cache_free(xfs_rmap_intent_cache, ri);
+	xfs_rmap_update_cancel_item(item);
 	return error;
 }
 
@@ -383,17 +393,6 @@ xfs_rmap_update_abort_intent(
 	xfs_rui_release(RUI_ITEM(intent));
 }
 
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_rmap_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_rmap_intent		*ri = ri_entry(item);
-
-	xfs_rmap_update_put_group(ri);
-	kmem_cache_free(xfs_rmap_intent_cache, ri);
-}
-
 /* Is this recovered RUI ok? */
 static inline bool
 xfs_rui_validate_map(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/10] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:31   ` [PATCH 07/10] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
@ 2023-12-31 21:31   ` Darrick J. Wong
  2023-12-31 21:31   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
  2023-12-31 21:31   ` [PATCH 10/10] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:31 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

In xfs_rmap_finish_one we known the cursor is non-zero when calling
xfs_rmap_finish_one_cleanup and we pass a 0 error variable.  This means
xfs_rmap_finish_one_cleanup is just doing a xfs_btree_del_cursor.

Open code that and move xfs_rmap_finish_one_cleanup to
fs/xfs/xfs_rmap_item.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: minor porting changes]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |   19 +------------------
 fs/xfs/libxfs/xfs_rmap.h |    2 --
 fs/xfs/xfs_rmap_item.c   |   18 ++++++++++++++++++
 3 files changed, 19 insertions(+), 20 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 628f337a8eee1..ba33e51a3f2b8 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2514,23 +2514,6 @@ xfs_rmap_query_all(
 	return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
 }
 
-/* Clean up after calling xfs_rmap_finish_one. */
-void
-xfs_rmap_finish_one_cleanup(
-	struct xfs_trans	*tp,
-	struct xfs_btree_cur	*rcur,
-	int			error)
-{
-	struct xfs_buf		*agbp;
-
-	if (rcur == NULL)
-		return;
-	agbp = rcur->bc_ag.agbp;
-	xfs_btree_del_cursor(rcur, error);
-	if (error)
-		xfs_trans_brelse(tp, agbp);
-}
-
 /* Commit an rmap operation into the ondisk tree. */
 int
 __xfs_rmap_finish_intent(
@@ -2595,7 +2578,7 @@ xfs_rmap_finish_one(
 	 */
 	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
-		xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
 	}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index f16b07d851d32..2513ee36aa29d 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -192,8 +192,6 @@ void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
 
-void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
-		struct xfs_btree_cur *rcur, int error);
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
 int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 9ce11e27cb9fd..d13fe835f0313 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -21,6 +21,7 @@
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
+#include "xfs_btree.h"
 
 struct kmem_cache	*xfs_rui_cache;
 struct kmem_cache	*xfs_rud_cache;
@@ -385,6 +386,23 @@ xfs_rmap_update_finish_item(
 	return error;
 }
 
+/* Clean up after calling xfs_rmap_finish_one. */
+STATIC void
+xfs_rmap_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp = NULL;
+
+	if (rcur == NULL)
+		return;
+	agbp = rcur->bc_ag.agbp;
+	xfs_btree_del_cursor(rcur, error);
+	if (error && agbp)
+		xfs_trans_brelse(tp, agbp);
+}
+
 /* Abort all pending RUIs. */
 STATIC void
 xfs_rmap_update_abort_intent(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/10] xfs: simplify usage of the rcur local variable in xfs_rmap_finish_one
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:31   ` [PATCH 08/10] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
@ 2023-12-31 21:31   ` Darrick J. Wong
  2023-12-31 21:31   ` [PATCH 10/10] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:31 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Christoph Hellwig <hch@lst.de>

Only update rcur when we know the final *pcur value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[djwong: don't leave the caller with a dangling ref]
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |    6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index ba33e51a3f2b8..3830f73607f32 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2561,7 +2561,7 @@ xfs_rmap_finish_one(
 {
 	struct xfs_owner_info		oinfo;
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_btree_cur		*rcur;
+	struct xfs_btree_cur		*rcur = *pcur;
 	struct xfs_buf			*agbp = NULL;
 	xfs_agblock_t			bno;
 	bool				unwritten;
@@ -2576,7 +2576,6 @@ xfs_rmap_finish_one(
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
-	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
@@ -2598,9 +2597,8 @@ xfs_rmap_finish_one(
 			return -EFSCORRUPTED;
 		}
 
-		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+		*pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
 	}
-	*pcur = rcur;
 
 	xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
 			ri->ri_bmap.br_startoff);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/10] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c
  2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:31   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
@ 2023-12-31 21:31   ` Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:31 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_rmap_update_item deferred work
data to a transaction live with the RUI log item code.  This means that
the rmap code no longer has to know about the inner workings of the RUI
log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |    6 ++----
 fs/xfs/libxfs/xfs_rmap.h |    3 ---
 fs/xfs/xfs_rmap_item.c   |   24 +++++++++++-------------
 fs/xfs/xfs_rmap_item.h   |    4 ++++
 4 files changed, 17 insertions(+), 20 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3830f73607f32..0f552cb737c8b 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -24,6 +24,7 @@
 #include "xfs_inode.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "xfs_rmap_item.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -2647,10 +2648,7 @@ __xfs_rmap_add(
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
 
-	trace_xfs_rmap_defer(tp->t_mountp, ri);
-
-	xfs_rmap_update_get_group(tp->t_mountp, ri);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	xfs_rmap_defer_add(tp, ri);
 }
 
 /* Map an extent into a file. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 2513ee36aa29d..e6240efd6fe75 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -176,9 +176,6 @@ struct xfs_rmap_intent {
 	struct xfs_perag			*ri_pag;
 };
 
-void xfs_rmap_update_get_group(struct xfs_mount *mp,
-		struct xfs_rmap_intent *ri);
-
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
 void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index d13fe835f0313..e2ee1b6719202 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -22,6 +22,7 @@
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
 #include "xfs_btree.h"
+#include "xfs_trace.h"
 
 struct kmem_cache	*xfs_rui_cache;
 struct kmem_cache	*xfs_rud_cache;
@@ -341,21 +342,18 @@ xfs_rmap_update_create_done(
 	return &rudp->rud_item;
 }
 
-/* Take a passive ref to the AG containing the space we're rmapping. */
+/* Add this deferred RUI to the transaction. */
 void
-xfs_rmap_update_get_group(
-	struct xfs_mount	*mp,
+xfs_rmap_defer_add(
+	struct xfs_trans	*tp,
 	struct xfs_rmap_intent	*ri)
 {
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	trace_xfs_rmap_defer(mp, ri);
+
 	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
-}
-
-/* Release a passive AG ref after finishing rmapping work. */
-static inline void
-xfs_rmap_update_put_group(
-	struct xfs_rmap_intent	*ri)
-{
-	xfs_perag_intent_put(ri->ri_pag);
+	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 }
 
 /* Cancel a deferred rmap update. */
@@ -365,7 +363,7 @@ xfs_rmap_update_cancel_item(
 {
 	struct xfs_rmap_intent		*ri = ri_entry(item);
 
-	xfs_rmap_update_put_group(ri);
+	xfs_perag_intent_put(ri->ri_pag);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 }
 
@@ -495,7 +493,7 @@ xfs_rui_recover_work(
 	ri->ri_bmap.br_blockcount = map->me_len;
 	ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
 			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-	xfs_rmap_update_get_group(mp, ri);
+	ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
 
 	xfs_defer_add_item(dfp, &ri->ri_list);
 }
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 802e5119eacaa..40d331555675b 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -71,4 +71,8 @@ struct xfs_rud_log_item {
 extern struct kmem_cache	*xfs_rui_cache;
 extern struct kmem_cache	*xfs_rud_cache;
 
+struct xfs_rmap_intent;
+
+void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+
 #endif	/* __XFS_RMAP_ITEM_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/39] xfs: prepare rmap btree cursor tracepoints for realtime
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
@ 2023-12-31 21:32   ` Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 02/39] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
                     ` (37 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:32 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rework the rmap btree cursor tracepoints in preparation to handle the
realtime rmap btree cursor.  Mostly this involves renaming the field to
"rmapbno" and extracting the group number from the cursor when possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_trace.c |   18 ++++++++++++++++
 fs/xfs/xfs_trace.h |   58 +++++++++++++++++++++++++++-------------------------
 2 files changed, 48 insertions(+), 28 deletions(-)


diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8a990a2fa132f..f35927ebe2450 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -47,6 +47,24 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rmap.h"
 
+static inline void
+xfs_rmapbt_crack_agno_opdev(
+	struct xfs_btree_cur	*cur,
+	xfs_agnumber_t		*agno,
+	dev_t			*opdev)
+{
+	if (cur->bc_flags & XFS_BTREE_IN_XFILE) {
+		*agno = 0;
+		*opdev = xfbtree_target(cur->bc_mem.xfbtree)->bt_dev;
+	} else if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		*agno = cur->bc_ino.rtg->rtg_rgno;
+		*opdev = cur->bc_mp->m_rtdev_targp->bt_dev;
+	} else {
+		*agno = cur->bc_ag.pag->pag_agno;
+		*opdev = cur->bc_mp->m_super->s_dev;
+	}
+}
+
 /*
  * We include this last to have the helpers above available for the trace
  * event implementations.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9290a02bac99e..43ecf98f86558 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -14,11 +14,15 @@
  * ino: filesystem inode number
  *
  * agbno: per-AG block number in fs blocks
+ * rgbno: per-rtgroup block number in fs blocks
  * startblock: physical block number for file mappings.  This is either a
  *             segmented fsblock for data device mappings, or a rfsblock
  *             for realtime device mappings
  * fsbcount: number of blocks in an extent, in fs blocks
  *
+ * rmapbno: physical block number for a reverse mapping.  This is an agbno for
+ *          per-AG rmap btrees or a rgbno for realtime rmap btrees.
+ *
  * daddr: physical block number in 512b blocks
  * bbcount: number of blocks in a physical extent, in 512b blocks
  *
@@ -2840,13 +2844,14 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
 /* rmap tracepoints */
 DECLARE_EVENT_CLASS(xfs_rmap_class,
 	TP_PROTO(struct xfs_btree_cur *cur,
-		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
+		 xfs_agblock_t rmapbno, xfs_extlen_t len, bool unwritten,
 		 const struct xfs_owner_info *oinfo),
-	TP_ARGS(cur, agbno, len, unwritten, oinfo),
+	TP_ARGS(cur, rmapbno, len, unwritten, oinfo),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, rmapbno)
 		__field(xfs_extlen_t, len)
 		__field(uint64_t, owner)
 		__field(uint64_t, offset)
@@ -2854,8 +2859,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
-		__entry->agbno = agbno;
+		xfs_rmapbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
+		__entry->rmapbno = rmapbno;
 		__entry->len = len;
 		__entry->owner = oinfo->oi_owner;
 		__entry->offset = oinfo->oi_offset;
@@ -2863,10 +2868,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
 		if (unwritten)
 			__entry->flags |= XFS_RMAP_UNWRITTEN;
 	),
-	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x rmapbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->rmapbno,
 		  __entry->len,
 		  __entry->owner,
 		  __entry->offset,
@@ -2875,9 +2881,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
 #define DEFINE_RMAP_EVENT(name) \
 DEFINE_EVENT(xfs_rmap_class, name, \
 	TP_PROTO(struct xfs_btree_cur *cur, \
-		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
+		 xfs_agblock_t rmapbno, xfs_extlen_t len, bool unwritten, \
 		 const struct xfs_owner_info *oinfo), \
-	TP_ARGS(cur, agbno, len, unwritten, oinfo))
+	TP_ARGS(cur, rmapbno, len, unwritten, oinfo))
 
 /* btree cursor error/%ip tracepoint class */
 DECLARE_EVENT_CLASS(xfs_btree_error_class,
@@ -2936,40 +2942,35 @@ TRACE_EVENT(xfs_rmap_convert_state,
 	TP_ARGS(cur, state, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_ino_t, ino)
 		__field(int, state)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-			__entry->agno = 0;
-			__entry->ino = cur->bc_ino.ip->i_ino;
-		} else {
-			__entry->agno = cur->bc_ag.pag->pag_agno;
-			__entry->ino = 0;
-		}
+		xfs_rmapbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->state = state;
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x state %d caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->ino,
 		  __entry->state,
 		  (char *)__entry->caller_ip)
 );
 
 DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 	TP_PROTO(struct xfs_btree_cur *cur,
-		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t rmapbno, xfs_extlen_t len,
 		 uint64_t owner, uint64_t offset, unsigned int flags),
-	TP_ARGS(cur, agbno, len, owner, offset, flags),
+	TP_ARGS(cur, rmapbno, len, owner, offset, flags),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, rmapbno)
 		__field(xfs_extlen_t, len)
 		__field(uint64_t, owner)
 		__field(uint64_t, offset)
@@ -2977,17 +2978,18 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
-		__entry->agbno = agbno;
+		xfs_rmapbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
+		__entry->rmapbno = rmapbno;
 		__entry->len = len;
 		__entry->owner = owner;
 		__entry->offset = offset;
 		__entry->flags = flags;
 	),
-	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x rmapbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->rmapbno,
 		  __entry->len,
 		  __entry->owner,
 		  __entry->offset,
@@ -2996,9 +2998,9 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
 #define DEFINE_RMAPBT_EVENT(name) \
 DEFINE_EVENT(xfs_rmapbt_class, name, \
 	TP_PROTO(struct xfs_btree_cur *cur, \
-		 xfs_agblock_t agbno, xfs_extlen_t len, \
+		 xfs_agblock_t rmapbno, xfs_extlen_t len, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
-	TP_ARGS(cur, agbno, len, owner, offset, flags))
+	TP_ARGS(cur, rmapbno, len, owner, offset, flags))
 
 TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
 TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/39] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 01/39] xfs: prepare rmap btree cursor tracepoints for realtime Darrick J. Wong
@ 2023-12-31 21:32   ` Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 03/39] xfs: introduce realtime rmap btree definitions Darrick J. Wong
                     ` (36 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:32 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Simplify the calling conventions by allowing callers to pass a fsbno
(xfs_fsblock_t) directly into these functions, since we're just going to
set it in a struct anyway.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |    6 ++----
 fs/xfs/libxfs/xfs_rmap.c     |   12 +++++-------
 fs/xfs/libxfs/xfs_rmap.h     |    8 ++++----
 fs/xfs/scrub/alloc_repair.c  |   10 +++++++---
 4 files changed, 18 insertions(+), 18 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index e30c4cdfaa392..6f7ec83281656 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1887,8 +1887,7 @@ xfs_refcount_alloc_cow_extent(
 	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
-			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
@@ -1904,8 +1903,7 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
-			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+	xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
 	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 0f552cb737c8b..b3383cc474492 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -527,7 +527,7 @@ xfs_rmap_free_check_owner(
 	struct xfs_btree_cur	*cur,
 	uint64_t		ltoff,
 	struct xfs_rmap_irec	*rec,
-	xfs_filblks_t		len,
+	xfs_extlen_t		len,
 	uint64_t		owner,
 	uint64_t		offset,
 	unsigned int		flags)
@@ -2718,8 +2718,7 @@ xfs_rmap_convert_extent(
 void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
+	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
 {
@@ -2728,7 +2727,7 @@ xfs_rmap_alloc_extent(
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
 		return;
 
-	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+	bmap.br_startblock = fsbno;
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
@@ -2740,8 +2739,7 @@ xfs_rmap_alloc_extent(
 void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
+	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
 {
@@ -2750,7 +2748,7 @@ xfs_rmap_free_extent(
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
 		return;
 
-	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+	bmap.br_startblock = fsbno;
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e6240efd6fe75..0ccfd7d88e56e 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -184,10 +184,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
-		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
-		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+		xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+		xfs_extlen_t len, uint64_t owner);
 
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 45edda096869c..3805099cb578b 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -542,9 +542,13 @@ xrep_abt_dispose_one(
 	ASSERT(pag == resv->pag);
 
 	/* Add a deferred rmap for each extent we used. */
-	if (resv->used > 0)
-		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
-				resv->used, XFS_RMAP_OWN_AG);
+	if (resv->used > 0) {
+		xfs_fsblock_t	fsbno;
+
+		fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, resv->agbno);
+		xfs_rmap_alloc_extent(sc->tp, fsbno, resv->used,
+				XFS_RMAP_OWN_AG);
+	}
 
 	/*
 	 * For each reserved btree block we didn't use, add it to the free


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/39] xfs: introduce realtime rmap btree definitions
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 01/39] xfs: prepare rmap btree cursor tracepoints for realtime Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 02/39] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
@ 2023-12-31 21:32   ` Darrick J. Wong
  2023-12-31 21:32   ` [PATCH 04/39] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
                     ` (35 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:32 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add new realtime rmap btree definitions. The realtime rmap btree will
be rooted from a hidden inode, but has its own shape and therefore
needs to have most of its own separate types.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.h  |    1 +
 fs/xfs/libxfs/xfs_format.h |    7 +++++++
 fs/xfs/libxfs/xfs_types.h  |    5 +++--
 fs/xfs/scrub/trace.h       |    1 +
 fs/xfs/xfs_trace.h         |    1 +
 5 files changed, 13 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index ce0bc5dfffe1c..e6571c9157d1e 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -64,6 +64,7 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
 #define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 #define	XFS_BTNUM_RCBAG	((xfs_btnum_t)XFS_BTNUM_RCBAGi)
+#define	XFS_BTNUM_RTRMAP ((xfs_btnum_t)XFS_BTNUM_RTRMAPi)
 
 struct xfs_btree_ops;
 uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 87476c6bb6c64..b47d4f16143a6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1746,6 +1746,13 @@ typedef __be32 xfs_rmap_ptr_t;
 	 XFS_FIBT_BLOCK(mp) + 1 : \
 	 XFS_IBT_BLOCK(mp) + 1)
 
+/*
+ * Realtime Reverse mapping btree format definitions
+ *
+ * This is a btree for reverse mapping records for realtime volumes
+ */
+#define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
+
 /*
  * Reference Count Btree format definitions
  *
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index ad2ce83874f9f..b3edc57dc65bd 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -126,7 +126,7 @@ typedef enum {
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
 	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_RCBAGi,
-	XFS_BTNUM_MAX
+	XFS_BTNUM_RTRMAPi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 #define XFS_BTNUM_STRINGS \
@@ -137,7 +137,8 @@ typedef enum {
 	{ XFS_BTNUM_INOi,	"inobt" }, \
 	{ XFS_BTNUM_FINOi,	"finobt" }, \
 	{ XFS_BTNUM_REFCi,	"refcbt" }, \
-	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }
+	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }, \
+	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }
 
 struct xfs_name {
 	const unsigned char	*name;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index dd809042a6041..bdcd77c839317 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RCBAGi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RTRMAPi);
 
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 43ecf98f86558..1c89d38b85446 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2550,6 +2550,7 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RCBAGi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RTRMAPi);
 
 DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/39] xfs: define the on-disk realtime rmap btree format
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:32   ` [PATCH 03/39] xfs: introduce realtime rmap btree definitions Darrick J. Wong
@ 2023-12-31 21:32   ` Darrick J. Wong
  2023-12-31 21:33   ` [PATCH 05/39] xfs: realtime rmap btree transaction reservations Darrick J. Wong
                     ` (34 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:32 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Start filling out the rtrmap btree implementation. Start with the
on-disk btree format; add everything needed to read, write and
manipulate rmap btree blocks. This prepares the way for connecting the
btree operations implementation.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile                  |    1 
 fs/xfs/libxfs/xfs_btree.c        |    5 +
 fs/xfs/libxfs/xfs_format.h       |    3 
 fs/xfs/libxfs/xfs_ondisk.h       |    1 
 fs/xfs/libxfs/xfs_rtrmap_btree.c |  307 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |   83 ++++++++++
 fs/xfs/libxfs/xfs_sb.c           |    6 +
 fs/xfs/libxfs/xfs_shared.h       |    2 
 fs/xfs/xfs_mount.c               |    5 -
 fs/xfs/xfs_mount.h               |    9 +
 10 files changed, 420 insertions(+), 2 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_rtrmap_btree.h


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6dc9e740f8ce5..50a7929982fdd 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -48,6 +48,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_rmap_btree.o \
 				   xfs_refcount.o \
 				   xfs_refcount_btree.o \
+				   xfs_rtrmap_btree.o \
 				   xfs_sb.o \
 				   xfs_swapext.o \
 				   xfs_symlink_remote.o \
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5b4a2c6a3f2ac..c8a2ce10cccc8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
 #include "scrub/xfbtree.h"
 #include "xfs_btree_mem.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Btree magic numbers.
@@ -5528,6 +5529,9 @@ xfs_btree_init_cur_caches(void)
 	if (error)
 		goto err;
 	error = xfs_refcountbt_init_cur_cache();
+	if (error)
+		goto err;
+	error = xfs_rtrmapbt_init_cur_cache();
 	if (error)
 		goto err;
 
@@ -5546,6 +5550,7 @@ xfs_btree_destroy_cur_caches(void)
 	xfs_bmbt_destroy_cur_cache();
 	xfs_rmapbt_destroy_cur_cache();
 	xfs_refcountbt_destroy_cur_cache();
+	xfs_rtrmapbt_destroy_cur_cache();
 }
 
 /* Move the btree cursor before the first record. */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b47d4f16143a6..5317c6438f070 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1753,6 +1753,9 @@ typedef __be32 xfs_rmap_ptr_t;
  */
 #define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
 
+/* inode-based btree pointer type */
+typedef __be64 xfs_rtrmap_ptr_t;
+
 /*
  * Reference Count Btree format definitions
  *
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 70b96efa26973..897a1b72f8d12 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -77,6 +77,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
+	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
new file mode 100644
index 0000000000000..ecacb457cd27f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+
+static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
+
+/*
+ * Realtime Reverse Map btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device.  See the comments in xfs_rmap_btree.c for more information.
+ *
+ * This tree is basically the same as the regular rmap btree except that it
+ * is rooted in an inode and does not live in free space.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrmapbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_rtrmapbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ino.rtg,
+			cur->bc_ino.ip);
+
+	/* Copy the flags values since init cursor doesn't get them. */
+	new->bc_ino.flags = cur->bc_ino.flags;
+
+	return new;
+}
+
+static xfs_failaddr_t
+xfs_rtrmapbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	int			level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (!xfs_has_rmapbt(mp))
+		return __this_address;
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+	level = be16_to_cpu(block->bb_level);
+	if (level > mp->m_rtrmap_maxlevels)
+		return __this_address;
+
+	return xfs_btree_lblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
+}
+
+static void
+xfs_rtrmapbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtrmapbt_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+
+	if (bp->b_error)
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrmapbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	fa = xfs_rtrmapbt_verify(bp);
+	if (fa) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
+	.name			= "xfs_rtrmapbt",
+	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+	.verify_read		= xfs_rtrmapbt_read_verify,
+	.verify_write		= xfs_rtrmapbt_write_verify,
+	.verify_struct		= xfs_rtrmapbt_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrmapbt_ops = {
+	.rec_len		= sizeof(struct xfs_rmap_rec),
+	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.geom_flags		= XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE |
+				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING |
+				  XFS_BTREE_IROOT_RECORDS,
+
+	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
+	.buf_ops		= &xfs_rtrmapbt_buf_ops,
+};
+
+/* Initialize a new rt rmap btree cursor. */
+static struct xfs_btree_cur *
+xfs_rtrmapbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTRMAP,
+			&xfs_rtrmapbt_ops, mp->m_rtrmap_maxlevels,
+			xfs_rtrmapbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+
+	cur->bc_ino.ip = ip;
+	cur->bc_ino.allocated = 0;
+	cur->bc_ino.flags = 0;
+
+	cur->bc_ino.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+/* Allocate a new rt rmap btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	cur = xfs_rtrmapbt_init_common(mp, tp, rtg, ip);
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+	cur->bc_ino.whichfork = XFS_DATA_FORK;
+	return cur;
+}
+
+/* Create a new rt reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	struct xbtree_ifakeroot	*ifake)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_rtrmapbt_init_common(mp, NULL, rtg, ip);
+	cur->bc_nlevels = ifake->if_levels;
+	cur->bc_ino.forksize = ifake->if_fork_size;
+	cur->bc_ino.whichfork = -1;
+	xfs_btree_stage_ifakeroot(cur, ifake, NULL);
+	return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrmapbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp)
+{
+	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
+	struct xfs_ifork	*ifp;
+	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	/*
+	 * Free any resources hanging off the real fork, then shallow-copy the
+	 * staging fork's contents into the real fork to transfer everything
+	 * we just built.
+	 */
+	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+	xfs_idestroy_fork(ifp);
+	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK, &xfs_rtrmapbt_ops);
+}
+
+/* Calculate number of records in a rt reverse mapping btree block. */
+static inline unsigned int
+xfs_rtrmapbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	if (leaf)
+		return blocklen / sizeof(struct xfs_rmap_rec);
+	return blocklen /
+		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rt reverse mapping btree block.
+ */
+unsigned int
+xfs_rtrmapbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= XFS_RTRMAP_BLOCK_LEN;
+	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime reverse mapping btrees. */
+unsigned int
+xfs_rtrmapbt_maxlevels_ondisk(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
+
+	/* We need at most one record for every block in an rt group. */
+	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrmapbt_init_cur_cache(void)
+{
+	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
+			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
+			0, 0, NULL);
+
+	if (!xfs_rtrmapbt_cur_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_rtrmapbt_destroy_cur_cache(void)
+{
+	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
+	xfs_rtrmapbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of an rt reverse mapping btree. */
+void
+xfs_rtrmapbt_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	unsigned int		d_maxlevels, r_maxlevels;
+
+	if (!xfs_has_rtrmapbt(mp)) {
+		mp->m_rtrmap_maxlevels = 0;
+		return;
+	}
+
+	/*
+	 * The realtime rmapbt lives on the data device, which means that its
+	 * maximum height is constrained by the size of the data device and
+	 * the height required to store one rmap record for each block in an
+	 * rt group.
+	 */
+	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
+				mp->m_sb.sb_dblocks);
+	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
+				mp->m_sb.sb_rgblocks);
+
+	/* Add one level to handle the inode root level. */
+	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
new file mode 100644
index 0000000000000..0f73267971924
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTRMAP_BTREE_H__
+#define	__XFS_RTRMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RTRMAP_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		struct xfs_inode *ip);
+struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		struct xbtree_ifakeroot *ifake);
+void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_trans *tp);
+unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
+void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrmapbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_rec_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_rec *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_high_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 sizeof(struct xfs_rmap_key) +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_ptr_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrmap_ptr_t *)
+		((char *)block + XFS_RTRMAP_BLOCK_LEN +
+		 maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+		 (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rtrmapbt_init_cur_cache(void);
+void xfs_rtrmapbt_destroy_cur_cache(void);
+
+#endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8178a8e8097ff..a5ca8f4f8699f 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -28,6 +28,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_swapext.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1125,6 +1126,11 @@ xfs_sb_mount_common(
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
+	mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+	mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false);
+	mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2;
+	mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2;
+
 	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
 	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8ad4b67d6febb..adb742267c9d0 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
@@ -54,6 +55,7 @@ extern const struct xfs_btree_ops xfs_finobt_ops;
 extern const struct xfs_btree_ops xfs_bmbt_ops;
 extern const struct xfs_btree_ops xfs_refcountbt_ops;
 extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
 
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c774ac73bdec5..30d2d0c5e5e53 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -36,6 +36,7 @@
 #include "xfs_ag.h"
 #include "xfs_imeta.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/stats.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -664,8 +665,7 @@ static inline void
 xfs_rtbtree_compute_maxlevels(
 	struct xfs_mount	*mp)
 {
-	/* This will be filled in later. */
-	mp->m_rtbtree_maxlevels = 0;
+	mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels;
 }
 
 /*
@@ -737,6 +737,7 @@ xfs_mountfs(
 	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
 	xfs_mount_setup_inode_geom(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
+	xfs_rtrmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7ef8d5f706883..ada8b281d74e2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -133,11 +133,14 @@ typedef struct xfs_mount {
 	uint			m_bmap_dmnr[2];	/* min bmap btree records */
 	uint			m_rmap_mxr[2];	/* max rmap btree records */
 	uint			m_rmap_mnr[2];	/* min rmap btree records */
+	uint			m_rtrmap_mxr[2]; /* max rtrmap btree records */
+	uint			m_rtrmap_mnr[2]; /* min rtrmap btree records */
 	uint			m_refc_mxr[2];	/* max refc btree records */
 	uint			m_refc_mnr[2];	/* min refc btree records */
 	uint			m_alloc_maxlevels; /* max alloc btree levels */
 	uint			m_bm_maxlevels[2]; /* max bmap btree levels */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
+	uint			m_rtrmap_maxlevels; /* max rtrmap btree level */
 	uint			m_refc_maxlevels; /* max refcount btree level */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
 	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
@@ -369,6 +372,12 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(metadir, METADIR)
 __XFS_HAS_FEAT(rtgroups, RTGROUPS)
 
+static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
+{
+	return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) &&
+	       xfs_has_rmapbt(mp);
+}
+
 /*
  * Mount features
  *


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/39] xfs: realtime rmap btree transaction reservations
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:32   ` [PATCH 04/39] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
@ 2023-12-31 21:33   ` Darrick J. Wong
  2023-12-31 21:33   ` [PATCH 06/39] xfs: add realtime rmap btree operations Darrick J. Wong
                     ` (33 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:33 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that there's enough log reservation to handle mapping
and unmapping realtime extents.  We have to reserve enough space
to handle a split in the rtrmapbt to add the record and a second
split in the regular rmapbt to record the rtrmapbt split.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_swapext.c     |    4 +++-
 fs/xfs/libxfs/xfs_trans_resv.c  |   12 ++++++++++--
 fs/xfs/libxfs/xfs_trans_space.h |   13 +++++++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_swapext.c b/fs/xfs/libxfs/xfs_swapext.c
index 244ef3d8431fd..80d4aa1ec6399 100644
--- a/fs/xfs/libxfs/xfs_swapext.c
+++ b/fs/xfs/libxfs/xfs_swapext.c
@@ -761,7 +761,9 @@ xfs_swapext_rmapbt_blocks(
 	if (!xfs_has_rmapbt(mp))
 		return 0;
 	if (XFS_IS_REALTIME_INODE(req->ip1))
-		return 0;
+		return howmany_64(req->nr_exchanges,
+					XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) *
+			XFS_RTRMAPADD_SPACE_RES(mp);
 
 	return howmany_64(req->nr_exchanges,
 					XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 83922f5e54aed..423b0cede71cb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -214,7 +214,9 @@ xfs_calc_inode_chunk_res(
  * Per-extent log reservation for the btree changes involved in freeing or
  * allocating a realtime extent.  We have to be able to log as many rtbitmap
  * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
- * extents, as well as the realtime summary block.
+ * extents, as well as the realtime summary block (t1).  Realtime rmap btree
+ * operations happen in a second transaction, so factor in a couple of rtrmapbt
+ * splits (t2).
  */
 static unsigned int
 xfs_rtalloc_block_count(
@@ -223,10 +225,16 @@ xfs_rtalloc_block_count(
 {
 	unsigned int		rtbmp_blocks;
 	xfs_rtxlen_t		rtxlen;
+	unsigned int		t1, t2 = 0;
 
 	rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
 	rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
-	return (rtbmp_blocks + 1) * num_ops;
+	t1 = (rtbmp_blocks + 1) * num_ops;
+
+	if (xfs_has_rmapbt(mp))
+		t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1);
+
+	return max(t1, t2);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 1155ff2d37e29..d89b570aafcc6 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -14,6 +14,19 @@
 #define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
 
+/* Worst case number of realtime rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)    \
+		(((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0]))
+
+/* Adding one realtime rmap could split every level to the top of the tree. */
+#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels)
+
+/* Blocks we might need to add "b" realtime rmaps to a tree. */
+#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \
+	((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \
+	  XFS_RTRMAPADD_SPACE_RES(mp))
+
 /* Worst case number of rmaps that can be held in a block. */
 #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/39] xfs: add realtime rmap btree operations
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:33   ` [PATCH 05/39] xfs: realtime rmap btree transaction reservations Darrick J. Wong
@ 2023-12-31 21:33   ` Darrick J. Wong
  2023-12-31 21:33   ` [PATCH 07/39] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
                     ` (32 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:33 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement the generic btree operations needed to manipulate rtrmap
btree blocks. This is different from the regular rmapbt in that we
allocate space from the filesystem at large, and are neither
constrained to the free space nor any particular AG.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c        |   71 ++++++++++
 fs/xfs/libxfs/xfs_btree.h        |    5 +
 fs/xfs/libxfs/xfs_rtrmap_btree.c |  271 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 347 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c8a2ce10cccc8..a294641d91832 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -33,6 +33,10 @@
 #include "xfs_btree_mem.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_quota.h"
+#include "xfs_imeta.h"
 
 /*
  * Btree magic numbers.
@@ -5579,3 +5583,70 @@ xfs_btree_goto_left_edge(
 
 	return 0;
 }
+
+/* Allocate a block for an inode-rooted metadata btree. */
+int
+xfs_btree_alloc_imeta_block(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*start,
+	union xfs_btree_ptr		*new,
+	int				*stat)
+{
+	struct xfs_alloc_arg		args = {
+		.mp			= cur->bc_mp,
+		.tp			= cur->bc_tp,
+		.resv			= XFS_AG_RESV_IMETA,
+		.minlen			= 1,
+		.maxlen			= 1,
+		.prod			= 1,
+	};
+	struct xfs_inode		*ip = cur->bc_ino.ip;
+	int				error;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(cur->bc_mp, ip));
+
+	xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork);
+	error = xfs_alloc_vextent_start_ag(&args,
+			XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino));
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+
+	xfs_imeta_resv_alloc_extent(ip, &args);
+	cur->bc_ino.allocated++;
+
+	new->l = cpu_to_be64(args.fsbno);
+	*stat = 1;
+	return 0;
+}
+
+/* Free a block from an inode-rooted metadata btree. */
+int
+xfs_btree_free_imeta_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_inode	*ip = cur->bc_ino.ip;
+	struct xfs_trans	*tp = cur->bc_tp;
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+	int			error;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(XFS_IS_DQDETACHED(cur->bc_mp, ip));
+
+	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+	error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_IMETA,
+			0);
+	if (error)
+		return error;
+
+	xfs_imeta_resv_free_extent(ip, tp, 1);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index e6571c9157d1e..3559cf5d3a653 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -764,4 +764,9 @@ void xfs_btree_destroy_cur_caches(void);
 
 int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
 
+int xfs_btree_alloc_imeta_block(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *start, union xfs_btree_ptr *newp,
+		int *stat);
+int xfs_btree_free_imeta_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index ecacb457cd27f..5be3e1af55684 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -18,12 +18,14 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
+#include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
 #include "xfs_rtgroup.h"
+#include "xfs_bmap.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -52,6 +54,182 @@ xfs_rtrmapbt_dup_cursor(
 	return new;
 }
 
+STATIC int
+xfs_rtrmapbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_rtrmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrmapbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0);
+	}
+
+	return cur->bc_mp->m_rtrmap_mxr[level != 0];
+}
+
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
+STATIC void
+xfs_rtrmapbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	key->rmap.rm_startblock = rec->rmap.rm_startblock;
+	key->rmap.rm_owner = rec->rmap.rm_owner;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+}
+
+STATIC void
+xfs_rtrmapbt_init_high_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	uint64_t			off;
+	int				adj;
+
+	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+	key->rmap.rm_startblock = rec->rmap.rm_startblock;
+	be32_add_cpu(&key->rmap.rm_startblock, adj);
+	key->rmap.rm_owner = rec->rmap.rm_owner;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+		return;
+	off = be64_to_cpu(key->rmap.rm_offset);
+	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+	key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rtrmapbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+	rec->rmap.rm_offset = cpu_to_be64(
+			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rtrmapbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
+	const struct xfs_rmap_key	*kp = &key->rmap;
+	__u64				x, y;
+	int64_t				d;
+
+	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+	if (d)
+		return d;
+
+	x = be64_to_cpu(kp->rm_owner);
+	y = rec->rm_owner;
+	if (x > y)
+		return 1;
+	else if (y > x)
+		return -1;
+
+	x = offset_keymask(be64_to_cpu(kp->rm_offset));
+	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
+	if (x > y)
+		return 1;
+	else if (y > x)
+		return -1;
+	return 0;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	const struct xfs_rmap_key	*kp1 = &k1->rmap;
+	const struct xfs_rmap_key	*kp2 = &k2->rmap;
+	int64_t				d;
+	__u64				x, y;
+
+	/* Doesn't make sense to mask off the physical space part */
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
+	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+		     be32_to_cpu(kp2->rm_startblock);
+	if (d)
+		return d;
+
+	if (!mask || mask->rmap.rm_owner) {
+		x = be64_to_cpu(kp1->rm_owner);
+		y = be64_to_cpu(kp2->rm_owner);
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
+
+	if (!mask || mask->rmap.rm_offset) {
+		/* Doesn't make sense to allow offset but not owner */
+		ASSERT(!mask || mask->rmap.rm_owner);
+
+		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
+
+	return 0;
+}
+
 static xfs_failaddr_t
 xfs_rtrmapbt_verify(
 	struct xfs_buf		*bp)
@@ -118,6 +296,86 @@ const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
 	.verify_struct		= xfs_rtrmapbt_verify,
 };
 
+STATIC int
+xfs_rtrmapbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	uint32_t			x;
+	uint32_t			y;
+	uint64_t			a;
+	uint64_t			b;
+
+	x = be32_to_cpu(k1->rmap.rm_startblock);
+	y = be32_to_cpu(k2->rmap.rm_startblock);
+	if (x < y)
+		return 1;
+	else if (x > y)
+		return 0;
+	a = be64_to_cpu(k1->rmap.rm_owner);
+	b = be64_to_cpu(k2->rmap.rm_owner);
+	if (a < b)
+		return 1;
+	else if (a > b)
+		return 0;
+	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
+	if (a <= b)
+		return 1;
+	return 0;
+}
+
+STATIC int
+xfs_rtrmapbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	uint32_t			x;
+	uint32_t			y;
+	uint64_t			a;
+	uint64_t			b;
+
+	x = be32_to_cpu(r1->rmap.rm_startblock);
+	y = be32_to_cpu(r2->rmap.rm_startblock);
+	if (x < y)
+		return 1;
+	else if (x > y)
+		return 0;
+	a = be64_to_cpu(r1->rmap.rm_owner);
+	b = be64_to_cpu(r2->rmap.rm_owner);
+	if (a < b)
+		return 1;
+	else if (a > b)
+		return 0;
+	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
+	if (a <= b)
+		return 1;
+	return 0;
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrmapbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
+	/*
+	 * We only support checking contiguity of the physical space component.
+	 * If any callers ever need more specificity than that, they'll have to
+	 * implement it here.
+	 */
+	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+				 be32_to_cpu(key2->rmap.rm_startblock));
+}
+
 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.rec_len		= sizeof(struct xfs_rmap_rec),
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
@@ -127,7 +385,20 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 				  XFS_BTREE_IROOT_RECORDS,
 
 	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
+	.alloc_block		= xfs_btree_alloc_imeta_block,
+	.free_block		= xfs_btree_free_imeta_block,
+	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
+	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
+	.key_diff		= xfs_rtrmapbt_key_diff,
 	.buf_ops		= &xfs_rtrmapbt_buf_ops,
+	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
+	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
 };
 
 /* Initialize a new rt rmap btree cursor. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/39] xfs: prepare rmap functions to deal with rtrmapbt
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:33   ` [PATCH 06/39] xfs: add realtime rmap btree operations Darrick J. Wong
@ 2023-12-31 21:33   ` Darrick J. Wong
  2023-12-31 21:33   ` [PATCH 08/39] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
                     ` (31 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:33 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the high-level rmap functions to deal with the new realtime
rmapbt and its slightly different conventions.  Provide the ability
to talk to either rmapbt or rtrmapbt formats from the same high
level code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |   66 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rmap.h |    3 ++
 2 files changed, 69 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index b3383cc474492..35df4e832996e 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -25,6 +25,7 @@
 #include "xfs_ag.h"
 #include "xfs_health.h"
 #include "xfs_rmap_item.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -264,11 +265,72 @@ xfs_rmap_check_irec(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_rtrmap_check_irec(
+	struct xfs_rtgroup		*rtg,
+	const struct xfs_rmap_irec	*irec)
+{
+	struct xfs_mount		*mp = rtg->rtg_mount;
+	bool				is_inode;
+	bool				is_unwritten;
+	bool				is_bmbt;
+	bool				is_attr;
+
+	if (irec->rm_blockcount == 0)
+		return __this_address;
+
+	if (irec->rm_owner == XFS_RMAP_OWN_FS) {
+		if (irec->rm_startblock != 0)
+			return __this_address;
+		if (irec->rm_blockcount != mp->m_sb.sb_rextsize)
+			return __this_address;
+		if (irec->rm_offset != 0)
+			return __this_address;
+	} else {
+		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+					    irec->rm_blockcount))
+			return __this_address;
+	}
+
+	if (!(xfs_verify_ino(mp, irec->rm_owner) ||
+	      (irec->rm_owner <= XFS_RMAP_OWN_FS &&
+	       irec->rm_owner >= XFS_RMAP_OWN_MIN)))
+		return __this_address;
+
+	/* Check flags. */
+	is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+	if (!is_inode && irec->rm_owner != XFS_RMAP_OWN_FS)
+		return __this_address;
+
+	if (!is_inode && irec->rm_offset != 0)
+		return __this_address;
+
+	if (is_bmbt || is_attr)
+		return __this_address;
+
+	if (is_unwritten && !is_inode)
+		return __this_address;
+
+	/* Check for a valid fork offset, if applicable. */
+	if (is_inode &&
+	    !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+		return __this_address;
+
+	return NULL;
+}
+
 static inline xfs_failaddr_t
 xfs_rmap_check_btrec(
 	struct xfs_btree_cur		*cur,
 	const struct xfs_rmap_irec	*irec)
 {
+	if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+		return xfs_rtrmap_check_irec(cur->bc_ino.rtg, irec);
+
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
 	return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
@@ -285,6 +347,10 @@ xfs_rmap_complain_bad_rec(
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		xfs_warn(mp,
  "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+	else if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+		xfs_warn(mp,
+ "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!",
+				cur->bc_ino.rtg->rtg_rgno, fa);
 	else
 		xfs_warn(mp,
  "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 0ccfd7d88e56e..762f2f40b6e47 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,6 +7,7 @@
 #define __XFS_RMAP_H__
 
 struct xfs_perag;
+struct xfs_rtgroup;
 
 static inline void
 xfs_rmap_ino_bmbt_owner(
@@ -206,6 +207,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_rmap_irec *irec);
 xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
 		const struct xfs_rmap_irec *irec);
+xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg,
+		const struct xfs_rmap_irec *irec);
 
 int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, enum xbtree_recpacking *outcome);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/39] xfs: add a realtime flag to the rmap update log redo items
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:33   ` [PATCH 07/39] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
@ 2023-12-31 21:33   ` Darrick J. Wong
  2023-12-31 21:34   ` [PATCH 09/39] xfs: support recovering rmap intent items targetting realtime extents Darrick J. Wong
                     ` (30 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:33 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extend the rmap update (RUI) log items with a new realtime flag that
indicates that the updates apply against the realtime rmapbt.  We'll
wire up the actual rmap code later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.h       |    1 
 fs/xfs/libxfs/xfs_log_format.h  |    6 +
 fs/xfs/libxfs/xfs_log_recover.h |    2 
 fs/xfs/libxfs/xfs_refcount.c    |    4 -
 fs/xfs/libxfs/xfs_rmap.c        |   32 ++++-
 fs/xfs/libxfs/xfs_rmap.h        |   12 +-
 fs/xfs/scrub/alloc_repair.c     |    2 
 fs/xfs/xfs_log_recover.c        |    2 
 fs/xfs/xfs_rmap_item.c          |  237 +++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_trace.h              |   23 +++-
 10 files changed, 293 insertions(+), 28 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index b4e1c386768c9..fddcb4cccbcc2 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -69,6 +69,7 @@ struct xfs_defer_op_type {
 extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
 extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 1f5fe4a588eca..ea4e88d665707 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -250,6 +250,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_SXD		0x1249  /* extent swap done */
 #define	XFS_LI_EFI_RT		0x124a	/* realtime extent free intent */
 #define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
+#define	XFS_LI_RUI_RT		0x124c	/* realtime rmap update intent */
+#define	XFS_LI_RUD_RT		0x124d	/* realtime rmap update done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -271,7 +273,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_SXI,		"XFS_LI_SXI" }, \
 	{ XFS_LI_SXD,		"XFS_LI_SXD" }, \
 	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
-	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }
+	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }, \
+	{ XFS_LI_RUI_RT,	"XFS_LI_RUI_RT" }, \
+	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }
 
 /*
  * Inode Log Item Format definitions.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 811c37026d251..433974693d10b 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -79,6 +79,8 @@ extern const struct xlog_recover_item_ops xlog_sxi_item_ops;
 extern const struct xlog_recover_item_ops xlog_sxd_item_ops;
 extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
 extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrud_item_ops;
 
 /*
  * Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6f7ec83281656..7f4433b2a5dd3 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1887,7 +1887,7 @@ xfs_refcount_alloc_cow_extent(
 	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
@@ -1903,7 +1903,7 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
 	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 35df4e832996e..daef2d67eb7a0 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2682,6 +2682,21 @@ xfs_rmap_finish_one(
 	return 0;
 }
 
+/*
+ * Process one of the deferred realtime rmap operations.  We pass back the
+ * btree cursor to reduce overhead.
+ */
+int
+xfs_rtrmap_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_rmap_intent		*ri,
+	struct xfs_btree_cur		**pcur)
+{
+	/* coming in a subsequent patch */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Don't defer an rmap if we aren't an rmap filesystem.
  */
@@ -2702,6 +2717,7 @@ __xfs_rmap_add(
 	struct xfs_trans		*tp,
 	enum xfs_rmap_intent_type	type,
 	uint64_t			owner,
+	bool				isrt,
 	int				whichfork,
 	struct xfs_bmbt_irec		*bmap)
 {
@@ -2713,6 +2729,7 @@ __xfs_rmap_add(
 	ri->ri_owner = owner;
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
+	ri->ri_realtime = isrt;
 
 	xfs_rmap_defer_add(tp, ri);
 }
@@ -2726,6 +2743,7 @@ xfs_rmap_map_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
 		return;
@@ -2733,7 +2751,7 @@ xfs_rmap_map_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_MAP_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /* Unmap an extent out of a file. */
@@ -2745,6 +2763,7 @@ xfs_rmap_unmap_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
 		return;
@@ -2752,7 +2771,7 @@ xfs_rmap_unmap_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_UNMAP_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /*
@@ -2770,6 +2789,7 @@ xfs_rmap_convert_extent(
 	struct xfs_bmbt_irec	*PREV)
 {
 	enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
 
 	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return;
@@ -2777,13 +2797,14 @@ xfs_rmap_convert_extent(
 	if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
 		type = XFS_RMAP_CONVERT_SHARED;
 
-	__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+	__xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
 }
 
 /* Schedule the creation of an rmap for non-file data. */
 void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
+	bool			isrt,
 	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
@@ -2798,13 +2819,14 @@ xfs_rmap_alloc_extent(
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap);
 }
 
 /* Schedule the deletion of an rmap for non-file data. */
 void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
+	bool			isrt,
 	xfs_fsblock_t		fsbno,
 	xfs_extlen_t		len,
 	uint64_t		owner)
@@ -2819,7 +2841,7 @@ xfs_rmap_free_extent(
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap);
 }
 
 /* Compare rmap records.  Returns -1 if a < b, 1 if a > b, and 0 if equal. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 762f2f40b6e47..3719fc4cbc26b 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -174,7 +174,11 @@ struct xfs_rmap_intent {
 	int					ri_whichfork;
 	uint64_t				ri_owner;
 	struct xfs_bmbt_irec			ri_bmap;
-	struct xfs_perag			*ri_pag;
+	union {
+		struct xfs_perag		*ri_pag;
+		struct xfs_rtgroup		*ri_rtg;
+	};
+	bool					ri_realtime;
 };
 
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
@@ -185,11 +189,13 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
 		xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_fsblock_t fsbno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
 		xfs_extlen_t len, uint64_t owner);
 
+int xfs_rtrmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
+		struct xfs_btree_cur **pcur);
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
 int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 3805099cb578b..d4ea1afb238a0 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -546,7 +546,7 @@ xrep_abt_dispose_one(
 		xfs_fsblock_t	fsbno;
 
 		fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, resv->agbno);
-		xfs_rmap_alloc_extent(sc->tp, fsbno, resv->used,
+		xfs_rmap_alloc_extent(sc->tp, false, fsbno, resv->used,
 				XFS_RMAP_OWN_AG);
 	}
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0aeca77d511d0..1efb69fcadf10 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1795,6 +1795,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
 	&xlog_sxd_item_ops,
 	&xlog_rtefi_item_ops,
 	&xlog_rtefd_item_ops,
+	&xlog_rtrui_item_ops,
+	&xlog_rtrud_item_ops,
 };
 
 static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index e2ee1b6719202..229b5127d4716 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -23,6 +23,7 @@
 #include "xfs_ag.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_rui_cache;
 struct kmem_cache	*xfs_rud_cache;
@@ -94,7 +95,9 @@ xfs_rui_item_format(
 	ASSERT(atomic_read(&ruip->rui_next_extent) ==
 			ruip->rui_format.rui_nextents);
 
-	ruip->rui_format.rui_type = XFS_LI_RUI;
+	ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT);
+
+	ruip->rui_format.rui_type = lip->li_type;
 	ruip->rui_format.rui_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
@@ -137,19 +140,22 @@ xfs_rui_item_release(
 STATIC struct xfs_rui_log_item *
 xfs_rui_init(
 	struct xfs_mount		*mp,
+	unsigned short			item_type,
 	uint				nextents)
 
 {
 	struct xfs_rui_log_item		*ruip;
 
 	ASSERT(nextents > 0);
+	ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT);
+
 	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
 		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
 	else
 		ruip = kmem_cache_zalloc(xfs_rui_cache,
 					 GFP_KERNEL | __GFP_NOFAIL);
 
-	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+	xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
 	ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
 	atomic_set(&ruip->rui_next_extent, 0);
@@ -188,7 +194,9 @@ xfs_rud_item_format(
 	struct xfs_rud_log_item	*rudp = RUD_ITEM(lip);
 	struct xfs_log_iovec	*vecp = NULL;
 
-	rudp->rud_format.rud_type = XFS_LI_RUD;
+	ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT);
+
+	rudp->rud_format.rud_type = lip->li_type;
 	rudp->rud_format.rud_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format,
@@ -232,6 +240,14 @@ static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e)
 	return list_entry(e, struct xfs_rmap_intent, ri_list);
 }
 
+static inline bool
+xfs_rui_item_isrt(const struct xfs_log_item *lip)
+{
+	ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT);
+
+	return lip->li_type == XFS_LI_RUI_RT;
+}
+
 /* Sort rmap intents by AG. */
 static int
 xfs_rmap_update_diff_items(
@@ -311,11 +327,12 @@ xfs_rmap_update_create_intent(
 	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_rui_log_item		*ruip = xfs_rui_init(mp, count);
+	struct xfs_rui_log_item		*ruip;
 	struct xfs_rmap_intent		*ri;
 
 	ASSERT(count > 0);
 
+	ruip = xfs_rui_init(mp, XFS_LI_RUI, count);
 	if (sort)
 		list_sort(mp, items, xfs_rmap_update_diff_items);
 	list_for_each_entry(ri, items, ri_list)
@@ -323,6 +340,12 @@ xfs_rmap_update_create_intent(
 	return &ruip->rui_item;
 }
 
+static inline unsigned short
+xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip)
+{
+	return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD;
+}
+
 /* Get an RUD so we can process all the deferred rmap updates. */
 static struct xfs_log_item *
 xfs_rmap_update_create_done(
@@ -334,8 +357,8 @@ xfs_rmap_update_create_done(
 	struct xfs_rud_log_item		*rudp;
 
 	rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
-	xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
-			  &xfs_rud_item_ops);
+	xfs_log_item_init(tp->t_mountp, &rudp->rud_item,
+			xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops);
 	rudp->rud_ruip = ruip;
 	rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
 
@@ -352,8 +375,23 @@ xfs_rmap_defer_add(
 
 	trace_xfs_rmap_defer(mp, ri);
 
-	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	/*
+	 * Deferred rmap updates for the realtime and data sections must use
+	 * separate transactions to finish deferred work because updates to
+	 * realtime metadata files can lock AGFs to allocate btree blocks and
+	 * we don't want that mixing with the AGF locks taken to finish data
+	 * section updates.
+	 */
+	if (ri->ri_realtime) {
+		xfs_rgnumber_t	rgno;
+
+		rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
+		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+		xfs_defer_add(tp, &ri->ri_list, &xfs_rtrmap_update_defer_type);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+		xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+	}
 }
 
 /* Cancel a deferred rmap update. */
@@ -564,10 +602,12 @@ xfs_rmap_relog_intent(
 	struct xfs_map_extent		*map;
 	unsigned int			count;
 
+	ASSERT(intent->li_type == XFS_LI_RUI || intent->li_type == XFS_LI_RUI_RT);
+
 	count = RUI_ITEM(intent)->rui_format.rui_nextents;
 	map = RUI_ITEM(intent)->rui_format.rui_extents;
 
-	ruip = xfs_rui_init(tp->t_mountp, count);
+	ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count);
 	memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
 	atomic_set(&ruip->rui_next_extent, count);
 
@@ -587,6 +627,98 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.relog_intent	= xfs_rmap_relog_intent,
 };
 
+#ifdef CONFIG_XFS_RT
+/* Sort rmap intents by rtgroup. */
+static int
+xfs_rtrmap_update_diff_items(
+	void				*priv,
+	const struct list_head		*a,
+	const struct list_head		*b)
+{
+	struct xfs_rmap_intent		*ra = ri_entry(a);
+	struct xfs_rmap_intent		*rb = ri_entry(b);
+
+	return ra->ri_rtg->rtg_rgno - rb->ri_rtg->rtg_rgno;
+}
+
+static struct xfs_log_item *
+xfs_rtrmap_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rui_log_item		*ruip;
+	struct xfs_rmap_intent		*ri;
+
+	ASSERT(count > 0);
+
+	ruip = xfs_rui_init(mp, XFS_LI_RUI_RT, count);
+	if (sort)
+		list_sort(mp, items, xfs_rtrmap_update_diff_items);
+	list_for_each_entry(ri, items, ri_list)
+		xfs_rmap_update_log_item(tp, ruip, ri);
+	return &ruip->rui_item;
+}
+
+/* Cancel a deferred realtime rmap update. */
+STATIC void
+xfs_rtrmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+
+	xfs_rtgroup_put(ri->ri_rtg);
+	kmem_cache_free(xfs_rmap_intent_cache, ri);
+}
+
+/* Process a deferred realtime rmap update. */
+STATIC int
+xfs_rtrmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_rmap_intent		*ri = ri_entry(item);
+	int				error;
+
+	error = xfs_rtrmap_finish_one(tp, ri, state);
+
+	xfs_rtrmap_update_cancel_item(item);
+	return error;
+}
+
+/* Clean up after calling xfs_rtrmap_finish_one. */
+STATIC void
+xfs_rtrmap_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	if (rcur)
+		xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
+	.name		= "rtrmap",
+	.max_items	= XFS_RUI_MAX_FAST_EXTENTS,
+	.create_intent	= xfs_rtrmap_update_create_intent,
+	.abort_intent	= xfs_rmap_update_abort_intent,
+	.create_done	= xfs_rmap_update_create_done,
+	.finish_item	= xfs_rtrmap_update_finish_item,
+	.finish_cleanup = xfs_rtrmap_finish_one_cleanup,
+	.cancel_item	= xfs_rtrmap_update_cancel_item,
+	.recover_work	= xfs_rmap_recover_work,
+	.relog_intent	= xfs_rmap_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = {
+	.name		= "rtrmap",
+};
+#endif
+
 STATIC bool
 xfs_rui_item_match(
 	struct xfs_log_item	*lip,
@@ -652,7 +784,7 @@ xlog_recover_rui_commit_pass2(
 		return -EFSCORRUPTED;
 	}
 
-	ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+	ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents);
 	xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
 	atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
 
@@ -666,6 +798,61 @@ const struct xlog_recover_item_ops xlog_rui_item_ops = {
 	.commit_pass2		= xlog_recover_rui_commit_pass2,
 };
 
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtrui_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_rui_log_item		*ruip;
+	struct xfs_rui_log_format	*rui_formatp;
+	size_t				len;
+
+	rui_formatp = item->ri_buf[0].i_addr;
+
+	if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+	if (item->ri_buf[0].i_len != len) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents);
+	xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
+	atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+
+	xlog_recover_intent_item(log, &ruip->rui_item, lsn,
+			&xfs_rtrmap_update_defer_type);
+	return 0;
+}
+#else
+STATIC int
+xlog_recover_rtrui_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+			item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+	return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtrui_item_ops = {
+	.item_type		= XFS_LI_RUI_RT,
+	.commit_pass2		= xlog_recover_rtrui_commit_pass2,
+};
+
 /*
  * This routine is called when an RUD format structure is found in a committed
  * transaction in the log. Its purpose is to cancel the corresponding RUI if it
@@ -697,3 +884,33 @@ const struct xlog_recover_item_ops xlog_rud_item_ops = {
 	.item_type		= XFS_LI_RUD,
 	.commit_pass2		= xlog_recover_rud_commit_pass2,
 };
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtrud_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_rud_log_format	*rud_formatp;
+
+	rud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+				rud_formatp, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	xlog_recover_release_intent(log, XFS_LI_RUI_RT,
+			rud_formatp->rud_rui_id);
+	return 0;
+}
+#else
+# define xlog_recover_rtrud_commit_pass2	xlog_recover_rtrui_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtrud_item_ops = {
+	.item_type		= XFS_LI_RUD_RT,
+	.commit_pass2		= xlog_recover_rtrud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1c89d38b85446..10eeceb8b9e7f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3017,9 +3017,10 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 	TP_ARGS(mp, ri),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(unsigned long long, owner)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, rmapbno)
 		__field(int, whichfork)
 		__field(xfs_fileoff_t, l_loff)
 		__field(xfs_filblks_t, l_len)
@@ -3028,9 +3029,18 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
-		__entry->agbno = XFS_FSB_TO_AGBNO(mp,
-					ri->ri_bmap.br_startblock);
+		if (ri->ri_realtime) {
+			__entry->opdev = mp->m_rtdev_targp->bt_dev;
+			__entry->rmapbno = xfs_rtb_to_rgbno(mp,
+					ri->ri_bmap.br_startblock,
+					&__entry->agno);
+		} else {
+			__entry->agno = XFS_FSB_TO_AGNO(mp,
+						ri->ri_bmap.br_startblock);
+			__entry->opdev = __entry->dev;
+			__entry->rmapbno = XFS_FSB_TO_AGBNO(mp,
+						ri->ri_bmap.br_startblock);
+		}
 		__entry->owner = ri->ri_owner;
 		__entry->whichfork = ri->ri_whichfork;
 		__entry->l_loff = ri->ri_bmap.br_startoff;
@@ -3038,11 +3048,12 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
 		__entry->l_state = ri->ri_bmap.br_state;
 		__entry->op = ri->ri_type;
 	),
-	TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+	TP_printk("dev %d:%d op %s opdev %d:%d agno 0x%x rmapbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->rmapbno,
 		  __entry->owner,
 		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
 		  __entry->l_loff,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/39] xfs: support recovering rmap intent items targetting realtime extents
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:33   ` [PATCH 08/39] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
@ 2023-12-31 21:34   ` Darrick J. Wong
  2023-12-31 21:34   ` [PATCH 10/39] xfs: add realtime rmap btree block detection to log recovery Darrick J. Wong
                     ` (29 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:34 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we have rmap on the realtime device, log recovery has to
support remapping extents on the realtime volume.  Make this work.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rmap_item.c |   21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 229b5127d4716..580baf3b1b1d3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -451,6 +451,7 @@ xfs_rmap_update_abort_intent(
 static inline bool
 xfs_rui_validate_map(
 	struct xfs_mount		*mp,
+	bool				isrt,
 	struct xfs_map_extent		*map)
 {
 	if (!xfs_has_rmapbt(mp))
@@ -480,6 +481,9 @@ xfs_rui_validate_map(
 	if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
 		return false;
 
+	if (isrt)
+		return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
 	return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
 }
 
@@ -487,6 +491,7 @@ static inline void
 xfs_rui_recover_work(
 	struct xfs_mount		*mp,
 	struct xfs_defer_pending	*dfp,
+	bool				isrt,
 	const struct xfs_map_extent	*map)
 {
 	struct xfs_rmap_intent		*ri;
@@ -531,7 +536,15 @@ xfs_rui_recover_work(
 	ri->ri_bmap.br_blockcount = map->me_len;
 	ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
 			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-	ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
+	ri->ri_realtime = isrt;
+	if (isrt) {
+		xfs_rgnumber_t	rgno;
+
+		rgno = xfs_rtb_to_rgno(mp, map->me_startblock);
+		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
+	}
 
 	xfs_defer_add_item(dfp, &ri->ri_list);
 }
@@ -550,6 +563,7 @@ xfs_rmap_recover_work(
 	struct xfs_rui_log_item		*ruip = RUI_ITEM(lip);
 	struct xfs_trans		*tp;
 	struct xfs_mount		*mp = lip->li_log->l_mp;
+	bool				isrt = xfs_rui_item_isrt(lip);
 	int				i;
 	int				error = 0;
 
@@ -559,7 +573,7 @@ xfs_rmap_recover_work(
 	 * just toss the RUI.
 	 */
 	for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
-		if (!xfs_rui_validate_map(mp,
+		if (!xfs_rui_validate_map(mp, isrt,
 					&ruip->rui_format.rui_extents[i])) {
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					&ruip->rui_format,
@@ -567,7 +581,8 @@ xfs_rmap_recover_work(
 			return -EFSCORRUPTED;
 		}
 
-		xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]);
+		xfs_rui_recover_work(mp, dfp, isrt,
+				&ruip->rui_format.rui_extents[i]);
 	}
 
 	resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/39] xfs: add realtime rmap btree block detection to log recovery
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:34   ` [PATCH 09/39] xfs: support recovering rmap intent items targetting realtime extents Darrick J. Wong
@ 2023-12-31 21:34   ` Darrick J. Wong
  2023-12-31 21:34   ` [PATCH 11/39] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
                     ` (28 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:34 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Identify rtrmapbt blocks in the log correctly so that we can
validate them during log recovery.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_buf_item_recover.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 2e617041161e0..c7d86636bd312 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -259,6 +259,9 @@ xlog_recover_validate_buf_type(
 		case XFS_BMAP_MAGIC:
 			bp->b_ops = &xfs_bmbt_buf_ops;
 			break;
+		case XFS_RTRMAP_CRC_MAGIC:
+			bp->b_ops = &xfs_rtrmapbt_buf_ops;
+			break;
 		case XFS_RMAP_CRC_MAGIC:
 			bp->b_ops = &xfs_rmapbt_buf_ops;
 			break;
@@ -768,6 +771,7 @@ xlog_recover_get_buf_lsn(
 		uuid = &btb->bb_u.s.bb_uuid;
 		break;
 	}
+	case XFS_RTRMAP_CRC_MAGIC:
 	case XFS_BMAP_CRC_MAGIC:
 	case XFS_BMAP_MAGIC: {
 		struct xfs_btree_block *btb = blk;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/39] xfs: add realtime reverse map inode to metadata directory
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:34   ` [PATCH 10/39] xfs: add realtime rmap btree block detection to log recovery Darrick J. Wong
@ 2023-12-31 21:34   ` Darrick J. Wong
  2023-12-31 21:34   ` [PATCH 12/39] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
                     ` (27 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:34 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a metadir path to select the realtime rmap btree inode and load
it at mount time.  The rtrmapbt inode will have a unique extent format
code, which means that we also have to update the inode validation and
flush routines to look for it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h       |    6 ++-
 fs/xfs/libxfs/xfs_inode_buf.c    |   10 +++++
 fs/xfs/libxfs/xfs_inode_fork.c   |    9 +++++
 fs/xfs/libxfs/xfs_rtgroup.h      |    3 ++
 fs/xfs/libxfs/xfs_rtrmap_btree.c |   33 ++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |    4 ++
 fs/xfs/xfs_inode.c               |   19 ++++++++++
 fs/xfs/xfs_inode_item.c          |    2 +
 fs/xfs/xfs_inode_item_recover.c  |    1 +
 fs/xfs/xfs_rtalloc.c             |   71 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h               |    1 +
 11 files changed, 156 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 5317c6438f070..d374240fc58c0 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1026,7 +1026,8 @@ enum xfs_dinode_fmt {
 	XFS_DINODE_FMT_LOCAL,		/* bulk data */
 	XFS_DINODE_FMT_EXTENTS,		/* struct xfs_bmbt_rec */
 	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
-	XFS_DINODE_FMT_UUID		/* added long ago, but never used */
+	XFS_DINODE_FMT_UUID,		/* added long ago, but never used */
+	XFS_DINODE_FMT_RMAP,		/* reverse mapping btree */
 };
 
 #define XFS_INODE_FORMAT_STR \
@@ -1034,7 +1035,8 @@ enum xfs_dinode_fmt {
 	{ XFS_DINODE_FMT_LOCAL,		"local" }, \
 	{ XFS_DINODE_FMT_EXTENTS,	"extent" }, \
 	{ XFS_DINODE_FMT_BTREE,		"btree" }, \
-	{ XFS_DINODE_FMT_UUID,		"uuid" }
+	{ XFS_DINODE_FMT_UUID,		"uuid" }, \
+	{ XFS_DINODE_FMT_RMAP,		"rmap" }
 
 /*
  * Max values for extnum and aextnum.
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 18d9e71a3bb30..5ed779cbe6f9f 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -411,6 +411,12 @@ xfs_dinode_verify_fork(
 		if (di_nextents > max_extents)
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!xfs_has_rtrmapbt(mp))
+			return __this_address;
+		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+			return __this_address;
+		break;
 	default:
 		return __this_address;
 	}
@@ -430,6 +436,10 @@ xfs_dinode_verify_forkoff(
 		if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp)))
+			return __this_address;
+		fallthrough;
 	case XFS_DINODE_FMT_LOCAL:	/* fall through ... */
 	case XFS_DINODE_FMT_EXTENTS:    /* fall through ... */
 	case XFS_DINODE_FMT_BTREE:
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 16543bb873a81..eb7e733b30638 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -264,6 +264,11 @@ xfs_iformat_data_fork(
 			return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 		case XFS_DINODE_FMT_BTREE:
 			return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+		case XFS_DINODE_FMT_RMAP:
+			if (!xfs_has_rtrmapbt(ip->i_mount))
+				return -EFSCORRUPTED;
+			ASSERT(0); /* to be implemented later */
+			return -EFSCORRUPTED;
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -653,6 +658,10 @@ xfs_iflush_fork(
 		}
 		break;
 
+	case XFS_DINODE_FMT_RMAP:
+		ASSERT(0); /* to be implemented later */
+		break;
+
 	default:
 		ASSERT(0);
 		break;
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 0a63f14b5aa0f..77503bda35563 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -22,6 +22,9 @@ struct xfs_rtgroup {
 	/* for rcu-safe freeing */
 	struct rcu_head		rcu_head;
 
+	/* reverse mapping btree inode */
+	struct xfs_inode	*rtg_rmapip;
+
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 5be3e1af55684..e60864dd15030 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -18,6 +18,7 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
+#include "xfs_imeta.h"
 #include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
@@ -476,6 +477,7 @@ xfs_rtrmapbt_commit_staged_btree(
 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_RMAP);
 
 	/*
 	 * Free any resources hanging off the real fork, then shallow-copy the
@@ -576,3 +578,34 @@ xfs_rtrmapbt_compute_maxlevels(
 	/* Add one level to handle the inode root level. */
 	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
 }
+
+#define XFS_RTRMAP_NAMELEN		17
+
+/* Create the metadata directory path for an rtrmap btree inode. */
+int
+xfs_rtrmapbt_create_path(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*path;
+	unsigned char		*fname;
+	int			error;
+
+	error = xfs_imeta_create_file_path(mp, 2, &path);
+	if (error)
+		return error;
+
+	fname = kmalloc(XFS_RTRMAP_NAMELEN, GFP_KERNEL);
+	if (!fname) {
+		xfs_imeta_free_path(path);
+		return -ENOMEM;
+	}
+
+	snprintf(fname, XFS_RTRMAP_NAMELEN, "%u.rmap", rgno);
+	path->im_path[0] = "realtime";
+	path->im_path[1] = fname;
+	path->im_dynamicmask = 0x2;
+	*pathp = path;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 0f73267971924..29b698660182d 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_cur;
 struct xfs_mount;
 struct xbtree_ifakeroot;
 struct xfs_rtgroup;
+struct xfs_imeta_path;
 
 /* rmaps only exist on crc enabled filesystems */
 #define XFS_RTRMAP_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
@@ -80,4 +81,7 @@ unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
 int __init xfs_rtrmapbt_init_cur_cache(void);
 void xfs_rtrmapbt_destroy_cur_cache(void);
 
+int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_imeta_path **pathp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 78afc5b8e11c6..b7cda81161fb5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2501,7 +2501,15 @@ xfs_iflush(
 			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
 		goto flush_out;
 	}
-	if (S_ISREG(VFS_I(ip)->i_mode)) {
+	if (ip->i_df.if_format == XFS_DINODE_FMT_RMAP) {
+		if (!S_ISREG(VFS_I(ip)->i_mode) ||
+		    !(ip->i_diflags2 & XFS_DIFLAG2_METADIR)) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad rt rmapbt inode %Lu, ptr "PTR_FMT,
+				__func__, ip->i_ino, ip);
+			goto flush_out;
+		}
+	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
@@ -2541,6 +2549,15 @@ xfs_iflush(
 		goto flush_out;
 	}
 
+	if (xfs_inode_has_attr_fork(ip)) {
+		if (ip->i_af.if_format == XFS_DINODE_FMT_RMAP) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: rt rmapbt in inode %Lu attr fork, ptr "PTR_FMT,
+				__func__, ip->i_ino, ip);
+			goto flush_out;
+		}
+	}
+
 	/*
 	 * Inode item log recovery for v2 inodes are dependent on the flushiter
 	 * count for correct sequencing.  We bump the flush iteration count so
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b35335e20342c..2903c101505f5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -210,6 +210,7 @@ xfs_inode_item_data_fork_size(
 		}
 		break;
 	case XFS_DINODE_FMT_BTREE:
+	case XFS_DINODE_FMT_RMAP:
 		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
 		    ip->i_df.if_broot_bytes > 0) {
 			*nbytes += ip->i_df.if_broot_bytes;
@@ -330,6 +331,7 @@ xfs_inode_item_format_data_fork(
 		}
 		break;
 	case XFS_DINODE_FMT_BTREE:
+	case XFS_DINODE_FMT_RMAP:
 		iip->ili_fields &=
 			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
 
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 144198a6b2702..0ec61d17f98e6 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -393,6 +393,7 @@ xlog_recover_inode_commit_pass2(
 
 	if (unlikely(S_ISREG(ldip->di_mode))) {
 		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ldip->di_format != XFS_DINODE_FMT_RMAP) &&
 		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR(
 				"Bad log dinode data fork format for regular file",
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 49528f901d047..5308554fa93ec 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,6 +25,8 @@
 #include "xfs_da_format.h"
 #include "xfs_imeta.h"
 #include "xfs_rtgroup.h"
+#include "xfs_error.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Realtime metadata files are not quite regular files because userspace can't
@@ -35,6 +37,7 @@
  */
 static struct lock_class_key xfs_rbmip_key;
 static struct lock_class_key xfs_rsumip_key;
+static struct lock_class_key xfs_rrmapip_key;
 
 /*
  * Read and return the summary information for a given extent size,
@@ -1589,6 +1592,53 @@ __xfs_rt_iget(
 #define xfs_rt_iget(tp, ino, lockdep_key, ipp) \
 	__xfs_rt_iget((tp), (ino), (lockdep_key), #lockdep_key, (ipp))
 
+/* Load realtime rmap btree inode. */
+STATIC int
+xfs_rtmount_rmapbt(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	error = xfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = xfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_path;
+
+	if (ino == NULLFSINO) {
+		error = -EFSCORRUPTED;
+		goto out_path;
+	}
+
+	error = xfs_rt_iget(tp, ino, &xfs_rrmapip_key, &ip);
+	if (error)
+		goto out_path;
+
+	if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_RMAP)) {
+		error = -EFSCORRUPTED;
+		goto out_rele;
+	}
+
+	rtg->rtg_rmapip = ip;
+	ip = NULL;
+out_rele:
+	if (ip)
+		xfs_imeta_irele(ip);
+out_path:
+	xfs_imeta_free_path(path);
+	return error;
+}
+
 /*
  * Read in the bmbt of an rt metadata inode so that we never have to load them
  * at runtime.  This enables the use of shared ILOCKs for rtbitmap scans.  Use
@@ -1663,12 +1713,24 @@ xfs_rtmount_inodes(
 	for_each_rtgroup(mp, rgno, rtg) {
 		rtg->rtg_blockcount = xfs_rtgroup_block_count(mp,
 							      rtg->rtg_rgno);
+
+		error = xfs_rtmount_rmapbt(rtg, tp);
+		if (error) {
+			xfs_rtgroup_rele(rtg);
+			goto out_rele_rtgroup;
+		}
 	}
 
 	xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
 	xfs_trans_cancel(tp);
 	return 0;
 
+out_rele_rtgroup:
+	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_rmapip)
+			xfs_imeta_irele(rtg->rtg_rmapip);
+		rtg->rtg_rmapip = NULL;
+	}
 out_rele_summary:
 	xfs_imeta_irele(mp->m_rsumip);
 out_rele_bitmap:
@@ -1682,7 +1744,16 @@ void
 xfs_rtunmount_inodes(
 	struct xfs_mount	*mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
 	kmem_free(mp->m_rsum_cache);
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_rmapip)
+			xfs_imeta_irele(rtg->rtg_rmapip);
+		rtg->rtg_rmapip = NULL;
+	}
 	if (mp->m_rbmip)
 		xfs_imeta_irele(mp->m_rbmip);
 	if (mp->m_rsumip)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 10eeceb8b9e7f..8ebdfb216266c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2221,6 +2221,7 @@ TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_RMAP);
 
 DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 	TP_PROTO(struct xfs_inode *ip, int which),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/39] xfs: add metadata reservations for realtime rmap btrees
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:34   ` [PATCH 11/39] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
@ 2023-12-31 21:34   ` Darrick J. Wong
  2023-12-31 21:35   ` [PATCH 13/39] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
                     ` (26 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:34 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reserve some free blocks so that we will always have enough free blocks
in the data volume to handle expansion of the realtime rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrmap_btree.c |   39 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |    2 ++
 fs/xfs/xfs_rtalloc.c             |   21 +++++++++++++++++++-
 3 files changed, 61 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index e60864dd15030..9d2087962c53a 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -609,3 +609,42 @@ xfs_rtrmapbt_create_path(
 	*pathp = path;
 	return 0;
 }
+
+/* Calculate the rtrmap btree size for some records. */
+static unsigned long long
+xfs_rtrmapbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+static unsigned long long
+xfs_rtrmapbt_max_size(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtblocks)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rtrmap_mxr[0] == 0)
+		return 0;
+
+	return xfs_rtrmapbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+xfs_filblks_t
+xfs_rtrmapbt_calc_reserves(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_has_rtrmapbt(mp))
+		return 0;
+
+	/* 1/64th (~1.5%) of the space, and enough for 1 record per block. */
+	return max_t(xfs_filblks_t, mp->m_sb.sb_rgblocks >> 6,
+			xfs_rtrmapbt_max_size(mp, mp->m_sb.sb_rgblocks));
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 29b698660182d..b7950e6d45d40 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -84,4 +84,6 @@ void xfs_rtrmapbt_destroy_cur_cache(void);
 int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 		struct xfs_imeta_path **pathp);
 
+xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5308554fa93ec..2f2a92672de9a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1560,6 +1560,11 @@ void
 xfs_rt_resv_free(
 	struct xfs_mount	*mp)
 {
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	for_each_rtgroup(mp, rgno, rtg)
+		xfs_imeta_resv_free_inode(rtg->rtg_rmapip);
 }
 
 /* Reserve space for rt metadata inodes' space expansion. */
@@ -1567,7 +1572,21 @@ int
 xfs_rt_resv_init(
 	struct xfs_mount	*mp)
 {
-	return 0;
+	struct xfs_rtgroup	*rtg;
+	xfs_filblks_t		ask;
+	xfs_rgnumber_t		rgno;
+	int			error = 0;
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		int		err2;
+
+		ask = xfs_rtrmapbt_calc_reserves(mp);
+		err2 = xfs_imeta_resv_init_inode(rtg->rtg_rmapip, ask);
+		if (err2 && !error)
+			error = err2;
+	}
+
+	return error;
 }
 
 static inline int


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/39] xfs: wire up a new inode fork type for the realtime rmap
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:34   ` [PATCH 12/39] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
@ 2023-12-31 21:35   ` Darrick J. Wong
  2023-12-31 21:35   ` [PATCH 14/39] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
                     ` (25 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the pieces we need to embed the root of the realtime rmap
btree in an inode's data fork, complete with new fork type and
on-disk interpretation functions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h       |    8 +
 fs/xfs/libxfs/xfs_inode_fork.c   |    8 +
 fs/xfs/libxfs/xfs_ondisk.h       |    1 
 fs/xfs/libxfs/xfs_rtrmap_btree.c |  220 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |  112 +++++++++++++++++++
 fs/xfs/xfs_inode_item_recover.c  |   32 +++++-
 6 files changed, 375 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d374240fc58c0..1c1910256a927 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1755,6 +1755,14 @@ typedef __be32 xfs_rmap_ptr_t;
  */
 #define	XFS_RTRMAP_CRC_MAGIC	0x4d415052	/* 'MAPR' */
 
+/*
+ * rtrmap root header, on-disk form only.
+ */
+struct xfs_rtrmap_root {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+};
+
 /* inode-based btree pointer type */
 typedef __be64 xfs_rtrmap_ptr_t;
 
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index eb7e733b30638..ab54a9b27c781 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -27,6 +27,7 @@
 #include "xfs_errortag.h"
 #include "xfs_health.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_rtrmap_btree.h"
 
 struct kmem_cache *xfs_ifork_cache;
 
@@ -267,8 +268,7 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_RMAP:
 			if (!xfs_has_rtrmapbt(ip->i_mount))
 				return -EFSCORRUPTED;
-			ASSERT(0); /* to be implemented later */
-			return -EFSCORRUPTED;
+			return xfs_iformat_rtrmap(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -659,7 +659,9 @@ xfs_iflush_fork(
 		break;
 
 	case XFS_DINODE_FMT_RMAP:
-		ASSERT(0); /* to be implemented later */
+		ASSERT(whichfork == XFS_DATA_FORK);
+		if (iip->ili_fields & brootflag[whichfork])
+			xfs_iflush_rtrmap(ip, dip);
 		break;
 
 	default:
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 897a1b72f8d12..102a3574fc682 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -78,6 +78,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 9d2087962c53a..e2ee2a500ca38 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -85,6 +85,39 @@ xfs_rtrmapbt_get_maxrecs(
 	return cur->bc_mp->m_rtrmap_mxr[level != 0];
 }
 
+/* Calculate number of records in the ondisk realtime rmap btree inode root. */
+unsigned int
+xfs_rtrmapbt_droot_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= sizeof(struct xfs_rtrmap_root);
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_rmap_rec);
+	return blocklen / (2 * sizeof(struct xfs_rmap_key) +
+			sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_rtrmapbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrmapbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrmap_mxr[level != 0];
+	return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
 /*
  * Convert the ondisk record's offset field into the ondisk key's offset field.
  * Fork and bmbt are significant parts of the rmap record key, but written
@@ -377,6 +410,64 @@ xfs_rtrmapbt_keys_contiguous(
 				 be32_to_cpu(key2->rmap.rm_startblock));
 }
 
+/* Move the rtrmap btree root from one incore buffer to another. */
+static void
+xfs_rtrmapbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		level,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_rtrmap_droot_space(src_broot) <=
+			xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs && level > 0) {
+		sptr = xfs_rtrmap_broot_ptr_addr(mp, src_broot, 1, src_bytes);
+		dptr = xfs_rtrmap_broot_ptr_addr(mp, dst_broot, 1, dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys/records that come after it.
+	 */
+	memcpy(dst_broot, src_broot, XFS_RTRMAP_BLOCK_LEN);
+
+	if (!numrecs)
+		return;
+
+	if (level == 0) {
+		sptr = xfs_rtrmap_rec_addr(src_broot, 1);
+		dptr = xfs_rtrmap_rec_addr(dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * sizeof(struct xfs_rmap_rec));
+	} else {
+		sptr = xfs_rtrmap_key_addr(src_broot, 1);
+		dptr = xfs_rtrmap_key_addr(dst_broot, 1);
+		memcpy(dptr, sptr, numrecs * 2 * sizeof(struct xfs_rmap_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_rtrmapbt_iroot_ops = {
+	.maxrecs		= xfs_rtrmapbt_maxrecs,
+	.size			= xfs_rtrmap_broot_space_calc,
+	.move			= xfs_rtrmapbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.rec_len		= sizeof(struct xfs_rmap_rec),
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
@@ -390,6 +481,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.free_block		= xfs_btree_free_imeta_block,
 	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
 	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
+	.get_dmaxrecs		= xfs_rtrmapbt_get_dmaxrecs,
 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
@@ -400,6 +492,7 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
+	.iroot_ops		= &xfs_rtrmapbt_iroot_ops,
 };
 
 /* Initialize a new rt rmap btree cursor. */
@@ -648,3 +741,130 @@ xfs_rtrmapbt_calc_reserves(
 	return max_t(xfs_filblks_t, mp->m_sb.sb_rgblocks >> 6,
 			xfs_rtrmapbt_max_size(mp, mp->m_sb.sb_rgblocks));
 }
+
+/* Convert on-disk form of btree root to in-memory form. */
+STATIC void
+xfs_rtrmapbt_from_disk(
+	struct xfs_inode	*ip,
+	struct xfs_rtrmap_root	*dblock,
+	unsigned int		dblocklen,
+	struct xfs_btree_block	*rblock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_rmap_key	*fkp;
+	__be64			*fpp;
+	struct xfs_rmap_key	*tkp;
+	__be64			*tpp;
+	struct xfs_rmap_rec	*frp;
+	struct xfs_rmap_rec	*trp;
+	unsigned int		rblocklen = xfs_rtrmap_broot_space(mp, dblock);
+	unsigned int		numrecs;
+	unsigned int		maxrecs;
+
+	xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
+
+	rblock->bb_level = dblock->bb_level;
+	rblock->bb_numrecs = dblock->bb_numrecs;
+	numrecs = be16_to_cpu(dblock->bb_numrecs);
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+		tkp = xfs_rtrmap_key_addr(rblock, 1);
+		fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+		tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+		trp = xfs_rtrmap_rec_addr(rblock, 1);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Load a realtime reverse mapping btree root in from disk. */
+int
+xfs_iformat_rtrmap(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	unsigned int		numrecs;
+	unsigned int		level;
+	int			dsize;
+
+	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+	numrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > mp->m_rtrmap_maxlevels ||
+	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize)
+		return -EFSCORRUPTED;
+
+	xfs_iroot_alloc(ip, XFS_DATA_FORK,
+			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
+	xfs_rtrmapbt_from_disk(ip, dfp, dsize, ifp->if_broot);
+	return 0;
+}
+
+/* Convert in-memory form of btree root to on-disk form. */
+void
+xfs_rtrmapbt_to_disk(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*rblock,
+	unsigned int		rblocklen,
+	struct xfs_rtrmap_root	*dblock,
+	unsigned int		dblocklen)
+{
+	struct xfs_rmap_key	*fkp;
+	__be64			*fpp;
+	struct xfs_rmap_key	*tkp;
+	__be64			*tpp;
+	struct xfs_rmap_rec	*frp;
+	struct xfs_rmap_rec	*trp;
+	unsigned int		numrecs;
+	unsigned int		maxrecs;
+
+	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
+	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+	numrecs = be16_to_cpu(rblock->bb_numrecs);
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrmap_key_addr(rblock, 1);
+		tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+		fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrmap_rec_addr(rblock, 1);
+		trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Flush a realtime reverse mapping btree root out to disk. */
+void
+xfs_iflush_rtrmap(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	ASSERT(ifp->if_broot != NULL);
+	ASSERT(ifp->if_broot_bytes > 0);
+	ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
+			xfs_inode_fork_size(ip, XFS_DATA_FORK));
+	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
+			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index b7950e6d45d40..2b26a05f90779 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -27,6 +27,7 @@ void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
 unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
 		bool leaf);
 void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf);
 
 /*
  * Addresses of records, keys, and pointers within an incore rtrmapbt block.
@@ -86,4 +87,115 @@ int xfs_rtrmapbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 
 xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
 
+/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */
+
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_droot_rec_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_droot_key_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index)
+{
+	return (struct xfs_rmap_key *)
+		((char *)(block + 1) +
+		 (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_droot_ptr_addr(
+	struct xfs_rtrmap_root	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrmap_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+		 (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		index,
+	unsigned int		block_size)
+{
+	return xfs_rtrmap_ptr_addr(bb, index,
+			xfs_rtrmapbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrmap_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = XFS_RTRMAP_BLOCK_LEN;
+
+	if (level > 0)
+		return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+					 sizeof(xfs_rtrmap_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb)
+{
+	return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrmap_droot_space_calc(
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = sizeof(struct xfs_rtrmap_root);
+
+	if (level > 0)
+		return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+					 sizeof(xfs_rtrmap_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrmap_droot_space(struct xfs_btree_block *bb)
+{
+	return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
+		unsigned int rblocklen, struct xfs_rtrmap_root *dblock,
+		unsigned int dblocklen);
+void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0ec61d17f98e6..4cf967df28ef1 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -22,6 +22,7 @@
 #include "xfs_log_recover.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_rtrmap_btree.h"
 
 STATIC void
 xlog_recover_inode_ra_pass2(
@@ -266,6 +267,31 @@ xlog_dinode_verify_extent_counts(
 	return 0;
 }
 
+static inline int
+xlog_recover_inode_dbroot(
+	struct xfs_mount	*mp,
+	void			*src,
+	unsigned int		len,
+	struct xfs_dinode	*dip)
+{
+	void			*dfork = XFS_DFORK_DPTR(dip);
+	unsigned int		dsize = XFS_DFORK_DSIZE(dip, mp);
+
+	switch (dip->di_format) {
+	case XFS_DINODE_FMT_BTREE:
+		xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize);
+		break;
+	case XFS_DINODE_FMT_RMAP:
+		xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize);
+		break;
+	default:
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
 STATIC int
 xlog_recover_inode_commit_pass2(
 	struct xlog			*log,
@@ -475,9 +501,9 @@ xlog_recover_inode_commit_pass2(
 		break;
 
 	case XFS_ILOG_DBROOT:
-		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
-				 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
-				 XFS_DFORK_DSIZE(dip, mp));
+		error = xlog_recover_inode_dbroot(mp, src, len, dip);
+		if (error)
+			goto out_release;
 		break;
 
 	default:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/39] xfs: allow inodes with zero extents but nonzero nblocks
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:35   ` [PATCH 13/39] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
@ 2023-12-31 21:35   ` Darrick J. Wong
  2023-12-31 21:35   ` [PATCH 15/39] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
                     ` (24 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Metadata inodes that store btrees will have zero extents and a nonzero
nblocks.  Adjust the inode verifier so that this combination is not
flagged.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |   16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 5ed779cbe6f9f..dae2efec1d5d0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -601,9 +601,6 @@ xfs_dinode_verify(
 	if (mode && nextents + naextents > nblocks)
 		return __this_address;
 
-	if (nextents + naextents == 0 && nblocks != 0)
-		return __this_address;
-
 	if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
 		return __this_address;
 
@@ -707,6 +704,19 @@ xfs_dinode_verify(
 			return fa;
 	}
 
+	/* metadata inodes containing btrees always have zero extent count */
+	if (flags2 & XFS_DIFLAG2_METADIR) {
+		switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
+		case XFS_DINODE_FMT_RMAP:
+			break;
+		default:
+			if (nextents + naextents == 0 && nblocks != 0)
+				return __this_address;
+			break;
+		}
+	} else if (nextents + naextents == 0 && nblocks != 0)
+		return __this_address;
+
 	return NULL;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/39] xfs: use realtime EFI to free extents when realtime rmap is enabled
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 21:35   ` [PATCH 14/39] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
@ 2023-12-31 21:35   ` Darrick J. Wong
  2023-12-31 21:35   ` [PATCH 16/39] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
                     ` (23 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When rmap is enabled, XFS expects a certain order of operations, which
is: 1) remove the file mapping, 2) remove the reverse mapping, and then
3) free the blocks.  xfs_bmap_del_extent_real tries to do 1 and 3 in the
same transaction, which means that when rtrmap is enabled, we have to
use realtime EFIs to maintain the expected order.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ce9c247525a3d..992e492972e76 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5057,7 +5057,6 @@ xfs_bmap_del_extent_real(
 {
 	xfs_fsblock_t		del_endblock=0;	/* first block past del */
 	xfs_fileoff_t		del_endoff;	/* first offset past del */
-	int			do_fx;	/* free extent at end of routine */
 	int			error;	/* error return value */
 	struct xfs_bmbt_irec	got;	/* current extent entry */
 	xfs_fileoff_t		got_endoff;	/* first offset past got */
@@ -5070,6 +5069,8 @@ xfs_bmap_del_extent_real(
 	uint			qfield;	/* quota field to update */
 	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
 	struct xfs_bmbt_irec	old;
+	bool			isrt = xfs_ifork_is_realtime(ip, whichfork);
+	bool			want_free = !(bflags & XFS_BMAPI_REMAP);
 
 	*logflagsp = 0;
 
@@ -5101,18 +5102,24 @@ xfs_bmap_del_extent_real(
 		return -ENOSPC;
 
 	*logflagsp = XFS_ILOG_CORE;
-	if (xfs_ifork_is_realtime(ip, whichfork)) {
-		if (!(bflags & XFS_BMAPI_REMAP)) {
+	if (isrt) {
+		/*
+		 * Historically, we did not use EFIs to free realtime extents.
+		 * However, when reverse mapping is enabled, we must maintain
+		 * the same order of operations as the data device, which is:
+		 * Remove the file mapping, remove the reverse mapping, and
+		 * then free the blocks.  This means that we must delay the
+		 * freeing until after we've scheduled the rmap update.
+		 */
+		if (want_free && !xfs_has_rtrmapbt(mp)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
 			if (error)
 				return error;
+			want_free = false;
 		}
-
-		do_fx = 0;
 		qfield = XFS_TRANS_DQ_RTBCOUNT;
 	} else {
-		do_fx = 1;
 		qfield = XFS_TRANS_DQ_BCOUNT;
 	}
 	nblks = del->br_blockcount;
@@ -5262,7 +5269,7 @@ xfs_bmap_del_extent_real(
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
-	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+	if (want_free) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
@@ -5271,6 +5278,8 @@ xfs_bmap_del_extent_real(
 			if ((bflags & XFS_BMAPI_NODISCARD) ||
 			    del->br_state == XFS_EXT_UNWRITTEN)
 				efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+			if (isrt)
+				efi_flags |= XFS_FREE_EXTENT_REALTIME;
 
 			error = xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/39] xfs: wire up rmap map and unmap to the realtime rmapbt
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 21:35   ` [PATCH 15/39] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
@ 2023-12-31 21:35   ` Darrick J. Wong
  2023-12-31 21:36   ` [PATCH 17/39] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
                     ` (22 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:35 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Connect the map and unmap reverse-mapping operations to the realtime
rmapbt via the deferred operation callbacks.  This enables us to
perform rmap operations against the correct btree.

[Contains a minor bugfix from hch]

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c    |   37 ++++++++++++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_rtgroup.c |    9 +++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |    5 ++++-
 3 files changed, 47 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index daef2d67eb7a0..8766805ed1343 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -26,6 +26,7 @@
 #include "xfs_health.h"
 #include "xfs_rmap_item.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -2692,9 +2693,39 @@ xfs_rtrmap_finish_one(
 	struct xfs_rmap_intent		*ri,
 	struct xfs_btree_cur		**pcur)
 {
-	/* coming in a subsequent patch */
-	ASSERT(0);
-	return -EFSCORRUPTED;
+	struct xfs_owner_info		oinfo;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur = *pcur;
+	xfs_rgnumber_t			rgno;
+	xfs_rgblock_t			bno;
+	bool				unwritten;
+
+	trace_xfs_rmap_deferred(mp, ri);
+
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+		return -EIO;
+
+	/*
+	 * If we haven't gotten a cursor or the cursor rtgroup doesn't match
+	 * the startblock, get one now.
+	 */
+	if (rcur != NULL && rcur->bc_ino.rtg != ri->ri_rtg) {
+		xfs_btree_del_cursor(rcur, 0);
+		rcur = NULL;
+	}
+	if (rcur == NULL) {
+		xfs_rtgroup_lock(tp, ri->ri_rtg, XFS_RTGLOCK_RMAP);
+		*pcur = rcur = xfs_rtrmapbt_init_cursor(mp, tp, ri->ri_rtg,
+				ri->ri_rtg->rtg_rmapip);
+	}
+
+	xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
+			ri->ri_bmap.br_startoff);
+	unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
+	bno = xfs_rtb_to_rgbno(mp, ri->ri_bmap.br_startblock, &rgno);
+
+	return __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, &oinfo, unwritten);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 7a45a16cbbab8..6ffe1c21a1ea4 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -552,6 +552,12 @@ xfs_rtgroup_lock(
 		xfs_rtbitmap_lock(tp, rtg->rtg_mount);
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
 		xfs_rtbitmap_lock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+
+	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip) {
+		xfs_ilock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+		if (tp)
+			xfs_trans_ijoin(tp, rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+	}
 }
 
 /* Unlock metadata inodes associated with this rt group. */
@@ -564,6 +570,9 @@ xfs_rtgroup_unlock(
 	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
 	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip)
+		xfs_iunlock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
+
 	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
 		xfs_rtbitmap_unlock(rtg->rtg_mount);
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 77503bda35563..559a5135820d3 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -231,9 +231,12 @@ int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 #define XFS_RTGLOCK_BITMAP		(1U << 0)
 /* Lock the rt bitmap inode in shared mode */
 #define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
+/* Lock the rt rmap inode in exclusive mode */
+#define XFS_RTGLOCK_RMAP		(1U << 2)
 
 #define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
-				 XFS_RTGLOCK_BITMAP_SHARED)
+				 XFS_RTGLOCK_BITMAP_SHARED | \
+				 XFS_RTGLOCK_RMAP)
 
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/39] xfs: create routine to allocate and initialize a realtime rmap btree inode
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 21:35   ` [PATCH 16/39] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
@ 2023-12-31 21:36   ` Darrick J. Wong
  2023-12-31 21:36   ` [PATCH 18/39] xfs: rearrange xfs_fsmap.c a little bit Darrick J. Wong
                     ` (21 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:36 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a library routine to allocate and initialize an empty realtime
rmapbt inode.  We'll use this for mkfs and repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrmap_btree.c |   34 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |    4 ++++
 2 files changed, 38 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index e2ee2a500ca38..b824562bdc2ec 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -868,3 +868,37 @@ xfs_iflush_rtrmap(
 	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
 			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
 }
+
+/*
+ * Create a realtime rmap btree inode.
+ *
+ * Regardless of the return value, the caller must clean up @upd.  If a new
+ * inode is returned through @*ipp, the caller must finish setting up the incore
+ * inode and release it.
+ */
+int
+xfs_rtrmapbt_create(
+	struct xfs_imeta_update	*upd,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = upd->mp;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	error = xfs_imeta_create(upd, S_IFREG, ipp);
+	if (error)
+		return error;
+
+	ifp = xfs_ifork_ptr(upd->ip, XFS_DATA_FORK);
+	ifp->if_format = XFS_DINODE_FMT_RMAP;
+	ASSERT(ifp->if_broot_bytes == 0);
+	ASSERT(ifp->if_bytes == 0);
+
+	/* Initialize the empty incore btree root. */
+	xfs_iroot_alloc(upd->ip, XFS_DATA_FORK,
+			xfs_rtrmap_broot_space_calc(mp, 0, 0));
+	xfs_btree_init_block(mp, ifp->if_broot, &xfs_rtrmapbt_ops, 0, 0,
+			upd->ip->i_ino);
+	xfs_trans_log_inode(upd->tp, upd->ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 2b26a05f90779..108ab8c0aea44 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -198,4 +198,8 @@ void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
 		unsigned int dblocklen);
 void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
 
+struct xfs_imeta_update;
+
+int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/39] xfs: rearrange xfs_fsmap.c a little bit
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 21:36   ` [PATCH 17/39] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
@ 2023-12-31 21:36   ` Darrick J. Wong
  2023-12-31 21:36   ` [PATCH 19/39] xfs: wire up getfsmap to the realtime reverse mapping btree Darrick J. Wong
                     ` (20 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:36 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The order of the functions in this file has gotten a little confusing
over the years.  Specifically, the two data device implementations
(bnobt and rmapbt) could be adjacent in the source code instead of split
in two by the logdev and rtdev fsmap implementations.  We're about to
add more functionality to this file, so rearrange things now.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_fsmap.c |  270 ++++++++++++++++++++++++++--------------------------
 1 file changed, 135 insertions(+), 135 deletions(-)


diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index c49a5b01c3e0c..34769cb35c908 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -427,141 +427,6 @@ xfs_getfsmap_set_irec_flags(
 		irec->rm_flags |= XFS_RMAP_UNWRITTEN;
 }
 
-/* Execute a getfsmap query against the log device. */
-STATIC int
-xfs_getfsmap_logdev(
-	struct xfs_trans		*tp,
-	const struct xfs_fsmap		*keys,
-	struct xfs_getfsmap_info	*info)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_rmap_irec		rmap;
-	xfs_daddr_t			rec_daddr, len_daddr;
-	xfs_fsblock_t			start_fsb, end_fsb;
-	uint64_t			eofs;
-
-	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
-	if (keys[0].fmr_physical >= eofs)
-		return 0;
-	start_fsb = XFS_BB_TO_FSBT(mp,
-				keys[0].fmr_physical + keys[0].fmr_length);
-	end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
-
-	/* Adjust the low key if we are continuing from where we left off. */
-	if (keys[0].fmr_length > 0)
-		info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
-
-	trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
-	trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
-
-	if (start_fsb > 0)
-		return 0;
-
-	/* Fabricate an rmap entry for the external log device. */
-	rmap.rm_startblock = 0;
-	rmap.rm_blockcount = mp->m_sb.sb_logblocks;
-	rmap.rm_owner = XFS_RMAP_OWN_LOG;
-	rmap.rm_offset = 0;
-	rmap.rm_flags = 0;
-
-	rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
-	len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
-	return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
-}
-
-#ifdef CONFIG_XFS_RT
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
-	struct xfs_mount		*mp,
-	struct xfs_trans		*tp,
-	const struct xfs_rtalloc_rec	*rec,
-	void				*priv)
-{
-	struct xfs_getfsmap_info	*info = priv;
-	struct xfs_rmap_irec		irec;
-	xfs_rtblock_t			rtbno;
-	xfs_daddr_t			rec_daddr, len_daddr;
-
-	rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
-	rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
-	irec.rm_startblock = rtbno;
-
-	rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
-	len_daddr = XFS_FSB_TO_BB(mp, rtbno);
-	irec.rm_blockcount = rtbno;
-
-	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
-	irec.rm_offset = 0;
-	irec.rm_flags = 0;
-
-	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
-}
-
-/* Execute a getfsmap query against the realtime device rtbitmap. */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap(
-	struct xfs_trans		*tp,
-	const struct xfs_fsmap		*keys,
-	struct xfs_getfsmap_info	*info)
-{
-
-	struct xfs_rtalloc_rec		alow = { 0 };
-	struct xfs_rtalloc_rec		ahigh = { 0 };
-	struct xfs_mount		*mp = tp->t_mountp;
-	xfs_rtblock_t			start_rtb;
-	xfs_rtblock_t			end_rtb;
-	uint64_t			eofs;
-	int				error;
-
-	eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
-	if (keys[0].fmr_physical >= eofs)
-		return 0;
-	start_rtb = XFS_BB_TO_FSBT(mp,
-				keys[0].fmr_physical + keys[0].fmr_length);
-	end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
-
-	info->missing_owner = XFS_FMR_OWN_UNKNOWN;
-
-	/* Adjust the low key if we are continuing from where we left off. */
-	if (keys[0].fmr_length > 0) {
-		info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
-		if (info->low_daddr >= eofs)
-			return 0;
-	}
-
-	trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
-	trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
-
-	xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
-
-	/*
-	 * Set up query parameters to return free rtextents covering the range
-	 * we want.
-	 */
-	alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
-	ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
-	error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
-			xfs_getfsmap_rtdev_rtbitmap_helper, info);
-	if (error)
-		goto err;
-
-	/*
-	 * Report any gaps at the end of the rtbitmap by simulating a null
-	 * rmap starting at the block after the end of the query range.
-	 */
-	info->last = true;
-	ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
-
-	error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
-	if (error)
-		goto err;
-err:
-	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
-	return error;
-}
-#endif /* CONFIG_XFS_RT */
-
 static inline bool
 rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
 {
@@ -786,6 +651,141 @@ xfs_getfsmap_datadev_bnobt(
 			xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
 }
 
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+	struct xfs_trans		*tp,
+	const struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rmap_irec		rmap;
+	xfs_daddr_t			rec_daddr, len_daddr;
+	xfs_fsblock_t			start_fsb, end_fsb;
+	uint64_t			eofs;
+
+	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	start_fsb = XFS_BB_TO_FSBT(mp,
+				keys[0].fmr_physical + keys[0].fmr_length);
+	end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+	/* Adjust the low key if we are continuing from where we left off. */
+	if (keys[0].fmr_length > 0)
+		info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
+
+	trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
+	trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
+
+	if (start_fsb > 0)
+		return 0;
+
+	/* Fabricate an rmap entry for the external log device. */
+	rmap.rm_startblock = 0;
+	rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+	rmap.rm_owner = XFS_RMAP_OWN_LOG;
+	rmap.rm_offset = 0;
+	rmap.rm_flags = 0;
+
+	rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
+	len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
+	return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
+}
+
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	const struct xfs_rtalloc_rec	*rec,
+	void				*priv)
+{
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_rmap_irec		irec;
+	xfs_rtblock_t			rtbno;
+	xfs_daddr_t			rec_daddr, len_daddr;
+
+	rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+	rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
+	irec.rm_startblock = rtbno;
+
+	rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+	len_daddr = XFS_FSB_TO_BB(mp, rtbno);
+	irec.rm_blockcount = rtbno;
+
+	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
+	irec.rm_offset = 0;
+	irec.rm_flags = 0;
+
+	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+	struct xfs_trans		*tp,
+	const struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+
+	struct xfs_rtalloc_rec		alow = { 0 };
+	struct xfs_rtalloc_rec		ahigh = { 0 };
+	struct xfs_mount		*mp = tp->t_mountp;
+	xfs_rtblock_t			start_rtb;
+	xfs_rtblock_t			end_rtb;
+	uint64_t			eofs;
+	int				error;
+
+	eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	start_rtb = XFS_BB_TO_FSBT(mp,
+				keys[0].fmr_physical + keys[0].fmr_length);
+	end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+	info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+
+	/* Adjust the low key if we are continuing from where we left off. */
+	if (keys[0].fmr_length > 0) {
+		info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
+		if (info->low_daddr >= eofs)
+			return 0;
+	}
+
+	trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
+	trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
+
+	xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
+
+	/*
+	 * Set up query parameters to return free rtextents covering the range
+	 * we want.
+	 */
+	alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
+	ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
+	error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
+			xfs_getfsmap_rtdev_rtbitmap_helper, info);
+	if (error)
+		goto err;
+
+	/*
+	 * Report any gaps at the end of the rtbitmap by simulating a null
+	 * rmap starting at the block after the end of the query range.
+	 */
+	info->last = true;
+	ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
+
+	error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
+	if (error)
+		goto err;
+err:
+	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+	return error;
+}
+#endif /* CONFIG_XFS_RT */
+
 /* Do we recognize the device? */
 STATIC bool
 xfs_getfsmap_is_valid_device(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/39] xfs: wire up getfsmap to the realtime reverse mapping btree
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 21:36   ` [PATCH 18/39] xfs: rearrange xfs_fsmap.c a little bit Darrick J. Wong
@ 2023-12-31 21:36   ` Darrick J. Wong
  2023-12-31 21:37   ` [PATCH 20/39] xfs: check that the rtrmapbt maxlevels doesn't increase when growing fs Darrick J. Wong
                     ` (19 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:36 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Connect the getfsmap ioctl to the realtime rmapbt.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_fsmap.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 34769cb35c908..b0eabc76eb28a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -25,6 +25,8 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 
 /* Convert an xfs_fsmap to an fsmap. */
 static void
@@ -158,6 +160,7 @@ struct xfs_getfsmap_info {
 	struct xfs_fsmap_head	*head;
 	struct fsmap		*fsmap_recs;	/* mapping records */
 	struct xfs_buf		*agf_bp;	/* AGF, for refcount queries */
+	struct xfs_rtgroup	*rtg;		/* rt group info, if needed */
 	struct xfs_perag	*pag;		/* AG info, if applicable */
 	xfs_daddr_t		next_daddr;	/* next daddr we expect */
 	/* daddr of low fsmap key when we're using the rtbitmap */
@@ -338,8 +341,14 @@ xfs_getfsmap_helper(
 	if (info->head->fmh_entries >= info->head->fmh_count)
 		return -ECANCELED;
 
-	trace_xfs_fsmap_mapping(mp, info->dev,
-			info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec);
+	if (info->pag)
+		trace_xfs_fsmap_mapping(mp, info->dev, info->pag->pag_agno,
+				rec);
+	else if (info->rtg)
+		trace_xfs_fsmap_mapping(mp, info->dev, info->rtg->rtg_rgno,
+				rec);
+	else
+		trace_xfs_fsmap_mapping(mp, info->dev, NULLAGNUMBER, rec);
 
 	fmr.fmr_device = info->dev;
 	fmr.fmr_physical = rec_daddr;
@@ -784,6 +793,181 @@ xfs_getfsmap_rtdev_rtbitmap(
 	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
 	return error;
 }
+
+/* Transform a realtime rmapbt record into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt_helper(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_getfsmap_info	*info = priv;
+	xfs_rtblock_t			rtbno;
+	xfs_daddr_t			rec_daddr;
+
+	rtbno = xfs_rgbno_to_rtb(mp, cur->bc_ino.rtg->rtg_rgno,
+			rec->rm_startblock);
+	rec_daddr = xfs_rtb_to_daddr(mp, rtbno);
+
+	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0);
+}
+
+/* Actually query the rtrmap btree. */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_btree_cur		**curpp)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	/* Report any gap at the end of the last rtgroup. */
+	if (info->last)
+		return xfs_getfsmap_rtdev_rmapbt_helper(*curpp, &info->high,
+				info);
+
+	/* Query the rtrmapbt */
+	xfs_rtgroup_lock(NULL, info->rtg, XFS_RTGLOCK_RMAP);
+	*curpp = xfs_rtrmapbt_init_cursor(mp, tp, info->rtg,
+			info->rtg->rtg_rmapip);
+	return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+			xfs_getfsmap_rtdev_rmapbt_helper, info);
+}
+
+/* Execute a getfsmap query against the realtime device rmapbt. */
+STATIC int
+xfs_getfsmap_rtdev_rmapbt(
+	struct xfs_trans		*tp,
+	const struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rtgroup		*rtg;
+	struct xfs_btree_cur		*bt_cur = NULL;
+	xfs_rtblock_t			start_rtb;
+	xfs_rtblock_t			end_rtb;
+	xfs_rgnumber_t			start_rg, end_rg;
+	uint64_t			eofs;
+	int				error = 0;
+
+	eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	start_rtb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+	end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+	info->missing_owner = XFS_FMR_OWN_FREE;
+
+	/*
+	 * Convert the fsmap low/high keys to rtgroup based keys.  Initialize
+	 * low to the fsmap low key and max out the high key to the end
+	 * of the rtgroup.
+	 */
+	info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+	error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+	if (error)
+		return error;
+	info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
+	xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+	/* Adjust the low key if we are continuing from where we left off. */
+	if (info->low.rm_blockcount == 0) {
+		/* No previous record from which to continue */
+	} else if (rmap_not_shareable(mp, &info->low)) {
+		/* Last record seen was an unshareable extent */
+		info->low.rm_owner = 0;
+		info->low.rm_offset = 0;
+
+		start_rtb += info->low.rm_blockcount;
+		if (xfs_rtb_to_daddr(mp, start_rtb) >= eofs)
+			return 0;
+	} else {
+		/* Last record seen was a shareable file data extent */
+		info->low.rm_offset += info->low.rm_blockcount;
+	}
+	info->low.rm_startblock = xfs_rtb_to_rgbno(mp, start_rtb, &start_rg);
+
+	info->high.rm_startblock = -1U;
+	info->high.rm_owner = ULLONG_MAX;
+	info->high.rm_offset = ULLONG_MAX;
+	info->high.rm_blockcount = 0;
+	info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+	end_rg = xfs_rtb_to_rgno(mp, end_rtb);
+
+	for_each_rtgroup_range(mp, start_rg, end_rg, rtg) {
+		/*
+		 * Set the rtgroup high key from the fsmap high key if this
+		 * is the last rtgroup that we're querying.
+		 */
+		info->rtg = rtg;
+		if (rtg->rtg_rgno == end_rg) {
+			xfs_rgnumber_t	junk;
+
+			info->high.rm_startblock = xfs_rtb_to_rgbno(mp,
+					end_rtb, &junk);
+			info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+					keys[1].fmr_offset);
+			error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+			if (error)
+				break;
+			xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+		}
+
+		if (bt_cur) {
+			xfs_rtgroup_unlock(bt_cur->bc_ino.rtg,
+					XFS_RTGLOCK_RMAP);
+			xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+			bt_cur = NULL;
+		}
+
+		trace_xfs_fsmap_low_key(mp, info->dev, rtg->rtg_rgno,
+				&info->low);
+		trace_xfs_fsmap_high_key(mp, info->dev, rtg->rtg_rgno,
+				&info->high);
+
+		error = xfs_getfsmap_rtdev_rmapbt_query(tp, info, &bt_cur);
+		if (error)
+			break;
+
+		/*
+		 * Set the rtgroup low key to the start of the rtgroup prior to
+		 * moving on to the next rtgroup.
+		 */
+		if (rtg->rtg_rgno == start_rg)
+			memset(&info->low, 0, sizeof(info->low));
+
+		/*
+		 * If this is the last rtgroup, report any gap at the end of it
+		 * before we drop the reference to the perag when the loop
+		 * terminates.
+		 */
+		if (rtg->rtg_rgno == end_rg) {
+			info->last = true;
+			error = xfs_getfsmap_rtdev_rmapbt_query(tp, info,
+					&bt_cur);
+			if (error)
+				break;
+		}
+		info->rtg = NULL;
+	}
+
+	if (bt_cur) {
+		xfs_rtgroup_unlock(bt_cur->bc_ino.rtg, XFS_RTGLOCK_RMAP);
+		xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+							 XFS_BTREE_NOERROR);
+	}
+	if (info->rtg) {
+		xfs_rtgroup_rele(info->rtg);
+		info->rtg = NULL;
+	} else if (rtg) {
+		/* loop termination case */
+		xfs_rtgroup_rele(rtg);
+	}
+
+	return error;
+}
 #endif /* CONFIG_XFS_RT */
 
 /* Do we recognize the device? */
@@ -916,7 +1100,10 @@ xfs_getfsmap(
 #ifdef CONFIG_XFS_RT
 	if (mp->m_rtdev_targp) {
 		handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
-		handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+		if (use_rmap)
+			handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
+		else
+			handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
 	}
 #endif /* CONFIG_XFS_RT */
 
@@ -983,6 +1170,7 @@ xfs_getfsmap(
 		info.dev = handlers[i].dev;
 		info.last = false;
 		info.pag = NULL;
+		info.rtg = NULL;
 		info.low_daddr = -1ULL;
 		info.low.rm_blockcount = 0;
 		error = handlers[i].fn(tp, dkeys, &info);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/39] xfs: check that the rtrmapbt maxlevels doesn't increase when growing fs
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 21:36   ` [PATCH 19/39] xfs: wire up getfsmap to the realtime reverse mapping btree Darrick J. Wong
@ 2023-12-31 21:37   ` Darrick J. Wong
  2023-12-31 21:37   ` [PATCH 21/39] xfs: add realtime rmap btree when adding rt volume Darrick J. Wong
                     ` (18 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The size of filesystem transaction reservations depends on the maximum
height (maxlevels) of the realtime btrees.  Since we don't want a grow
operation to increase the reservation size enough that we'll fail the
minimum log size checks on the next mount, constrain growfs operations
if they would cause an increase in those maxlevels.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_fsops.c   |   12 +++++++++
 fs/xfs/xfs_rtalloc.c |   64 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_rtalloc.h |    5 ++++
 fs/xfs/xfs_trace.h   |   21 ++++++++++++++++
 4 files changed, 101 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 99f9f5c8d9b6e..e78ee67b9dd12 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -23,6 +23,7 @@
 #include "xfs_trace.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtalloc.h"
+#include "xfs_rtrmap_btree.h"
 
 /*
  * Write new AG headers to disk. Non-transactional, but need to be
@@ -115,6 +116,13 @@ xfs_growfs_data_private(
 		xfs_buf_relse(bp);
 	}
 
+	/* Make sure the new fs size won't cause problems with the log. */
+	error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks,
+			mp->m_sb.sb_rextsize, mp->m_sb.sb_rextents,
+			mp->m_sb.sb_rbmblocks, mp->m_sb.sb_rextslog);
+	if (error)
+		return error;
+
 	nb_div = nb;
 	nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
 	if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS)
@@ -227,7 +235,11 @@ xfs_growfs_data_private(
 		error = xfs_fs_reserve_ag_blocks(mp);
 		if (error == -ENOSPC)
 			error = 0;
+
+		/* Compute new maxlevels for rt btrees. */
+		xfs_rtrmapbt_compute_maxlevels(mp);
 	}
+
 	return error;
 
 out_trans_cancel:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 2f2a92672de9a..ac6580bda2052 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -27,6 +27,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_error.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_trace.h"
 
 /*
  * Realtime metadata files are not quite regular files because userspace can't
@@ -1016,6 +1017,57 @@ xfs_growfs_rt_init_primary(
 	return 0;
 }
 
+/*
+ * Check that changes to the realtime geometry won't affect the minimum
+ * log size, which would cause the fs to become unusable.
+ */
+int
+xfs_growfs_check_rtgeom(
+	const struct xfs_mount	*mp,
+	xfs_rfsblock_t		dblocks,
+	xfs_rfsblock_t		rblocks,
+	xfs_agblock_t		rextsize,
+	xfs_rtblock_t		rextents,
+	xfs_extlen_t		rbmblocks,
+	uint8_t			rextslog)
+{
+	struct xfs_mount	*fake_mp;
+	int			min_logfsbs;
+
+	fake_mp = kmem_alloc(sizeof(struct xfs_mount), KM_MAYFAIL);
+	if (!fake_mp)
+		return -ENOMEM;
+
+	/*
+	 * Create a dummy xfs_mount with the new rt geometry, and compute the
+	 * new minimum log size.  This ensures that the log is big enough to
+	 * handle the larger transactions that we could start sending.
+	 */
+	memcpy(fake_mp, mp, sizeof(struct xfs_mount));
+
+	fake_mp->m_sb.sb_dblocks = dblocks;
+	fake_mp->m_sb.sb_rblocks = rblocks;
+	fake_mp->m_sb.sb_rextents = rextents;
+	fake_mp->m_sb.sb_rextsize = rextsize;
+	fake_mp->m_sb.sb_rbmblocks = rbmblocks;
+	fake_mp->m_sb.sb_rextslog = rextslog;
+	if (rblocks > 0)
+		fake_mp->m_features |= XFS_FEAT_REALTIME;
+
+	xfs_rtrmapbt_compute_maxlevels(fake_mp);
+
+	xfs_trans_resv_calc(fake_mp, M_RES(fake_mp));
+	min_logfsbs = xfs_log_calc_minimum_size(fake_mp);
+	trace_xfs_growfs_check_rtgeom(mp, min_logfsbs);
+
+	kmem_free(fake_mp);
+
+	if (mp->m_sb.sb_logblocks < min_logfsbs)
+		return -ENOSPC;
+
+	return 0;
+}
+
 /*
  * Grow the realtime area of the filesystem.
  */
@@ -1107,6 +1159,12 @@ xfs_growfs_rt(
 	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
 		return -EINVAL;
 
+	/* Make sure the new fs size won't cause problems with the log. */
+	error = xfs_growfs_check_rtgeom(mp, mp->m_sb.sb_dblocks, nrblocks,
+			in->extsize, nrextents, nrbmblocks, nrextslog);
+	if (error)
+		return error;
+
 	/* Allocate the new rt group structures */
 	if (xfs_has_rtgroups(mp)) {
 		uint64_t	new_rgcount;
@@ -1301,8 +1359,12 @@ xfs_growfs_rt(
 			rtg->rtg_blockcount = xfs_rtgroup_block_count(mp,
 								rtg->rtg_rgno);
 
-		/* Ensure the mount RT feature flag is now set. */
+		/*
+		 * Ensure the mount RT feature flag is now set, and compute new
+		 * maxlevels for rt btrees.
+		 */
 		mp->m_features |= XFS_FEAT_REALTIME;
+		xfs_rtrmapbt_compute_maxlevels(mp);
 	}
 	if (error)
 		goto out_free;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index c01ca192646a9..8a7b6cfa13cf0 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -80,6 +80,10 @@ xfs_growfs_rt(
 	xfs_growfs_rt_t		*in);	/* user supplied growfs struct */
 
 int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
+int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, xfs_rfsblock_t dblocks,
+		xfs_rfsblock_t rblocks, xfs_agblock_t rextsize,
+		xfs_rtblock_t rextents, xfs_extlen_t rbmblocks,
+		uint8_t rextslog);
 #else
 # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)	(-ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)			(-ENOSYS)
@@ -101,6 +105,7 @@ xfs_rtmount_init(
 # define xfs_rtunmount_inodes(m)
 # define xfs_rt_resv_free(mp)				((void)0)
 # define xfs_rt_resv_init(mp)				(0)
+# define xfs_growfs_check_rtgeom(mp, d, r, rs, rx, rb, rl)	(0)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8ebdfb216266c..2dc991730c303 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -5406,6 +5406,27 @@ DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent);
 DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical);
 DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error);
 
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xfs_growfs_check_rtgeom,
+	TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs),
+	TP_ARGS(mp, min_logfsbs),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, logblocks)
+		__field(unsigned int, min_logfsbs)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->logblocks = mp->m_sb.sb_logblocks;
+		__entry->min_logfsbs = min_logfsbs;
+	),
+	TP_printk("dev %d:%d logblocks %u min_logfsbs %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->logblocks,
+		  __entry->min_logfsbs)
+);
+#endif /* CONFIG_XFS_RT */
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/39] xfs: add realtime rmap btree when adding rt volume
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 21:37   ` [PATCH 20/39] xfs: check that the rtrmapbt maxlevels doesn't increase when growing fs Darrick J. Wong
@ 2023-12-31 21:37   ` Darrick J. Wong
  2023-12-31 21:37   ` [PATCH 22/39] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
                     ` (17 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If we're adding enough space to the realtime section to require the
creation of new realtime groups, create the rt rmap btree inode before
we start adding the space.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rtalloc.c |   88 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ac6580bda2052..f6b23439b674d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -24,8 +24,11 @@
 #include "xfs_health.h"
 #include "xfs_da_format.h"
 #include "xfs_imeta.h"
+#include "xfs_imeta_utils.h"
 #include "xfs_rtgroup.h"
 #include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
 
@@ -1017,6 +1020,74 @@ xfs_growfs_rt_init_primary(
 	return 0;
 }
 
+/* Add a metadata inode for a realtime rmap btree. */
+static int
+xfs_growfsrt_create_rtrmap(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_update	upd;
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= 0,
+		.rm_blockcount	= mp->m_sb.sb_rextsize,
+		.rm_owner	= XFS_RMAP_OWN_FS,
+		.rm_offset	= 0,
+		.rm_flags	= 0,
+	};
+	struct xfs_btree_cur	*cur;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp) || rtg->rtg_rmapip)
+		return 0;
+
+	error = xfs_rtrmapbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = xfs_imeta_ensure_dirpath(mp, path);
+	if (error)
+		goto out_path;
+
+	error = xfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		goto out_path;
+
+	error = xfs_rtrmapbt_create(&upd, &ip);
+	if (error)
+		goto out_cancel;
+
+	lockdep_set_class(&ip->i_lock.mr_lock, &xfs_rrmapip_key);
+
+	/* Rmap the rtgroup superblock; this had better fit in the data fork. */
+	cur = xfs_rtrmapbt_init_cursor(mp, upd.tp, rtg, ip);
+	error = xfs_rmap_map_raw(cur, &rmap);
+	xfs_btree_del_cursor(cur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_imeta_commit_update(&upd);
+	if (error)
+		goto out_path;
+
+	xfs_imeta_free_path(path);
+	xfs_finish_inode_setup(ip);
+	rtg->rtg_rmapip = ip;
+	return 0;
+
+out_cancel:
+	xfs_imeta_cancel_update(&upd, error);
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		xfs_irele(ip);
+	}
+out_path:
+	xfs_imeta_free_path(path);
+	return error;
+}
+
 /*
  * Check that changes to the realtime geometry won't affect the minimum
  * log size, which would cause the fs to become unusable.
@@ -1122,7 +1193,9 @@ xfs_growfs_rt(
 		return -EINVAL;
 
 	/* Unsupported realtime features. */
-	if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
+	if (!xfs_has_rtgroups(mp) && xfs_has_rmapbt(mp))
+		return -EOPNOTSUPP;
+	if (xfs_has_reflink(mp) || xfs_has_quota(mp))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
@@ -1261,10 +1334,21 @@ xfs_growfs_rt(
 		/* recompute growfsrt reservation from new rsumsize */
 		xfs_trans_resv_calc(nmp, &nmp->m_resv);
 
-		if (xfs_has_rtgroups(mp))
+		if (xfs_has_rtgroups(mp)) {
+			xfs_rgnumber_t	rgno = last_rgno;
+
 			nsbp->sb_rgcount = howmany_64(nsbp->sb_rblocks,
 						      nsbp->sb_rgblocks);
 
+			for_each_rtgroup_range(mp, rgno, nsbp->sb_rgcount, rtg) {
+				error = xfs_growfsrt_create_rtrmap(rtg);
+				if (error) {
+					xfs_rtgroup_rele(rtg);
+					break;
+				}
+			}
+		}
+
 		/*
 		 * Start a transaction, get the log reservation.
 		 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/39] xfs: report realtime rmap btree corruption errors to the health system
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 21:37   ` [PATCH 21/39] xfs: add realtime rmap btree when adding rt volume Darrick J. Wong
@ 2023-12-31 21:37   ` Darrick J. Wong
  2023-12-31 21:37   ` [PATCH 23/39] xfs: fix scrub tracepoints when inode-rooted btrees are involved Darrick J. Wong
                     ` (16 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we encounter corrupt realtime rmap btree blocks, we should
report that to the health monitoring system for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs_staging.h   |    1 +
 fs/xfs/libxfs/xfs_health.h       |    4 +++-
 fs/xfs/libxfs/xfs_inode_fork.c   |    4 +++-
 fs/xfs/libxfs/xfs_rtrmap_btree.c |    5 ++++-
 fs/xfs/xfs_health.c              |    4 ++++
 fs/xfs/xfs_rtalloc.c             |    2 ++
 6 files changed, 17 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs_staging.h b/fs/xfs/libxfs/xfs_fs_staging.h
index 1f57331487791..9d5d6af62b616 100644
--- a/fs/xfs/libxfs/xfs_fs_staging.h
+++ b/fs/xfs/libxfs/xfs_fs_staging.h
@@ -216,6 +216,7 @@ struct xfs_rtgroup_geometry {
 };
 #define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
 #define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
+#define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1 << 2)  /* reverse mappings */
 
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
 
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 1e9938a417b42..aeeb62769773f 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -68,6 +68,7 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
 #define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
+#define XFS_SICK_RT_RMAPBT	(1 << 3)  /* reverse mappings */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -113,7 +114,8 @@ struct xfs_rtgroup;
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY | \
-				 XFS_SICK_RT_SUPER)
+				 XFS_SICK_RT_SUPER | \
+				 XFS_SICK_RT_RMAPBT)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index ab54a9b27c781..e7ab04aea2db6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -266,8 +266,10 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_BTREE:
 			return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 		case XFS_DINODE_FMT_RMAP:
-			if (!xfs_has_rtrmapbt(ip->i_mount))
+			if (!xfs_has_rtrmapbt(ip->i_mount)) {
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 				return -EFSCORRUPTED;
+			}
 			return xfs_iformat_rtrmap(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index b824562bdc2ec..355c50196e986 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -27,6 +27,7 @@
 #include "xfs_extent_busy.h"
 #include "xfs_rtgroup.h"
 #include "xfs_bmap.h"
+#include "xfs_health.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -800,8 +801,10 @@ xfs_iformat_rtrmap(
 	level = be16_to_cpu(dfp->bb_level);
 
 	if (level > mp->m_rtrmap_maxlevels ||
-	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize)
+	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
+	}
 
 	xfs_iroot_alloc(ip, XFS_DATA_FORK,
 			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index b53ef95a37a54..ed0767a6fa15a 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -532,6 +532,7 @@ xfs_ag_geom_health(
 static const struct ioctl_sick_map rtgroup_map[] = {
 	{ XFS_SICK_RT_SUPER,	XFS_RTGROUP_GEOM_SICK_SUPER },
 	{ XFS_SICK_RT_BITMAP,	XFS_RTGROUP_GEOM_SICK_BITMAP },
+	{ XFS_SICK_RT_RMAPBT,	XFS_RTGROUP_GEOM_SICK_RMAPBT },
 	{ 0, 0 },
 };
 
@@ -636,6 +637,9 @@ xfs_btree_mark_sick(
 	case XFS_BTNUM_BMAP:
 		xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork);
 		return;
+	case XFS_BTNUM_RTRMAP:
+		xfs_rtgroup_mark_sick(cur->bc_ino.rtg, XFS_SICK_RT_RMAPBT);
+		return;
 	case XFS_BTNUM_BNO:
 		mask = XFS_SICK_AG_BNOBT;
 		break;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f6b23439b674d..37156cf8acd25 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1781,6 +1781,7 @@ xfs_rtmount_rmapbt(
 		goto out_path;
 
 	if (ino == NULLFSINO) {
+		xfs_rtgroup_mark_sick(rtg, XFS_SICK_RT_RMAPBT);
 		error = -EFSCORRUPTED;
 		goto out_path;
 	}
@@ -1790,6 +1791,7 @@ xfs_rtmount_rmapbt(
 		goto out_path;
 
 	if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_RMAP)) {
+		xfs_rtgroup_mark_sick(rtg, XFS_SICK_RT_RMAPBT);
 		error = -EFSCORRUPTED;
 		goto out_rele;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/39] xfs: fix scrub tracepoints when inode-rooted btrees are involved
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 21:37   ` [PATCH 22/39] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
@ 2023-12-31 21:37   ` Darrick J. Wong
  2023-12-31 21:38   ` [PATCH 24/39] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
                     ` (15 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:37 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Fix a minor mistakes in the scrub tracepoints that can manifest when
inode-rooted btrees are enabled.  The existing code worked fine for bmap
btrees, but we should tighten the code up to be less sloppy.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/trace.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index bdcd77c839317..822fcdfd89a4b 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -611,7 +611,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
 	TP_fast_assign(
 		xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
 		__entry->dev = sc->mp->m_super->s_dev;
-		__entry->ino = sc->ip->i_ino;
+		__entry->ino = cur->bc_ino.ip->i_ino;
 		__entry->whichfork = cur->bc_ino.whichfork;
 		__entry->type = sc->sm->sm_type;
 		__entry->btnum = cur->bc_btnum;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/39] xfs: allow queued realtime intents to drain before scrubbing
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 21:37   ` [PATCH 23/39] xfs: fix scrub tracepoints when inode-rooted btrees are involved Darrick J. Wong
@ 2023-12-31 21:38   ` Darrick J. Wong
  2023-12-31 21:38   ` [PATCH 25/39] xfs: scrub the realtime rmapbt Darrick J. Wong
                     ` (14 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:38 UTC (permalink / raw)
  To: djwong; +Cc: Christoph Hellwig, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When a writer thread executes a chain of log intent items for the
realtime volume, the ILOCKs taken during each step are for each rt
metadata file, not the entire rt volume itself.  Although scrub takes
all rt metadata ILOCKs, this isn't sufficient to guard against scrub
checking the rt volume while that writer thread is in the middle of
finishing a chain because there's no higher level locking primitive
guarding the realtime volume.

When there's a collision, cross-referencing between data structures
(e.g. rtrmapbt and rtrefcountbt) yields false corruption events; if
repair is running, this results in incorrect repairs, which is
catastrophic.

Fix this by adding to the mount structure the same drain that we use to
protect scrub against concurrent AG updates, but this time for the
realtime volume.

[Contains a few cleanups from hch]

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtgroup.c |    2 +
 fs/xfs/libxfs/xfs_rtgroup.h |    9 ++++
 fs/xfs/scrub/common.c       |   88 +++++++++++++++++++++++++++++++++++++----
 fs/xfs/scrub/rtbitmap.c     |    3 +
 fs/xfs/scrub/scrub.c        |    2 -
 fs/xfs/xfs_bmap_item.c      |   11 ++---
 fs/xfs/xfs_drain.c          |   93 +++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_drain.h          |   28 ++++++++++++-
 fs/xfs/xfs_extfree_item.c   |   14 ++----
 fs/xfs/xfs_mount.h          |    1 
 fs/xfs/xfs_rmap_item.c      |   16 +++----
 fs/xfs/xfs_trace.h          |   32 +++++++++++++++
 12 files changed, 255 insertions(+), 44 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 6ffe1c21a1ea4..7b031f4917349 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -162,6 +162,7 @@ xfs_initialize_rtgroups(
 		/* Place kernel structure only init below this point. */
 		spin_lock_init(&rtg->rtg_state_lock);
 		init_waitqueue_head(&rtg->rtg_active_wq);
+		xfs_defer_drain_init(&rtg->rtg_intents_drain);
 #endif /* __KERNEL__ */
 
 		/* Active ref owned by mount indicates rtgroup is online. */
@@ -216,6 +217,7 @@ xfs_free_rtgroups(
 		spin_unlock(&mp->m_rtgroup_lock);
 		ASSERT(rtg);
 		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+		xfs_defer_drain_free(&rtg->rtg_intents_drain);
 
 		/* drop the mount's active reference */
 		xfs_rtgroup_rele(rtg);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 559a5135820d3..9487c2e00478b 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -39,6 +39,15 @@ struct xfs_rtgroup {
 #ifdef __KERNEL__
 	/* -- kernel only structures below this line -- */
 	spinlock_t		rtg_state_lock;
+
+	/*
+	 * We use xfs_drain to track the number of deferred log intent items
+	 * that have been queued (but not yet processed) so that waiters (e.g.
+	 * scrub) will not lock resources when other threads are in the middle
+	 * of processing a chain of intent items only to find momentary
+	 * inconsistencies.
+	 */
+	struct xfs_defer_drain	rtg_intents_drain;
 #endif /* __KERNEL__ */
 };
 
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 3dee7d717073e..efce0dff2e937 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -776,12 +776,88 @@ xchk_rt_unlock_rtbitmap(
 }
 
 #ifdef CONFIG_XFS_RT
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+static int
+xchk_rtgroup_drain_and_lock(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr,
+	unsigned int		rtglock_flags)
+{
+	int			error = 0;
+
+	ASSERT(sr->rtg != NULL);
+
+	/*
+	 * If we're /only/ locking the rtbitmap in shared mode, then we're
+	 * obviously not trying to compare records in two metadata inodes.
+	 * There's no need to drain intents here because the caller (most
+	 * likely the rgsuper scanner) doesn't need that level of consistency.
+	 */
+	if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+		xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
+		sr->rtlock_flags = rtglock_flags;
+		return 0;
+	}
+
+	do {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
+
+		/*
+		 * If we've grabbed a non-metadata file for scrubbing, we
+		 * assume that holding its ILOCK will suffice to coordinate
+		 * with any rt intent chains involving this inode.
+		 */
+		if (sc->ip && !xfs_is_metadata_inode(sc->ip)) {
+			sr->rtlock_flags = rtglock_flags;
+			return 0;
+		}
+
+		/*
+		 * Decide if the rt group is quiet enough for all metadata to
+		 * be consistent with each other.  Regular file IO doesn't get
+		 * to lock all the rt inodes at the same time, which means that
+		 * there could be other threads in the middle of processing a
+		 * chain of deferred ops.
+		 *
+		 * We just locked all the metadata inodes for this rt group;
+		 * now take a look to see if there are any intents in progress.
+		 * If there are, drop the rt group inode locks and wait for the
+		 * intents to drain.  Since we hold the rt group inode locks
+		 * for the duration of the scrub, this is the only time we have
+		 * to sample the intents counter; any threads increasing it
+		 * after this point can't possibly be in the middle of a chain
+		 * of rt metadata updates.
+		 *
+		 * Obviously, this should be slanted against scrub and in favor
+		 * of runtime threads.
+		 */
+		if (!xfs_rtgroup_intent_busy(sr->rtg)) {
+			sr->rtlock_flags = rtglock_flags;
+			return 0;
+		}
+
+		xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+		if (!(sc->flags & XCHK_FSGATES_DRAIN))
+			return -ECHRNG;
+		error = xfs_rtgroup_intent_drain(sr->rtg);
+		if (error == -ERESTARTSYS)
+			error = -EINTR;
+	} while (!error);
+
+	return error;
+}
+
 /*
  * For scrubbing a realtime group, grab all the in-core resources we'll need to
  * check the metadata, which means taking the ILOCK of the realtime group's
- * metadata inodes.  Callers must not join these inodes to the transaction with
- * non-zero lockflags or concurrency problems will result.  The @rtglock_flags
- * argument takes XFS_RTGLOCK_* flags.
+ * metadata inodes and draining any running intent chains.  Callers must not
+ * join these inodes to the transaction with non-zero lockflags or concurrency
+ * problems will result.  The @rtglock_flags argument takes XFS_RTGLOCK_*
+ * flags.
  */
 int
 xchk_rtgroup_init(
@@ -797,9 +873,7 @@ xchk_rtgroup_init(
 	if (!sr->rtg)
 		return -ENOENT;
 
-	xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
-	sr->rtlock_flags = rtglock_flags;
-	return 0;
+	return xchk_rtgroup_drain_and_lock(sc, sr, rtglock_flags);
 }
 
 /*
@@ -1457,7 +1531,7 @@ xchk_fsgates_enable(
 	trace_xchk_fsgates_enable(sc, scrub_fsgates);
 
 	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
-		xfs_drain_wait_enable();
+		xfs_defer_drain_wait_enable();
 
 	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
 		xfs_dqtrx_hook_enable();
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index aae8b0e6ff281..5bedb09387495 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -84,6 +84,9 @@ xchk_setup_rtbitmap(
 	struct xchk_rtbitmap	*rtb;
 	int			error;
 
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
 	rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
 	if (!rtb)
 		return -ENOMEM;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index a6b7b57fc1df7..58bb52a63782e 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -165,7 +165,7 @@ xchk_fsgates_disable(
 	trace_xchk_fsgates_disable(sc, sc->flags & FSGATES_MASK);
 
 	if (sc->flags & XCHK_FSGATES_DRAIN)
-		xfs_drain_wait_disable();
+		xfs_defer_drain_wait_disable();
 
 	if (sc->flags & XCHK_FSGATES_QUOTA)
 		xfs_dqtrx_hook_disable();
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e50276ceceae9..4522de5f13a8d 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -327,10 +327,8 @@ xfs_bmap_update_get_group(
 {
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
 		if (xfs_has_rtgroups(mp)) {
-			xfs_rgnumber_t	rgno;
-
-			rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
-			bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+			bi->bi_rtg = xfs_rtgroup_intent_get(mp,
+						bi->bi_bmap.br_startblock);
 		} else {
 			bi->bi_rtg = NULL;
 		}
@@ -366,8 +364,9 @@ xfs_bmap_update_put_group(
 	struct xfs_bmap_intent	*bi)
 {
 	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
-		if (xfs_has_rtgroups(bi->bi_owner->i_mount))
-			xfs_rtgroup_put(bi->bi_rtg);
+		if (xfs_has_rtgroups(bi->bi_owner->i_mount)) {
+			xfs_rtgroup_intent_put(bi->bi_rtg);
+		}
 		return;
 	}
 
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
index 7bdb9688c0f5e..306cd133b9a27 100644
--- a/fs/xfs/xfs_drain.c
+++ b/fs/xfs/xfs_drain.c
@@ -11,11 +11,12 @@
 #include "xfs_mount.h"
 #include "xfs_ag.h"
 #include "xfs_trace.h"
+#include "xfs_rtgroup.h"
 
 /*
- * Use a static key here to reduce the overhead of xfs_drain_rele.  If the
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele.  If the
  * compiler supports jump labels, the static branch will be replaced by a nop
- * sled when there are no xfs_drain_wait callers.  Online fsck is currently
+ * sled when there are no xfs_defer_drain_wait callers.  Online fsck is currently
  * the only caller, so this is a reasonable tradeoff.
  *
  * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
@@ -23,18 +24,18 @@
  * XFS callers cannot hold any locks that might be used by memory reclaim or
  * writeback when calling the static_branch_{inc,dec} functions.
  */
-static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate);
 
 void
-xfs_drain_wait_disable(void)
+xfs_defer_drain_wait_disable(void)
 {
-	static_branch_dec(&xfs_drain_waiter_gate);
+	static_branch_dec(&xfs_defer_drain_waiter_gate);
 }
 
 void
-xfs_drain_wait_enable(void)
+xfs_defer_drain_wait_enable(void)
 {
-	static_branch_inc(&xfs_drain_waiter_gate);
+	static_branch_inc(&xfs_defer_drain_waiter_gate);
 }
 
 void
@@ -71,7 +72,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head)
 static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
 {
 	if (atomic_dec_and_test(&dr->dr_count) &&
-	    static_branch_unlikely(&xfs_drain_waiter_gate) &&
+	    static_branch_unlikely(&xfs_defer_drain_waiter_gate) &&
 	    has_waiters(&dr->dr_waiters))
 		wake_up(&dr->dr_waiters);
 }
@@ -164,3 +165,79 @@ xfs_perag_intent_busy(
 {
 	return xfs_defer_drain_busy(&pag->pag_intents_drain);
 }
+
+#ifdef CONFIG_XFS_RT
+
+/*
+ * Get a passive reference to an rtgroup and declare an intent to update its
+ * metadata.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_intent_get(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	rgno = xfs_rtb_to_rgno(mp, rtbno);
+	rtg = xfs_rtgroup_get(mp, rgno);
+	if (!rtg)
+		return NULL;
+
+	xfs_rtgroup_intent_hold(rtg);
+	return rtg;
+}
+
+/*
+ * Release our intent to update this rtgroup's metadata, and then release our
+ * passive ref to the rtgroup.
+ */
+void
+xfs_rtgroup_intent_put(
+	struct xfs_rtgroup	*rtg)
+{
+	xfs_rtgroup_intent_rele(rtg);
+	xfs_rtgroup_put(rtg);
+}
+/*
+ * Declare an intent to update rtgroup metadata.  Other threads that need
+ * exclusive access can decide to back off if they see declared intentions.
+ */
+void
+xfs_rtgroup_intent_hold(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_intent_hold(rtg, __return_address);
+	xfs_defer_drain_grab(&rtg->rtg_intents_drain);
+}
+
+/* Release our intent to update this rtgroup's metadata. */
+void
+xfs_rtgroup_intent_rele(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_intent_rele(rtg, __return_address);
+	xfs_defer_drain_rele(&rtg->rtg_intents_drain);
+}
+
+/*
+ * Wait for the intent update count for this rtgroup to hit zero.
+ * Callers must not hold any rt metadata inode locks.
+ */
+int
+xfs_rtgroup_intent_drain(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_wait_intents(rtg, __return_address);
+	return xfs_defer_drain_wait(&rtg->rtg_intents_drain);
+}
+
+/* Has anyone declared an intent to update this rtgroup? */
+bool
+xfs_rtgroup_intent_busy(
+	struct xfs_rtgroup	*rtg)
+{
+	return xfs_defer_drain_busy(&rtg->rtg_intents_drain);
+}
+#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
index 775164f54ea6d..b02ff87a873a1 100644
--- a/fs/xfs/xfs_drain.h
+++ b/fs/xfs/xfs_drain.h
@@ -7,6 +7,7 @@
 #define XFS_DRAIN_H_
 
 struct xfs_perag;
+struct xfs_rtgroup;
 
 #ifdef CONFIG_XFS_DRAIN_INTENTS
 /*
@@ -25,8 +26,8 @@ struct xfs_defer_drain {
 void xfs_defer_drain_init(struct xfs_defer_drain *dr);
 void xfs_defer_drain_free(struct xfs_defer_drain *dr);
 
-void xfs_drain_wait_disable(void);
-void xfs_drain_wait_enable(void);
+void xfs_defer_drain_wait_disable(void);
+void xfs_defer_drain_wait_enable(void);
 
 /*
  * Deferred Work Intent Drains
@@ -60,6 +61,9 @@ void xfs_drain_wait_enable(void);
  * All functions that create work items must increment the intent counter as
  * soon as the item is added to the transaction and cannot drop the counter
  * until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
  */
 struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
 		xfs_fsblock_t fsbno);
@@ -70,6 +74,7 @@ void xfs_perag_intent_rele(struct xfs_perag *pag);
 
 int xfs_perag_intent_drain(struct xfs_perag *pag);
 bool xfs_perag_intent_busy(struct xfs_perag *pag);
+
 #else
 struct xfs_defer_drain { /* empty */ };
 
@@ -85,4 +90,23 @@ static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { }
 
 #endif /* CONFIG_XFS_DRAIN_INTENTS */
 
+#if defined(CONFIG_XFS_DRAIN_INTENTS) && defined(CONFIG_XFS_RT)
+struct xfs_rtgroup *xfs_rtgroup_intent_get(struct xfs_mount *mp,
+		xfs_rtblock_t rtbno);
+void xfs_rtgroup_intent_put(struct xfs_rtgroup *rtg);
+
+void xfs_rtgroup_intent_hold(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_intent_rele(struct xfs_rtgroup *rtg);
+
+int xfs_rtgroup_intent_drain(struct xfs_rtgroup *rtg);
+bool xfs_rtgroup_intent_busy(struct xfs_rtgroup *rtg);
+#else
+#define xfs_rtgroup_intent_get(mp, rtbno) \
+	xfs_rtgroup_get(mp, xfs_rtb_to_rgno((mp), (rtbno)))
+#define xfs_rtgroup_intent_put(rtg)		xfs_rtgroup_put(rtg)
+static inline void xfs_rtgroup_intent_hold(struct xfs_rtgroup *rtg) { }
+static inline void xfs_rtgroup_intent_rele(struct xfs_rtgroup *rtg) { }
+#endif /* CONFIG_XFS_DRAIN_INTENTS && CONFIG_XFS_RT */
+
+
 #endif /* XFS_DRAIN_H_ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 51a363d85978f..a17d2bf05b414 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -468,11 +468,8 @@ xfs_extent_free_defer_add(
 	trace_xfs_extent_free_defer(mp, xefi);
 
 	if (xfs_efi_is_realtime(xefi)) {
-		xfs_rgnumber_t		rgno;
-
-		rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
-		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
-
+		xefi->xefi_rtg = xfs_rtgroup_intent_get(mp,
+						xefi->xefi_startblock);
 		*dfpp = xfs_defer_add(tp, &xefi->xefi_list,
 				&xfs_rtextent_free_defer_type);
 		return;
@@ -615,11 +612,8 @@ xfs_efi_recover_work(
 	xefi->xefi_agresv = XFS_AG_RESV_NONE;
 	xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
 	if (isrt) {
-		xfs_rgnumber_t		rgno;
-
 		xefi->xefi_flags |= XFS_EFI_REALTIME;
-		rgno = xfs_rtb_to_rgno(mp, extp->ext_start);
-		xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+		xefi->xefi_rtg = xfs_rtgroup_intent_get(mp, extp->ext_start);
 	} else {
 		xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
 	}
@@ -778,7 +772,7 @@ xfs_rtextent_free_cancel_item(
 {
 	struct xfs_extent_free_item	*xefi = xefi_entry(item);
 
-	xfs_rtgroup_put(xefi->xefi_rtg);
+	xfs_rtgroup_intent_put(xefi->xefi_rtg);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ada8b281d74e2..21c889a723820 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_rtgroup;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 580baf3b1b1d3..f4270efc9dc5e 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -383,13 +383,12 @@ xfs_rmap_defer_add(
 	 * section updates.
 	 */
 	if (ri->ri_realtime) {
-		xfs_rgnumber_t	rgno;
-
-		rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
-		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp,
+						ri->ri_bmap.br_startblock);
 		xfs_defer_add(tp, &ri->ri_list, &xfs_rtrmap_update_defer_type);
 	} else {
-		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+		ri->ri_pag = xfs_perag_intent_get(mp,
+						ri->ri_bmap.br_startblock);
 		xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
 	}
 }
@@ -538,10 +537,7 @@ xfs_rui_recover_work(
 			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
 	ri->ri_realtime = isrt;
 	if (isrt) {
-		xfs_rgnumber_t	rgno;
-
-		rgno = xfs_rtb_to_rgno(mp, map->me_startblock);
-		ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp, map->me_startblock);
 	} else {
 		ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
 	}
@@ -684,7 +680,7 @@ xfs_rtrmap_update_cancel_item(
 {
 	struct xfs_rmap_intent		*ri = ri_entry(item);
 
-	xfs_rtgroup_put(ri->ri_rtg);
+	xfs_rtgroup_intent_put(ri->ri_rtg);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 }
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2dc991730c303..05d8ff68b09e2 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -4952,6 +4952,38 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold);
 DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele);
 DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_rtgroup_intents_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip),
+	TP_ARGS(rtg, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(long, nr_intents)
+		__field(void *, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+		__entry->nr_intents = atomic_read(&rtg->rtg_intents_drain.dr_count);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d intents %ld caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->nr_intents,
+		  __entry->caller_ip)
+);
+
+#define DEFINE_RTGROUP_INTENTS_EVENT(name)	\
+DEFINE_EVENT(xfs_rtgroup_intents_class, name,					\
+	TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip), \
+	TP_ARGS(rtg, caller_ip))
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_intent_hold);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_intent_rele);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_wait_intents);
+#endif /* CONFIG_XFS_RT */
+
 #endif /* CONFIG_XFS_DRAIN_INTENTS */
 
 TRACE_EVENT(xfs_swapext_overhead,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/39] xfs: scrub the realtime rmapbt
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 21:38   ` [PATCH 24/39] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
@ 2023-12-31 21:38   ` Darrick J. Wong
  2023-12-31 21:38   ` [PATCH 26/39] xfs: cross-reference realtime bitmap to realtime rmapbt scrubber Darrick J. Wong
                     ` (13 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:38 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check the realtime reverse mapping btree against the rtbitmap, and
modify the rtbitmap scrub to check against the rtrmapbt.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile             |    1 
 fs/xfs/libxfs/xfs_fs.h      |    3 -
 fs/xfs/scrub/bmap.c         |    1 
 fs/xfs/scrub/bmap_repair.c  |    1 
 fs/xfs/scrub/common.c       |   78 +++++++++++++++++
 fs/xfs/scrub/common.h       |   10 ++
 fs/xfs/scrub/health.c       |    1 
 fs/xfs/scrub/inode.c        |   10 +-
 fs/xfs/scrub/inode_repair.c |    7 +-
 fs/xfs/scrub/repair.c       |    1 
 fs/xfs/scrub/rtrmap.c       |  193 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c        |    9 ++
 fs/xfs/scrub/scrub.h        |    5 +
 fs/xfs/scrub/stats.c        |    1 
 fs/xfs/scrub/trace.h        |    4 +
 15 files changed, 313 insertions(+), 12 deletions(-)
 create mode 100644 fs/xfs/scrub/rtrmap.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 50a7929982fdd..69ca5cb7e4000 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -193,6 +193,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 				   rgsuper.o \
 				   rtbitmap.o \
+				   rtrmap.o \
 				   rtsummary.o \
 				   )
 
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 102b927336057..dcf048aae8c17 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -737,9 +737,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 #define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
+#define XFS_SCRUB_TYPE_RTRMAPBT	32	/* rtgroup reverse mapping btree */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	32
+#define XFS_SCRUB_TYPE_NR	33
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b90f0d8540ba5..8fa51350facc4 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -953,6 +953,7 @@ xchk_bmap(
 	case XFS_DINODE_FMT_UUID:
 	case XFS_DINODE_FMT_DEV:
 	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_RMAP:
 		/* No mappings to check. */
 		if (whichfork == XFS_COW_FORK)
 			xchk_fblock_set_corrupt(sc, whichfork, 0);
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 8a6dc7ff2f79e..9dddc5997b4fc 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -728,6 +728,7 @@ xrep_bmap_check_inputs(
 	case XFS_DINODE_FMT_DEV:
 	case XFS_DINODE_FMT_LOCAL:
 	case XFS_DINODE_FMT_UUID:
+	case XFS_DINODE_FMT_RMAP:
 		return -ECANCELED;
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index efce0dff2e937..5ea9f3f310335 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -35,6 +35,8 @@
 #include "xfs_swapext.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_bmap_util.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -866,6 +868,8 @@ xchk_rtgroup_init(
 	struct xchk_rt		*sr,
 	unsigned int		rtglock_flags)
 {
+	int			error;
+
 	ASSERT(sr->rtg == NULL);
 	ASSERT(sr->rtlock_flags == 0);
 
@@ -873,7 +877,30 @@ xchk_rtgroup_init(
 	if (!sr->rtg)
 		return -ENOENT;
 
-	return xchk_rtgroup_drain_and_lock(sc, sr, rtglock_flags);
+	error = xchk_rtgroup_drain_and_lock(sc, sr, rtglock_flags);
+	if (error)
+		return error;
+
+	if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP))
+		sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp,
+				sr->rtg, sr->rtg->rtg_rmapip);
+
+	return 0;
+}
+
+/*
+ * Free all the btree cursors and other incore data relating to the realtime
+ * group.  This has to be done /before/ committing (or cancelling) the scrub
+ * transaction.
+ */
+void
+xchk_rtgroup_btcur_free(
+	struct xchk_rt		*sr)
+{
+	if (sr->rmap_cur)
+		xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR);
+
+	sr->rmap_cur = NULL;
 }
 
 /*
@@ -961,6 +988,14 @@ xchk_setup_fs(
 	return xchk_trans_alloc(sc, resblks);
 }
 
+/* Set us up with a transaction and an empty context to repair rt metadata. */
+int
+xchk_setup_rt(
+	struct xfs_scrub	*sc)
+{
+	return xchk_trans_alloc(sc, 0);
+}
+
 /* Set us up with AG headers and btree cursors. */
 int
 xchk_setup_ag_btree(
@@ -1696,3 +1731,44 @@ xchk_inode_is_allocated(
 	rcu_read_unlock();
 	return error;
 }
+
+/* Count the blocks used by a file, even if it's a metadata inode. */
+int
+xchk_inode_count_blocks(
+	struct xfs_scrub	*sc,
+	int			whichfork,
+	xfs_extnum_t		*nextents,
+	xfs_filblks_t		*count)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, whichfork);
+	struct xfs_btree_cur	*cur;
+	xfs_extlen_t		btblocks;
+	int			error;
+
+	if (!ifp) {
+		*nextents = 0;
+		*count = 0;
+		return 0;
+	}
+
+	switch (ifp->if_format) {
+	case XFS_DINODE_FMT_RMAP:
+		if (!sc->sr.rtg) {
+			ASSERT(0);
+			return -EFSCORRUPTED;
+		}
+		cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp, sc->sr.rtg,
+				sc->ip);
+		error = xfs_btree_count_blocks(cur, &btblocks);
+		xfs_btree_del_cursor(cur, error);
+		if (error)
+			return error;
+
+		*nextents = 0;
+		*count = btblocks - 1;
+		return 0;
+	default:
+		return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork,
+				nextents, count);
+	}
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index f96dd658feab9..3251f80c00e15 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -65,6 +65,7 @@ static inline int xchk_setup_nothing(struct xfs_scrub *sc)
 /* Setup functions */
 int xchk_setup_agheader(struct xfs_scrub *sc);
 int xchk_setup_fs(struct xfs_scrub *sc);
+int xchk_setup_rt(struct xfs_scrub *sc);
 int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
 int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
 int xchk_setup_ag_rmapbt(struct xfs_scrub *sc);
@@ -83,11 +84,13 @@ int xchk_setup_rtbitmap(struct xfs_scrub *sc);
 int xchk_setup_rtsummary(struct xfs_scrub *sc);
 int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
 int xchk_setup_rgbitmap(struct xfs_scrub *sc);
+int xchk_setup_rtrmapbt(struct xfs_scrub *sc);
 #else
 # define xchk_setup_rtbitmap		xchk_setup_nothing
 # define xchk_setup_rtsummary		xchk_setup_nothing
 # define xchk_setup_rgsuperblock	xchk_setup_nothing
 # define xchk_setup_rgbitmap		xchk_setup_nothing
+# define xchk_setup_rtrmapbt		xchk_setup_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -148,14 +151,17 @@ void xchk_rt_unlock_rtbitmap(struct xfs_scrub *sc);
 #ifdef CONFIG_XFS_RT
 
 /* All the locks we need to check an rtgroup. */
-#define XCHK_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP_SHARED)
+#define XCHK_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP_SHARED | \
+				 XFS_RTGLOCK_RMAP)
 
 int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
 		struct xchk_rt *sr, unsigned int rtglock_flags);
 void xchk_rtgroup_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
+void xchk_rtgroup_btcur_free(struct xchk_rt *sr);
 void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
 #else
 # define xchk_rtgroup_init(sc, rgno, sr, lockflags)	(-ENOSYS)
+# define xchk_rtgroup_btcur_free(sr)			((void)0)
 # define xchk_rtgroup_free(sc, sr)			((void)0)
 #endif /* CONFIG_XFS_RT */
 
@@ -282,5 +288,7 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
 
 int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
 		bool *inuse);
+int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork,
+		xfs_extnum_t *nextents, xfs_filblks_t *count);
 
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 7fccb1a03060a..0fec833057697 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -115,6 +115,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_DIRTREE]	= { XHG_INO, XFS_SICK_INO_DIRTREE },
 	[XFS_SCRUB_TYPE_METAPATH]	= { XHG_FS,  XFS_SICK_FS_METAPATH },
 	[XFS_SCRUB_TYPE_RGSUPER]	= { XHG_RTGROUP, XFS_SICK_RT_SUPER },
+	[XFS_SCRUB_TYPE_RTRMAPBT]	= { XHG_RTGROUP, XFS_SICK_RT_RMAPBT },
 };
 
 /* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 7357bf5156ba3..5fc10e02b9c41 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -496,6 +496,10 @@ xchk_dinode(
 		if (!S_ISREG(mode) && !S_ISDIR(mode))
 			xchk_ino_set_corrupt(sc, ino);
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (!S_ISREG(mode))
+			xchk_ino_set_corrupt(sc, ino);
+		break;
 	case XFS_DINODE_FMT_UUID:
 	default:
 		xchk_ino_set_corrupt(sc, ino);
@@ -680,15 +684,13 @@ xchk_inode_xref_bmap(
 		return;
 
 	/* Walk all the extents to check nextents/naextents/nblocks. */
-	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
-			&nextents, &count);
+	error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
 	if (!xchk_should_check_xref(sc, &error, NULL))
 		return;
 	if (nextents < xfs_dfork_data_extents(dip))
 		xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
 
-	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
-			&nextents, &acount);
+	error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount);
 	if (!xchk_should_check_xref(sc, &error, NULL))
 		return;
 	if (nextents != xfs_dfork_attr_extents(dip))
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 30be423b3716e..2b3a6cbadae71 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1446,8 +1446,7 @@ xrep_inode_blockcounts(
 	trace_xrep_inode_blockcounts(sc);
 
 	/* Set data fork counters from the data fork mappings. */
-	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
-			&nextents, &count);
+	error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
 	if (error)
 		return error;
 	if (xfs_is_reflink_inode(sc->ip)) {
@@ -1471,8 +1470,8 @@ xrep_inode_blockcounts(
 	/* Set attr fork counters from the attr fork mappings. */
 	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
 	if (ifp) {
-		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
-				&nextents, &acount);
+		error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents,
+				&acount);
 		if (error)
 			return error;
 		if (count >= sc->mp->m_sb.sb_dblocks)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 24d99891a634c..cad45d5b86e2e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -61,6 +61,7 @@ xrep_attempt(
 	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
 
 	xchk_ag_btcur_free(&sc->sa);
+	xchk_rtgroup_btcur_free(&sc->sr);
 
 	/* Repair whatever's broken. */
 	ASSERT(sc->ops->repair);
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
new file mode 100644
index 0000000000000..7c7da0b232321
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_imeta.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xchk_setup_rtrmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg;
+	int			error = 0;
+
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	rtg = xfs_rtgroup_get(mp, sc->sm->sm_agno);
+	if (!rtg)
+		return -ENOENT;
+
+	error = xchk_setup_rt(sc);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_install_live_inode(sc, rtg->rtg_rmapip);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_ino_dqattach(sc);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_rtgroup_init(sc, rtg->rtg_rgno, &sc->sr, XCHK_RTGLOCK_ALL);
+out_rtg:
+	xfs_rtgroup_put(rtg);
+	return error;
+}
+
+/* Realtime reverse mapping. */
+
+struct xchk_rtrmap {
+	/*
+	 * The furthest-reaching of the rmapbt records that we've already
+	 * processed.  This enables us to detect overlapping records for space
+	 * allocations that cannot be shared.
+	 */
+	struct xfs_rmap_irec	overlap_rec;
+
+	/*
+	 * The previous rmapbt record, so that we can check for two records
+	 * that could be one.
+	 */
+	struct xfs_rmap_irec	prev_rec;
+};
+
+/* Flag failures for records that overlap but cannot. */
+STATIC void
+xchk_rtrmapbt_check_overlapping(
+	struct xchk_btree		*bs,
+	struct xchk_rtrmap		*cr,
+	const struct xfs_rmap_irec	*irec)
+{
+	xfs_rtblock_t			pnext, inext;
+
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	/* No previous record? */
+	if (cr->overlap_rec.rm_blockcount == 0)
+		goto set_prev;
+
+	/* Do overlap_rec and irec overlap? */
+	pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount;
+	if (pnext <= irec->rm_startblock)
+		goto set_prev;
+
+	xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	/* Save whichever rmap record extends furthest. */
+	inext = irec->rm_startblock + irec->rm_blockcount;
+	if (pnext > inext)
+		return;
+
+set_prev:
+	memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Decide if two reverse-mapping records can be merged. */
+static inline bool
+xchk_rtrmap_mergeable(
+	struct xchk_rtrmap		*cr,
+	const struct xfs_rmap_irec	*r2)
+{
+	const struct xfs_rmap_irec	*r1 = &cr->prev_rec;
+
+	/* Ignore if prev_rec is not yet initialized. */
+	if (cr->prev_rec.rm_blockcount == 0)
+		return false;
+
+	if (r1->rm_owner != r2->rm_owner)
+		return false;
+	if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
+		return false;
+	if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
+	    XFS_RMAP_LEN_MAX)
+		return false;
+	if (r1->rm_flags != r2->rm_flags)
+		return false;
+	return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrmapbt_check_mergeable(
+	struct xchk_btree		*bs,
+	struct xchk_rtrmap		*cr,
+	const struct xfs_rmap_irec	*irec)
+{
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	if (xchk_rtrmap_mergeable(cr, irec))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Scrub a realtime rmapbt record. */
+STATIC int
+xchk_rtrmapbt_rec(
+	struct xchk_btree		*bs,
+	const union xfs_btree_rec	*rec)
+{
+	struct xchk_rtrmap		*cr = bs->private;
+	struct xfs_rmap_irec		irec;
+
+	if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
+	    xfs_rtrmap_check_irec(bs->cur->bc_ino.rtg, &irec) != NULL) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	xchk_rtrmapbt_check_mergeable(bs, cr, &irec);
+	xchk_rtrmapbt_check_overlapping(bs, cr, &irec);
+	return 0;
+}
+
+/* Scrub the realtime rmap btree. */
+int
+xchk_rtrmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_owner_info	oinfo;
+	struct xchk_rtrmap	cr = { };
+	int			error;
+
+	error = xchk_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	xfs_rmap_ino_bmbt_owner(&oinfo, sc->sr.rtg->rtg_rmapip->i_ino,
+			XFS_DATA_FORK);
+	return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr);
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 58bb52a63782e..428624d4f0c45 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -223,6 +223,8 @@ xchk_teardown(
 	int			error)
 {
 	xchk_ag_free(sc, &sc->sa);
+	xchk_rtgroup_btcur_free(&sc->sr);
+
 	if (sc->tp) {
 		if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 			error = xfs_trans_commit(sc->tp);
@@ -472,6 +474,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.has	= xfs_has_rtgroups,
 		.repair = xrep_notsupported,
 	},
+	[XFS_SCRUB_TYPE_RTRMAPBT] = {	/* realtime group rmapbt */
+		.type	= ST_RTGROUP,
+		.setup	= xchk_setup_rtrmapbt,
+		.scrub	= xchk_rtrmapbt,
+		.has	= xfs_has_rtrmapbt,
+		.repair	= xrep_notsupported,
+	},
 };
 
 static int
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 76eca41a8995a..52bbc1fbcc560 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -122,6 +122,9 @@ struct xchk_rt {
 	 * if rtg != NULL.
 	 */
 	unsigned int		rtlock_flags;
+
+	/* rtgroup btrees */
+	struct xfs_btree_cur	*rmap_cur;
 };
 
 struct xfs_scrub {
@@ -278,11 +281,13 @@ int xchk_rtbitmap(struct xfs_scrub *sc);
 int xchk_rtsummary(struct xfs_scrub *sc);
 int xchk_rgsuperblock(struct xfs_scrub *sc);
 int xchk_rgbitmap(struct xfs_scrub *sc);
+int xchk_rtrmapbt(struct xfs_scrub *sc);
 #else
 # define xchk_rtbitmap		xchk_nothing
 # define xchk_rtsummary		xchk_nothing
 # define xchk_rgsuperblock	xchk_nothing
 # define xchk_rgbitmap		xchk_nothing
+# define xchk_rtrmapbt		xchk_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_quota(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 4bdff9a19dd6c..0da7ecabfe9d9 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -83,6 +83,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_METAPATH]	= "metapath",
 	[XFS_SCRUB_TYPE_RGSUPER]	= "rgsuper",
 	[XFS_SCRUB_TYPE_RGBITMAP]	= "rgbitmap",
+	[XFS_SCRUB_TYPE_RTRMAPBT]	= "rtrmapbt",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 822fcdfd89a4b..f02914f129605 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -86,6 +86,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGBITMAP);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -120,7 +121,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGBITMAP);
 	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }, \
 	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }, \
 	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }, \
-	{ XFS_SCRUB_TYPE_RGBITMAP,	"rgbitmap" }
+	{ XFS_SCRUB_TYPE_RGBITMAP,	"rgbitmap" }, \
+	{ XFS_SCRUB_TYPE_RTRMAPBT,	"rtrmapbt" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/39] xfs: cross-reference realtime bitmap to realtime rmapbt scrubber
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 21:38   ` [PATCH 25/39] xfs: scrub the realtime rmapbt Darrick J. Wong
@ 2023-12-31 21:38   ` Darrick J. Wong
  2023-12-31 21:38   ` [PATCH 27/39] xfs: cross-reference the realtime rmapbt Darrick J. Wong
                     ` (12 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:38 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're checking the realtime rmap btree entries, cross-reference
those entries with the realtime bitmap too.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/rtrmap.c |   18 ++++++++++++++++++
 1 file changed, 18 insertions(+)


diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index 7c7da0b232321..6009d458605e4 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -151,6 +151,23 @@ xchk_rtrmapbt_check_mergeable(
 	memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
 }
 
+/* Cross-reference with other metadata. */
+STATIC void
+xchk_rtrmapbt_xref(
+	struct xfs_scrub	*sc,
+	struct xfs_rmap_irec	*irec)
+{
+	xfs_rtblock_t		rtbno;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno,
+			irec->rm_startblock);
+
+	xchk_xref_is_used_rt_space(sc, rtbno, irec->rm_blockcount);
+}
+
 /* Scrub a realtime rmapbt record. */
 STATIC int
 xchk_rtrmapbt_rec(
@@ -171,6 +188,7 @@ xchk_rtrmapbt_rec(
 
 	xchk_rtrmapbt_check_mergeable(bs, cr, &irec);
 	xchk_rtrmapbt_check_overlapping(bs, cr, &irec);
+	xchk_rtrmapbt_xref(bs->sc, &irec);
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/39] xfs: cross-reference the realtime rmapbt
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 21:38   ` [PATCH 26/39] xfs: cross-reference realtime bitmap to realtime rmapbt scrubber Darrick J. Wong
@ 2023-12-31 21:38   ` Darrick J. Wong
  2023-12-31 21:39   ` [PATCH 28/39] xfs: scan rt rmap when we're doing an intense rmap check of bmbt mappings Darrick J. Wong
                     ` (11 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:38 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the data fork and realtime bitmap scrubbers to cross-reference
information with the realtime rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap.c     |   72 ++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/scrub/rtbitmap.c |   58 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtbitmap.h |    3 ++
 fs/xfs/scrub/rtrmap.c   |   65 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.h    |    9 ++++++
 5 files changed, 190 insertions(+), 17 deletions(-)


diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 8fa51350facc4..604c4df2fdb34 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -20,6 +20,7 @@
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_health.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -142,15 +143,22 @@ static inline bool
 xchk_bmap_get_rmap(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec,
-	xfs_agblock_t		agbno,
+	xfs_agblock_t		bno,
 	uint64_t		owner,
 	struct xfs_rmap_irec	*rmap)
 {
+	struct xfs_btree_cur	**curp = &info->sc->sa.rmap_cur;
 	xfs_fileoff_t		offset;
 	unsigned int		rflags = 0;
 	int			has_rmap;
 	int			error;
 
+	if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork))
+		curp = &info->sc->sr.rmap_cur;
+
+	if (*curp == NULL)
+		return false;
+
 	if (info->whichfork == XFS_ATTR_FORK)
 		rflags |= XFS_RMAP_ATTR_FORK;
 	if (irec->br_state == XFS_EXT_UNWRITTEN)
@@ -171,13 +179,13 @@ xchk_bmap_get_rmap(
 	 * range rmap lookup to make sure we get the correct owner/offset.
 	 */
 	if (info->is_shared) {
-		error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
-				owner, offset, rflags, rmap, &has_rmap);
+		error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset,
+				rflags, rmap, &has_rmap);
 	} else {
-		error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
-				owner, offset, rflags, rmap, &has_rmap);
+		error = xfs_rmap_lookup_le(*curp, bno, owner, offset,
+				rflags, rmap, &has_rmap);
 	}
-	if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
+	if (!xchk_should_check_xref(info->sc, &error, curp))
 		return false;
 
 	if (!has_rmap)
@@ -191,29 +199,29 @@ STATIC void
 xchk_bmap_xref_rmap(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec,
-	xfs_agblock_t		agbno)
+	xfs_agblock_t		bno)
 {
 	struct xfs_rmap_irec	rmap;
 	unsigned long long	rmap_end;
 	uint64_t		owner = info->sc->ip->i_ino;
 
-	if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+	if (xchk_skip_xref(info->sc->sm))
 		return;
 
 	/* Find the rmap record for this irec. */
-	if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+	if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
 		return;
 
 	/*
 	 * The rmap must be an exact match for this incore file mapping record,
 	 * which may have arisen from multiple ondisk records.
 	 */
-	if (rmap.rm_startblock != agbno)
+	if (rmap.rm_startblock != bno)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
 	rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
-	if (rmap_end != agbno + irec->br_blockcount)
+	if (rmap_end != bno + irec->br_blockcount)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
@@ -258,7 +266,7 @@ STATIC void
 xchk_bmap_xref_rmap_cow(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec,
-	xfs_agblock_t		agbno)
+	xfs_agblock_t		bno)
 {
 	struct xfs_rmap_irec	rmap;
 	unsigned long long	rmap_end;
@@ -268,7 +276,7 @@ xchk_bmap_xref_rmap_cow(
 		return;
 
 	/* Find the rmap record for this irec. */
-	if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+	if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
 		return;
 
 	/*
@@ -276,12 +284,12 @@ xchk_bmap_xref_rmap_cow(
 	 * can start before and end after the physical space allocated to this
 	 * mapping.  There are no offsets to check.
 	 */
-	if (rmap.rm_startblock > agbno)
+	if (rmap.rm_startblock > bno)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
 	rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
-	if (rmap_end < agbno + irec->br_blockcount)
+	if (rmap_end < bno + irec->br_blockcount)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
@@ -314,10 +322,40 @@ xchk_bmap_rt_iextent_xref(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
-	xchk_rt_init(info->sc, &info->sc->sr, XCHK_RTLOCK_BITMAP_SHARED);
+	struct xfs_owner_info	oinfo;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno;
+	int			error;
+
+	if (!xfs_has_rtrmapbt(mp)) {
+		xchk_rt_init(info->sc, &info->sc->sr,
+				XCHK_RTLOCK_BITMAP_SHARED);
+		xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
+				irec->br_blockcount);
+		xchk_rt_unlock(info->sc, &info->sc->sr);
+		return;
+	}
+
+	rgbno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
+	error = xchk_rtgroup_init(info->sc, rgno, &info->sc->sr,
+			XCHK_RTGLOCK_ALL);
+	if (!xchk_fblock_process_error(info->sc, info->whichfork,
+			irec->br_startoff, &error))
+		goto out_free;
+
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
-	xchk_rt_unlock(info->sc, &info->sc->sr);
+	xchk_bmap_xref_rmap(info, irec, rgbno);
+
+	xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork,
+			irec->br_startoff);
+	xchk_xref_is_only_rt_owned_by(info->sc, rgbno, irec->br_blockcount,
+			&oinfo);
+
+out_free:
+	xchk_rtgroup_btcur_free(&info->sc->sr);
+	xchk_rtgroup_free(info->sc, &info->sc->sr);
 }
 
 /* Cross-reference a single datadev extent record. */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 5bedb09387495..0d823eadbdba0 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,6 +9,7 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_btree.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_rtbitmap.h"
@@ -16,10 +17,13 @@
 #include "xfs_bmap.h"
 #include "xfs_bit.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
 #include "scrub/rtbitmap.h"
+#include "scrub/btree.h"
 
 static inline void
 xchk_rtbitmap_compute_geometry(
@@ -123,6 +127,36 @@ xchk_setup_rtbitmap(
 
 /* Per-rtgroup bitmap contents. */
 
+/* Cross-reference rtbitmap entries with other metadata. */
+STATIC void
+xchk_rgbitmap_xref(
+	struct xchk_rgbitmap	*rgb,
+	xfs_rtblock_t		startblock,
+	xfs_rtblock_t		blockcount)
+{
+	struct xfs_scrub	*sc = rgb->sc;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+	if (!sc->sr.rmap_cur)
+		return;
+
+	rgbno = xfs_rtb_to_rgbno(sc->mp, startblock, &rgno);
+	xchk_xref_has_no_rt_owner(sc, rgbno, blockcount);
+
+	if (rgb->next_free_rtblock < startblock) {
+		xfs_rgblock_t	next_rgbno;
+
+		next_rgbno = xfs_rtb_to_rgbno(sc->mp, rgb->next_free_rtblock,
+				&rgno);
+		xchk_xref_has_rt_owner(sc, next_rgbno, rgbno - next_rgbno);
+	}
+
+	rgb->next_free_rtblock = startblock + blockcount;
+}
+
 /* Scrub a free extent record from the realtime bitmap. */
 STATIC int
 xchk_rgbitmap_rec(
@@ -141,6 +175,12 @@ xchk_rgbitmap_rec(
 
 	if (!xfs_verify_rtbext(mp, startblock, blockcount))
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+	xchk_rgbitmap_xref(rgb, startblock, blockcount);
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return -ECANCELED;
+
 	return 0;
 }
 
@@ -154,6 +194,7 @@ xchk_rgbitmap(
 	struct xfs_rtgroup	*rtg = sc->sr.rtg;
 	struct xchk_rgbitmap	*rgb = sc->buf;
 	xfs_rtblock_t		rtbno;
+	xfs_rtblock_t		last_rtbno;
 	xfs_rgblock_t		last_rgbno = rtg->rtg_blockcount - 1;
 	int			error;
 
@@ -168,6 +209,7 @@ xchk_rgbitmap(
 	 * realtime group.
 	 */
 	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, 0);
+	rgb->next_free_rtblock = rtbno;
 	keys[0].ar_startext = xfs_rtb_to_rtx(mp, rtbno);
 
 	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, last_rgbno);
@@ -179,6 +221,22 @@ xchk_rgbitmap(
 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
 		return error;
 
+	/*
+	 * Check that the are rmappings for all rt extents between the end of
+	 * the last free extent we saw and the last possible extent in the rt
+	 * group.
+	 */
+	last_rtbno = xfs_rgbno_to_rtb(sc->mp, rtg->rtg_rgno, last_rgbno);
+	if (rgb->next_free_rtblock < last_rtbno) {
+		xfs_rgnumber_t	rgno;
+		xfs_rgblock_t	next_rgbno;
+
+		next_rgbno = xfs_rtb_to_rgbno(sc->mp, rgb->next_free_rtblock,
+				&rgno);
+		xchk_xref_has_rt_owner(sc, next_rgbno,
+				last_rgbno - next_rgbno);
+	}
+
 	return 0;
 }
 
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index f659f0e76b4fa..42d50fa48a0ec 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -17,6 +17,9 @@ struct xchk_rgbitmap {
 	struct xfs_scrub	*sc;
 
 	struct xchk_rtbitmap	rtb;
+
+	/* The next free rt block that we expect to see. */
+	xfs_rtblock_t		next_free_rtblock;
 };
 
 #ifdef CONFIG_XFS_ONLINE_REPAIR
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index 6009d458605e4..c3e1cee81b6d2 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -209,3 +209,68 @@ xchk_rtrmapbt(
 			XFS_DATA_FORK);
 	return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr);
 }
+
+/* xref check that the extent has no realtime reverse mapping at all */
+void
+xchk_xref_has_no_rt_owner(
+	struct xfs_scrub	*sc,
+	xfs_rgblock_t		bno,
+	xfs_extlen_t		len)
+{
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+		return;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is completely mapped */
+void
+xchk_xref_has_rt_owner(
+	struct xfs_scrub	*sc,
+	xfs_rgblock_t		bno,
+	xfs_extlen_t		len)
+{
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+		return;
+	if (outcome != XBTREE_RECPACKING_FULL)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is only owned by a given owner */
+void
+xchk_xref_is_only_rt_owned_by(
+	struct xfs_scrub		*sc,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo)
+{
+	struct xfs_rmap_matches		res;
+	int				error;
+
+	if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+		return;
+	if (res.matches != 1)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+	if (res.bad_non_owner_matches)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+	if (res.non_owner_matches)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 52bbc1fbcc560..38731b99625d6 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -321,8 +321,17 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
 #ifdef CONFIG_XFS_RT
 void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
 		xfs_extlen_t len);
+void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len);
+void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len);
+void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
 #else
 # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0)
 #endif
 
 #endif	/* __XFS_SCRUB_SCRUB_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/39] xfs: scan rt rmap when we're doing an intense rmap check of bmbt mappings
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 21:38   ` [PATCH 27/39] xfs: cross-reference the realtime rmapbt Darrick J. Wong
@ 2023-12-31 21:39   ` Darrick J. Wong
  2023-12-31 21:39   ` [PATCH 29/39] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
                     ` (10 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:39 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the bmbt scrubber how to perform a comprehensive check that the
rmapbt does not contain /any/ mappings that are not described by bmbt
records when it's dealing with a realtime file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap.c |   60 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 604c4df2fdb34..696ac7208c4d5 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -21,6 +21,8 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_health.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -637,12 +639,20 @@ xchk_bmap_check_rmap(
 	 */
 	check_rec = *rec;
 	while (have_map) {
+		xfs_fsblock_t	startblock;
+
 		if (irec.br_startoff != check_rec.rm_offset)
 			xchk_fblock_set_corrupt(sc, sbcri->whichfork,
 					check_rec.rm_offset);
-		if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
-				cur->bc_ag.pag->pag_agno,
-				check_rec.rm_startblock))
+		if (cur->bc_btnum == XFS_BTNUM_RMAP)
+			startblock = XFS_AGB_TO_FSB(sc->mp,
+					cur->bc_ag.pag->pag_agno,
+					check_rec.rm_startblock);
+		else
+			startblock = xfs_rgbno_to_rtb(sc->mp,
+					cur->bc_ino.rtg->rtg_rgno,
+					check_rec.rm_startblock);
+		if (irec.br_startblock != startblock)
 			xchk_fblock_set_corrupt(sc, sbcri->whichfork,
 					check_rec.rm_offset);
 		if (irec.br_blockcount > check_rec.rm_blockcount)
@@ -696,6 +706,30 @@ xchk_bmap_check_ag_rmaps(
 	return error;
 }
 
+/* Make sure each rt rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rt_rmaps(
+	struct xfs_scrub		*sc,
+	struct xfs_rtgroup		*rtg)
+{
+	struct xchk_bmap_check_rmap_info sbcri;
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_RMAP);
+	cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp, rtg, rtg->rtg_rmapip);
+
+	sbcri.sc = sc;
+	sbcri.whichfork = XFS_DATA_FORK;
+	error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
+	if (error == -ECANCELED)
+		error = 0;
+
+	xfs_btree_del_cursor(cur, error);
+	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+	return error;
+}
+
 /*
  * Decide if we want to scan the reverse mappings to determine if the attr
  * fork /really/ has zero space mappings.
@@ -750,10 +784,6 @@ xchk_bmap_check_empty_datafork(
 {
 	struct xfs_ifork	*ifp = &ip->i_df;
 
-	/* Don't support realtime rmap checks yet. */
-	if (XFS_IS_REALTIME_INODE(ip))
-		return false;
-
 	/*
 	 * If the dinode repair found a bad data fork, it will reset the fork
 	 * to extents format with zero records and wait for the this scrubber
@@ -805,6 +835,22 @@ xchk_bmap_check_rmaps(
 	xfs_agnumber_t		agno;
 	int			error;
 
+	if (xfs_ifork_is_realtime(sc->ip, whichfork)) {
+		struct xfs_rtgroup	*rtg;
+		xfs_rgnumber_t		rgno;
+
+		for_each_rtgroup(sc->mp, rgno, rtg) {
+			error = xchk_bmap_check_rt_rmaps(sc, rtg);
+			if (error ||
+			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+				xfs_rtgroup_rele(rtg);
+				return error;
+			}
+		}
+
+		return 0;
+	}
+
 	for_each_perag(sc->mp, agno, pag) {
 		error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
 		if (error ||


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/39] xfs: scrub the metadir path of rt rmap btree files
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-31 21:39   ` [PATCH 28/39] xfs: scan rt rmap when we're doing an intense rmap check of bmbt mappings Darrick J. Wong
@ 2023-12-31 21:39   ` Darrick J. Wong
  2023-12-31 21:39   ` [PATCH 30/39] xfs: walk the rt reverse mapping tree when rebuilding rmap Darrick J. Wong
                     ` (9 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:39 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata
directory tree path to the rmap btree file for each rt group.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h  |    3 ++-
 fs/xfs/scrub/metapath.c |   27 ++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index dcf048aae8c17..0bbdbfb0a8ae7 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -804,9 +804,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_METAPATH_USRQUOTA	2
 #define XFS_SCRUB_METAPATH_GRPQUOTA	3
 #define XFS_SCRUB_METAPATH_PRJQUOTA	4
+#define XFS_SCRUB_METAPATH_RTRMAPBT	5
 
 /* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR		5
+#define XFS_SCRUB_METAPATH_NR		6
 
 /*
  * ioctl limits
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 5a669a1a8ad17..6afd117c890e9 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -21,6 +21,8 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_attr.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -93,13 +95,25 @@ xchk_setup_metapath(
 	struct xchk_metapath	*mpath;
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*ip = NULL;
+	struct xfs_rtgroup	*rtg;
+	struct xfs_imeta_path	*path;
 	int			error;
 
 	if (!xfs_has_metadir(mp))
 		return -ENOENT;
-	if (sc->sm->sm_gen || sc->sm->sm_agno)
+	if (sc->sm->sm_gen)
 		return -EINVAL;
 
+	switch (sc->sm->sm_ino) {
+	case XFS_SCRUB_METAPATH_RTRMAPBT:
+		/* empty */
+		break;
+	default:
+		if (sc->sm->sm_agno)
+			return -EINVAL;
+		break;
+	}
+
 	mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
 	if (!mpath)
 		return -ENOMEM;
@@ -132,6 +146,17 @@ xchk_setup_metapath(
 		if (XFS_IS_PQUOTA_ON(mp))
 			ip = xfs_quota_inode(mp, XFS_DQTYPE_PROJ);
 		break;
+	case XFS_SCRUB_METAPATH_RTRMAPBT:
+		error = xfs_rtrmapbt_create_path(mp, sc->sm->sm_agno, &path);
+		if (error)
+			return error;
+		mpath->path = path;
+		rtg = xfs_rtgroup_get(mp, sc->sm->sm_agno);
+		if (rtg) {
+			ip = rtg->rtg_rmapip;
+			xfs_rtgroup_put(rtg);
+		}
+		break;
 	default:
 		return -EINVAL;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/39] xfs: walk the rt reverse mapping tree when rebuilding rmap
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-31 21:39   ` [PATCH 29/39] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
@ 2023-12-31 21:39   ` Darrick J. Wong
  2023-12-31 21:39   ` [PATCH 31/39] xfs: online repair of realtime file bmaps Darrick J. Wong
                     ` (8 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:39 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're rebuilding the data device rmap, if we encounter an "rmap"
format fork, we have to walk the (realtime) rmap btree inode to build
the appropriate mappings.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/rmap_repair.c |   36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)


diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index c6bb90fa43cca..7733334a1faa9 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -30,6 +30,8 @@
 #include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_ag.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtgroup.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -513,6 +515,38 @@ xrep_rmap_scan_iext(
 	return xrep_rmap_stash_accumulated(rf);
 }
 
+static int
+xrep_rmap_scan_rtrmapbt(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_inode	*ip)
+{
+	struct xfs_scrub	*sc = rf->rr->sc;
+	struct xfs_btree_cur	*cur;
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	if (rf->whichfork != XFS_DATA_FORK)
+		return -EFSCORRUPTED;
+
+	for_each_rtgroup(sc->mp, rgno, rtg) {
+		if (ip == rtg->rtg_rmapip) {
+			cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp, rtg, ip);
+			error = xrep_rmap_scan_iroot_btree(rf, cur);
+			xfs_btree_del_cursor(cur, error);
+			xfs_rtgroup_rele(rtg);
+			return error;
+		}
+	}
+
+	/*
+	 * We shouldn't find an rmap format inode that isn't associated with
+	 * an rtgroup!
+	 */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /* Find all the extents from a given AG in an inode fork. */
 STATIC int
 xrep_rmap_scan_ifork(
@@ -542,6 +576,8 @@ xrep_rmap_scan_ifork(
 		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
 		if (error || mappings_done)
 			return error;
+	} else if (ifp->if_format == XFS_DINODE_FMT_RMAP) {
+		return xrep_rmap_scan_rtrmapbt(&rf, ip);
 	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
 		return 0;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/39] xfs: online repair of realtime file bmaps
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-31 21:39   ` [PATCH 30/39] xfs: walk the rt reverse mapping tree when rebuilding rmap Darrick J. Wong
@ 2023-12-31 21:39   ` Darrick J. Wong
  2023-12-31 21:40   ` [PATCH 32/39] xfs: repair inodes that have realtime extents Darrick J. Wong
                     ` (7 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:39 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Repair the block mappings of realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap_repair.c |  127 +++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/scrub/common.c      |    2 -
 fs/xfs/scrub/common.h      |    3 +
 fs/xfs/scrub/repair.c      |   93 ++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h      |   11 ++++
 5 files changed, 231 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 9dddc5997b4fc..25c52caae58a4 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -25,11 +25,13 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
 #include "xfs_refcount.h"
 #include "xfs_quota.h"
 #include "xfs_ialloc.h"
 #include "xfs_ag.h"
 #include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -361,6 +363,116 @@ xrep_bmap_scan_ag(
 	return error;
 }
 
+#ifdef CONFIG_XFS_RT
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_rtfork_rmap(
+	struct xfs_scrub		*sc,
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec)
+{
+	xfs_rtblock_t			rtbno;
+
+	/* xattr extents are never stored on realtime devices */
+	if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+		return -EFSCORRUPTED;
+
+	/* bmbt blocks are never stored on realtime devices */
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+		return -EFSCORRUPTED;
+
+	/* Data extents for non-rt files are never stored on the rt device. */
+	if (!XFS_IS_REALTIME_INODE(sc->ip))
+		return -EFSCORRUPTED;
+
+	/* Check the file offsets and physical extents. */
+	if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+		return -EFSCORRUPTED;
+
+	/* Check that this is within the rtgroup. */
+	if (!xfs_verify_rgbext(cur->bc_ino.rtg, rec->rm_startblock,
+				rec->rm_blockcount))
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space. */
+	rtbno = xfs_rgbno_to_rtb(sc->mp, cur->bc_ino.rtg->rtg_rgno,
+			rec->rm_startblock);
+	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount);
+}
+
+/* Record realtime extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rtrmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_bmap		*rb = priv;
+	xfs_rtblock_t			rtbno;
+	int				error = 0;
+
+	if (xchk_should_terminate(rb->sc, &error))
+		return error;
+
+	/* Skip extents which are not owned by this inode and fork. */
+	if (rec->rm_owner != rb->sc->ip->i_ino)
+		return 0;
+
+	error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec);
+	if (error)
+		return error;
+
+	/*
+	 * Record all blocks allocated to this file even if the extent isn't
+	 * for the fork we're rebuilding so that we can reset di_nblocks later.
+	 */
+	rb->nblocks += rec->rm_blockcount;
+
+	/* If this rmap isn't for the fork we want, we're done. */
+	if (rb->whichfork == XFS_DATA_FORK &&
+	    (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+	if (rb->whichfork == XFS_ATTR_FORK &&
+	    !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+
+	rtbno = xfs_rgbno_to_rtb(cur->bc_mp, cur->bc_ino.rtg->rtg_rgno,
+			rec->rm_startblock);
+	return xrep_bmap_from_rmap(rb, rec->rm_offset, rtbno,
+			rec->rm_blockcount,
+			rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/* Scan the realtime reverse mappings to build the new extent map. */
+STATIC int
+xrep_bmap_scan_rtgroup(
+	struct xrep_bmap	*rb,
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_scrub	*sc = rb->sc;
+	int			error;
+
+	if (xrep_is_rtmeta_ino(sc, rtg, sc->ip->i_ino))
+		return 0;
+
+	error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+			XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED);
+	if (error)
+		return error;
+
+	error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
+	xchk_rtgroup_btcur_free(&sc->sr);
+	xchk_rtgroup_free(sc, &sc->sr);
+	return error;
+}
+#else
+static inline int
+xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg)
+{
+	return -EFSCORRUPTED;
+}
+#endif
+
 /* Find the delalloc extents from the old incore extent tree. */
 STATIC int
 xrep_bmap_find_delalloc(
@@ -410,9 +522,20 @@ xrep_bmap_find_mappings(
 {
 	struct xfs_scrub	*sc = rb->sc;
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	int			error = 0;
 
+	/* Iterate the rtrmaps for extents. */
+	for_each_rtgroup(sc->mp, rgno, rtg) {
+		error = xrep_bmap_scan_rtgroup(rb, rtg);
+		if (error) {
+			xfs_rtgroup_rele(rtg);
+			return error;
+		}
+	}
+
 	/* Iterate the rmaps for extents. */
 	for_each_perag(sc->mp, agno, pag) {
 		error = xrep_bmap_scan_ag(rb, pag);
@@ -751,10 +874,6 @@ xrep_bmap_check_inputs(
 		return -EINVAL;
 	}
 
-	/* Don't know how to rebuild realtime data forks. */
-	if (XFS_IS_REALTIME_INODE(sc->ip))
-		return -EOPNOTSUPP;
-
 	return 0;
 }
 
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 5ea9f3f310335..e16185e9ddd9b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -779,7 +779,7 @@ xchk_rt_unlock_rtbitmap(
 
 #ifdef CONFIG_XFS_RT
 /* Lock all the rt group metadata inode ILOCKs and wait for intents. */
-static int
+int
 xchk_rtgroup_drain_and_lock(
 	struct xfs_scrub	*sc,
 	struct xchk_rt		*sr,
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 3251f80c00e15..c0b2c62d4bd82 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -159,10 +159,13 @@ int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
 void xchk_rtgroup_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
 void xchk_rtgroup_btcur_free(struct xchk_rt *sr);
 void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rtgroup_drain_and_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+		unsigned int rtglock_flags);
 #else
 # define xchk_rtgroup_init(sc, rgno, sr, lockflags)	(-ENOSYS)
 # define xchk_rtgroup_btcur_free(sr)			((void)0)
 # define xchk_rtgroup_free(sc, sr)			((void)0)
+# define xchk_rtgroup_drain_and_lock(sc, sr, lockflags)	(-ENOSYS)
 #endif /* CONFIG_XFS_RT */
 
 int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index cad45d5b86e2e..5af1a8871de55 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -36,6 +36,9 @@
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
 #include "xfs_dir2.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -954,6 +957,73 @@ xrep_ag_init(
 	return 0;
 }
 
+#ifdef CONFIG_XFS_RT
+/* Initialize all the btree cursors for a RT repair. */
+static void
+xrep_rtgroup_btcur_init(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr)
+{
+	struct xfs_mount	*mp = sc->mp;
+
+	ASSERT(sr->rtg != NULL);
+
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT &&
+	    (sr->rtlock_flags & XFS_RTGLOCK_RMAP) &&
+	    xfs_has_rtrmapbt(mp))
+		sr->rmap_cur = xfs_rtrmapbt_init_cursor(mp, sc->tp, sr->rtg,
+				sr->rtg->rtg_rmapip);
+}
+
+/*
+ * Given a reference to a rtgroup structure, lock rtgroup btree inodes and
+ * create btree cursors.  Must only be called to repair a regular rt file.
+ */
+int
+xrep_rtgroup_init(
+	struct xfs_scrub	*sc,
+	struct xfs_rtgroup	*rtg,
+	struct xchk_rt		*sr,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(sr->rtg == NULL);
+
+	xfs_rtgroup_lock(NULL, rtg, rtglock_flags);
+	sr->rtlock_flags = rtglock_flags;
+
+	/* Grab our own passive reference from the caller's ref. */
+	sr->rtg = xfs_rtgroup_hold(rtg);
+	xrep_rtgroup_btcur_init(sc, sr);
+	return 0;
+}
+
+/* Ensure that all rt blocks in the given range are not marked free. */
+int
+xrep_require_rtext_inuse(
+	struct xfs_scrub	*sc,
+	xfs_rtblock_t		rtbno,
+	xfs_filblks_t		len)
+{
+	struct xfs_mount	*mp = sc->mp;
+	xfs_rtxnum_t		startrtx;
+	xfs_rtxnum_t		endrtx;
+	bool			is_free = false;
+	int			error;
+
+	startrtx = xfs_rtb_to_rtx(mp, rtbno);
+	endrtx = xfs_rtb_to_rtx(mp, rtbno + len - 1);
+
+	error = xfs_rtalloc_extent_is_free(mp, sc->tp, startrtx,
+			endrtx - startrtx + 1, &is_free);
+	if (error)
+		return error;
+	if (is_free)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
 /* Reinitialize the per-AG block reservation for the AG we just fixed. */
 int
 xrep_reset_perag_resv(
@@ -1209,3 +1279,26 @@ xrep_buf_verify_struct(
 
 	return fa == NULL;
 }
+
+/* Are we looking at a realtime metadata inode? */
+bool
+xrep_is_rtmeta_ino(
+	struct xfs_scrub	*sc,
+	struct xfs_rtgroup	*rtg,
+	xfs_ino_t		ino)
+{
+	/*
+	 * All filesystems have rt bitmap and summary inodes, even if they
+	 * don't have an rt section.
+	 */
+	if (ino == sc->mp->m_rbmip->i_ino)
+		return true;
+	if (ino == sc->mp->m_rsumip->i_ino)
+		return true;
+
+	/* Newer rt metadata files are not guaranteed to exist */
+	if (rtg->rtg_rmapip && ino == rtg->rtg_rmapip->i_ino)
+		return true;
+
+	return false;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 3d8ca12b2cc51..e7fe4c5d8de5d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -106,6 +106,17 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
 void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
 int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
 		struct xchk_ag *sa);
+#ifdef CONFIG_XFS_RT
+int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
+		struct xchk_rt *sr, unsigned int rtglock_flags);
+int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
+		xfs_filblks_t len);
+#else
+# define xrep_rtgroup_init(sc, rtg, sr, lockflags)	(-ENOSYS)
+#endif /* CONFIG_XFS_RT */
+
+bool xrep_is_rtmeta_ino(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
+		xfs_ino_t ino);
 
 /* Metadata revalidators */
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/39] xfs: repair inodes that have realtime extents
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-31 21:39   ` [PATCH 31/39] xfs: online repair of realtime file bmaps Darrick J. Wong
@ 2023-12-31 21:40   ` Darrick J. Wong
  2023-12-31 21:40   ` [PATCH 33/39] xfs: repair rmap btree inodes Darrick J. Wong
                     ` (6 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:40 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb into the inode core repair code the ability to search for extents
on realtime devices.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/inode_repair.c |   63 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 2b3a6cbadae71..45ad78d1e5404 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -38,6 +38,8 @@
 #include "xfs_log_priv.h"
 #include "xfs_health.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -718,18 +720,77 @@ xrep_dinode_count_ag_rmaps(
 	return error;
 }
 
+/* Count extents and blocks for an inode given an rt rmap. */
+STATIC int
+xrep_dinode_walk_rtrmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_inode		*ri = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(ri->sc, &error))
+		return error;
+
+	/* We only care about this inode. */
+	if (rec->rm_owner != ri->sc->sm->sm_ino)
+		return 0;
+
+	if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))
+		return -EFSCORRUPTED;
+
+	ri->rt_blocks += rec->rm_blockcount;
+	ri->rt_extents++;
+	return 0;
+}
+
+/* Count extents and blocks for an inode from all realtime rmap data. */
+STATIC int
+xrep_dinode_count_rtgroup_rmaps(
+	struct xrep_inode	*ri,
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	int			error;
+
+	if (!xfs_has_realtime(sc->mp) ||
+	    xrep_is_rtmeta_ino(sc, rtg, sc->sm->sm_ino))
+		return 0;
+
+	error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP);
+	if (error)
+		return error;
+
+	error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
+			ri);
+	xchk_rtgroup_btcur_free(&sc->sr);
+	xchk_rtgroup_free(sc, &sc->sr);
+	return error;
+}
+
 /* Count extents and blocks for a given inode from all rmap data. */
 STATIC int
 xrep_dinode_count_rmaps(
 	struct xrep_inode	*ri)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	int			error;
 
-	if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+	if (!xfs_has_rmapbt(ri->sc->mp))
 		return -EOPNOTSUPP;
 
+	for_each_rtgroup(ri->sc->mp, rgno, rtg) {
+		error = xrep_dinode_count_rtgroup_rmaps(ri, rtg);
+		if (error) {
+			xfs_rtgroup_rele(rtg);
+			return error;
+		}
+	}
+
 	for_each_perag(ri->sc->mp, agno, pag) {
 		error = xrep_dinode_count_ag_rmaps(ri, pag);
 		if (error) {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/39] xfs: repair rmap btree inodes
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-31 21:40   ` [PATCH 32/39] xfs: repair inodes that have realtime extents Darrick J. Wong
@ 2023-12-31 21:40   ` Darrick J. Wong
  2023-12-31 21:40   ` [PATCH 34/39] xfs: online repair of realtime bitmaps for a realtime group Darrick J. Wong
                     ` (5 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:40 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the inode repair code how to deal with realtime rmap btree inodes
that won't load properly.  This is most likely moot since the filesystem
generally won't mount without the rtrmapbt inodes being usable, but
we'll add this for completeness.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/inode_repair.c |   46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)


diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 45ad78d1e5404..f4f6ed6ef5120 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -895,6 +895,37 @@ xrep_dinode_bad_bmbt_fork(
 	return false;
 }
 
+/* Return true if this rmap-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rmapbt_fork(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip,
+	unsigned int		dfork_size,
+	int			whichfork)
+{
+	struct xfs_rtrmap_root	*dfp;
+	unsigned int		nrecs;
+	unsigned int		level;
+
+	if (whichfork != XFS_DATA_FORK)
+		return true;
+	if (dfork_size < sizeof(struct xfs_rtrmap_root))
+		return true;
+
+	dfp = XFS_DFORK_PTR(dip, whichfork);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > sc->mp->m_rtrmap_maxlevels)
+		return true;
+	if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size)
+		return true;
+	if (level > 0 && nrecs == 0)
+		return true;
+
+	return false;
+}
+
 /*
  * Check the data fork for things that will fail the ifork verifiers or the
  * ifork formatters.
@@ -975,6 +1006,11 @@ xrep_dinode_check_dfork(
 				XFS_DATA_FORK))
 			return true;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (xrep_dinode_bad_rmapbt_fork(sc, dip, dfork_size,
+				XFS_DATA_FORK))
+			return true;
+		break;
 	default:
 		return true;
 	}
@@ -1095,6 +1131,11 @@ xrep_dinode_check_afork(
 					XFS_ATTR_FORK))
 			return true;
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		if (xrep_dinode_bad_rmapbt_fork(sc, dip, afork_size,
+				XFS_ATTR_FORK))
+			return true;
+		break;
 	default:
 		return true;
 	}
@@ -1142,6 +1183,7 @@ xrep_dinode_ensure_forkoff(
 	uint16_t		mode)
 {
 	struct xfs_bmdr_block	*bmdr;
+	struct xfs_rtrmap_root	*rmdr;
 	struct xfs_scrub	*sc = ri->sc;
 	xfs_extnum_t		attr_extents, data_extents;
 	size_t			bmdr_minsz = xfs_bmdr_space_calc(1);
@@ -1248,6 +1290,10 @@ xrep_dinode_ensure_forkoff(
 		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
 		dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
 		break;
+	case XFS_DINODE_FMT_RMAP:
+		rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+		dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
+		break;
 	default:
 		dfork_min = 0;
 		break;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/39] xfs: online repair of realtime bitmaps for a realtime group
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-31 21:40   ` [PATCH 33/39] xfs: repair rmap btree inodes Darrick J. Wong
@ 2023-12-31 21:40   ` Darrick J. Wong
  2023-12-31 21:40   ` [PATCH 35/39] xfs: support repairing metadata btrees rooted in metadir inodes Darrick J. Wong
                     ` (4 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:40 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

For a given rt group, regenerate the bitmap contents from the group's
realtime rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/common.h           |    5 
 fs/xfs/scrub/repair.c           |    2 
 fs/xfs/scrub/repair.h           |    4 
 fs/xfs/scrub/rtbitmap.c         |   19 +
 fs/xfs/scrub/rtbitmap.h         |   56 +++
 fs/xfs/scrub/rtbitmap_repair.c  |  676 +++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtsummary_repair.c |    3 
 fs/xfs/scrub/scrub.c            |    2 
 fs/xfs/scrub/tempfile.c         |   15 +
 fs/xfs/scrub/tempswap.h         |    2 
 fs/xfs/scrub/trace.c            |    1 
 fs/xfs/scrub/trace.h            |  149 +++++++++
 12 files changed, 922 insertions(+), 12 deletions(-)


diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index c0b2c62d4bd82..5dc481f69d160 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -274,6 +274,11 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
 			(sc)->mp->m_super->s_id, \
 			(sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
 			##__VA_ARGS__)
+#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \
+	kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \
+			(sc)->mp->m_super->s_id, \
+			(sc)->sa.pag ? (sc)->sr.rtg->rtg_rgno : (sc)->sm->sm_agno, \
+			##__VA_ARGS__)
 
 /*
  * Setting up a hook to wait for intents to drain is costly -- we have to take
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 5af1a8871de55..4789014be4a36 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -959,7 +959,7 @@ xrep_ag_init(
 
 #ifdef CONFIG_XFS_RT
 /* Initialize all the btree cursors for a RT repair. */
-static void
+void
 xrep_rtgroup_btcur_init(
 	struct xfs_scrub	*sc,
 	struct xchk_rt		*sr)
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index e7fe4c5d8de5d..b66e0b5331394 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -109,6 +109,7 @@ int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
 #ifdef CONFIG_XFS_RT
 int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
 		struct xchk_rt *sr, unsigned int rtglock_flags);
+void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr);
 int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
 		xfs_filblks_t len);
 #else
@@ -151,10 +152,12 @@ int xrep_metapath(struct xfs_scrub *sc);
 int xrep_rtbitmap(struct xfs_scrub *sc);
 int xrep_rtsummary(struct xfs_scrub *sc);
 int xrep_rgsuperblock(struct xfs_scrub *sc);
+int xrep_rgbitmap(struct xfs_scrub *sc);
 #else
 # define xrep_rtbitmap			xrep_notsupported
 # define xrep_rtsummary			xrep_notsupported
 # define xrep_rgsuperblock		xrep_notsupported
+# define xrep_rgbitmap			xrep_notsupported
 #endif /* CONFIG_XFS_RT */
 
 #ifdef CONFIG_XFS_QUOTA
@@ -260,6 +263,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
 #define xrep_dirtree			xrep_notsupported
 #define xrep_metapath			xrep_notsupported
 #define xrep_rgsuperblock		xrep_notsupported
+#define xrep_rgbitmap			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 0d823eadbdba0..47463ef336eed 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -19,9 +19,11 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_swapext.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
+#include "scrub/tempswap.h"
 #include "scrub/rtbitmap.h"
 #include "scrub/btree.h"
 
@@ -45,15 +47,26 @@ xchk_setup_rgbitmap(
 {
 	struct xfs_mount	*mp = sc->mp;
 	struct xchk_rgbitmap	*rgb;
+	unsigned int		wordcnt = xchk_rgbitmap_wordcnt(sc);
 	int			error;
 
-	rgb = kzalloc(sizeof(struct xchk_rgbitmap), XCHK_GFP_FLAGS);
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	rgb = kzalloc(struct_size(rgb, words, wordcnt), XCHK_GFP_FLAGS);
 	if (!rgb)
 		return -ENOMEM;
 	rgb->sc = sc;
 	sc->buf = rgb;
+	rgb->rtglock_flags = XCHK_RTGLOCK_ALL;
 
-	error = xchk_trans_alloc(sc, 0);
+	if (xchk_could_repair(sc)) {
+		error = xrep_setup_rgbitmap(sc, rgb);
+		if (error)
+			return error;
+	}
+
+	error = xchk_trans_alloc(sc, rgb->rtb.resblks);
 	if (error)
 		return error;
 
@@ -66,7 +79,7 @@ xchk_setup_rgbitmap(
 		return error;
 
 	error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr,
-			XCHK_RTGLOCK_ALL);
+			rgb->rtglock_flags);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index 42d50fa48a0ec..6ff897f0871f8 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -13,19 +13,75 @@ struct xchk_rtbitmap {
 	unsigned int		resblks;
 };
 
+/*
+ * We use an xfile to construct new bitmap blocks for the portion of the
+ * rtbitmap file that we're replacing.  Whereas the ondisk bitmap must be
+ * accessed through the buffer cache, the xfile bitmap supports direct
+ * word-level accesses.  Therefore, we create a small abstraction for linear
+ * access.
+ */
+typedef unsigned long long xrep_wordoff_t;
+typedef unsigned int xrep_wordcnt_t;
+
+/* Mask to round an rtx down to the nearest bitmap word. */
+#define XREP_RTBMP_WORDMASK	((1ULL << XFS_NBWORDLOG) - 1)
+
 struct xchk_rgbitmap {
 	struct xfs_scrub	*sc;
 
 	struct xchk_rtbitmap	rtb;
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	struct xfs_rtalloc_args	args;
+	struct xrep_tempswap	tempswap;
+#endif
+
 	/* The next free rt block that we expect to see. */
 	xfs_rtblock_t		next_free_rtblock;
+
+	/* file offset inside the rtbitmap where we start swapping */
+	xfs_fileoff_t		group_rbmoff;
+
+	/* number of rtbitmap blocks for this group */
+	xfs_filblks_t		group_rbmlen;
+
+	/* The next rtgroup block we expect to see during our rtrmapbt walk. */
+	xfs_rgblock_t		next_rgbno;
+
+	/* rtgroup lock flags */
+	unsigned int		rtglock_flags;
+
+	/* rtword position of xfile as we write buffers to disk. */
+	xrep_wordoff_t		prep_wordoff;
+
+	/* Memory buffer full of 1s for rgbitmap repair. */
+	union xfs_rtword_raw	words[];
 };
 
 #ifdef CONFIG_XFS_ONLINE_REPAIR
 int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+int xrep_setup_rgbitmap(struct xfs_scrub *sc, struct xchk_rgbitmap *rgb);
+
+/*
+ * How big should the words[] buffer be?
+ *
+ * For repairs, we want a full fsblock worth of space so that we can memcpy a
+ * buffer full of 1s into the xfile bitmap.  The xfile bitmap doesn't have
+ * rtbitmap block headers, so we don't use blockwsize.  Scrub doesn't use the
+ * words buffer at all.
+ */
+static inline unsigned int
+xchk_rgbitmap_wordcnt(
+	struct xfs_scrub	*sc)
+{
+	if (xchk_could_repair(sc))
+		return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+	return 0;
+}
 #else
 # define xrep_setup_rtbitmap(sc, rtb)	(0)
+# define xrep_setup_rgbitmap(sc, rgb)	(0)
+# define xchk_rgbitmap_wordcnt(sc)	(0)
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
 #endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 46f5d5f605c91..db87ce51c35fc 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -12,17 +12,693 @@
 #include "xfs_btree.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
+#include "xfs_rtalloc.h"
 #include "xfs_inode.h"
 #include "xfs_bit.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_swapext.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/repair.h"
 #include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempswap.h"
+#include "scrub/reap.h"
 #include "scrub/rtbitmap.h"
 
+/* rt bitmap content repairs */
+
+/* Set up to repair the realtime bitmap for this group. */
+int
+xrep_setup_rgbitmap(
+	struct xfs_scrub	*sc,
+	struct xchk_rgbitmap	*rgb)
+{
+	struct xfs_mount	*mp = sc->mp;
+	char			*descr;
+	unsigned long long	blocks = 0;
+	unsigned long long	rtbmp_words;
+	int			error;
+
+	error = xrep_tempfile_create(sc, S_IFREG);
+	if (error)
+		return error;
+
+	/* Create an xfile to hold our reconstructed bitmap. */
+	rtbmp_words = xfs_rtbitmap_wordcount(mp, mp->m_sb.sb_rextents);
+	descr = xchk_xfile_rtgroup_descr(sc, "bitmap file");
+	error = xfile_create(descr, rtbmp_words << XFS_WORDLOG, &sc->xfile);
+	kfree(descr);
+	if (error)
+		return error;
+
+	/*
+	 * Reserve enough blocks to write out a completely new bitmap file,
+	 * plus twice as many blocks as we would need if we can only allocate
+	 * one block per data fork mapping.  This should cover the
+	 * preallocation of the temporary file and swapping the extent
+	 * mappings.
+	 *
+	 * We cannot use xfs_swapext_estimate because we have not yet
+	 * constructed the replacement bitmap and therefore do not know how
+	 * many extents it will use.  By the time we do, we will have a dirty
+	 * transaction (which we cannot drop because we cannot drop the
+	 * rtbitmap ILOCK) and cannot ask for more reservation.
+	 */
+	blocks = mp->m_sb.sb_rbmblocks;
+	blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+	if (blocks > UINT_MAX)
+		return -EOPNOTSUPP;
+
+	rgb->rtb.resblks += blocks;
+
+	/*
+	 * Grab support for atomic extent swapping before we allocate any
+	 * transactions or grab ILOCKs.
+	 */
+	error = xrep_tempswap_grab_log_assist(sc);
+	if (error)
+		return error;
+
+	/*
+	 * We must hold rbmip with ILOCK_EXCL to use the extent swap at the end
+	 * of the repair function.  Change the desired rtglock flags.
+	 */
+	rgb->rtglock_flags &= ~XFS_RTGLOCK_BITMAP_SHARED;
+	rgb->rtglock_flags |= XFS_RTGLOCK_BITMAP;
+	return 0;
+}
+
+static inline xrep_wordoff_t
+rtx_to_wordoff(
+	struct xfs_mount	*mp,
+	xfs_rtxnum_t		rtx)
+{
+	return rtx >> XFS_NBWORDLOG;
+}
+
+static inline xrep_wordcnt_t
+rtxlen_to_wordcnt(
+	xfs_rtxlen_t	rtxlen)
+{
+	return rtxlen >> XFS_NBWORDLOG;
+}
+
+/* Helper functions to record rtwords in an xfile. */
+
+static inline int
+xfbmp_load(
+	struct xchk_rgbitmap	*rgb,
+	xrep_wordoff_t		wordoff,
+	xfs_rtword_t		*word)
+{
+	union xfs_rtword_raw	urk;
+	int			error;
+
+	ASSERT(xfs_has_rtgroups(rgb->sc->mp));
+
+	error = xfile_obj_load(rgb->sc->xfile, &urk,
+			sizeof(union xfs_rtword_raw),
+			wordoff << XFS_WORDLOG);
+	if (error)
+		return error;
+
+	*word = le32_to_cpu(urk.rtg);
+	return 0;
+}
+
+static inline int
+xfbmp_store(
+	struct xchk_rgbitmap	*rgb,
+	xrep_wordoff_t		wordoff,
+	const xfs_rtword_t	word)
+{
+	union xfs_rtword_raw	urk;
+
+	ASSERT(xfs_has_rtgroups(rgb->sc->mp));
+
+	urk.rtg = cpu_to_le32(word);
+	return xfile_obj_store(rgb->sc->xfile, &urk,
+			sizeof(union xfs_rtword_raw),
+			wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyin(
+	struct xchk_rgbitmap	*rgb,
+	xrep_wordoff_t		wordoff,
+	const union xfs_rtword_raw	*word,
+	xrep_wordcnt_t		nr_words)
+{
+	return xfile_obj_store(rgb->sc->xfile, word, nr_words << XFS_WORDLOG,
+			wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyout(
+	struct xchk_rgbitmap	*rgb,
+	xrep_wordoff_t		wordoff,
+	union xfs_rtword_raw	*word,
+	xrep_wordcnt_t		nr_words)
+{
+	return xfile_obj_load(rgb->sc->xfile, word, nr_words << XFS_WORDLOG,
+			wordoff << XFS_WORDLOG);
+}
+
+/*
+ * Preserve the portions of the rtbitmap block for the start of this rtgroup
+ * that map to the previous rtgroup.
+ */
+STATIC int
+xrep_rgbitmap_load_before(
+	struct xchk_rgbitmap	*rgb)
+{
+	struct xfs_scrub	*sc = rgb->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
+	xrep_wordoff_t		wordoff;
+	xfs_rtblock_t		group_rtbno;
+	xfs_rtxnum_t		group_rtx, rbmoff_rtx;
+	xfs_rtword_t		ondisk_word;
+	xfs_rtword_t		xfile_word;
+	xfs_rtword_t		mask;
+	xrep_wordcnt_t		wordcnt;
+	int			bit;
+	int			error;
+
+	/*
+	 * Compute the file offset within the rtbitmap block that corresponds
+	 * to the start of this group, and decide if we need to read blocks
+	 * from the group before this one.
+	 */
+	group_rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, 0);
+	group_rtx = xfs_rtb_to_rtx(mp, group_rtbno);
+
+	rgb->group_rbmoff = xfs_rtx_to_rbmblock(mp, group_rtx);
+	rbmoff_rtx = xfs_rbmblock_to_rtx(mp, rgb->group_rbmoff);
+	rgb->prep_wordoff = rtx_to_wordoff(mp, rbmoff_rtx);
+
+	trace_xrep_rgbitmap_load(rtg, rgb->group_rbmoff, rbmoff_rtx,
+			group_rtx - 1);
+
+	if (rbmoff_rtx == group_rtx)
+		return 0;
+
+	rgb->args.mp = sc->mp;
+	rgb->args.tp = sc->tp;
+	error = xfs_rtbitmap_read_buf(&rgb->args, rgb->group_rbmoff);
+	if (error) {
+		/*
+		 * Reading the existing rbmblock failed, and we must deal with
+		 * the part of the rtbitmap block that corresponds to the
+		 * previous group.  The most conservative option is to fill
+		 * that part of the bitmap with zeroes so that it won't get
+		 * allocated.  The xfile contains zeroes already, so we can
+		 * return.
+		 */
+		return 0;
+	}
+
+	/*
+	 * Copy full rtbitmap words into memory from the beginning of the
+	 * ondisk block until we get to the word that corresponds to the start
+	 * of this group.
+	 */
+	wordoff = rtx_to_wordoff(mp, rbmoff_rtx);
+	wordcnt = rtxlen_to_wordcnt(group_rtx - rbmoff_rtx);
+	if (wordcnt > 0) {
+		union xfs_rtword_raw	*p;
+
+		p = xfs_rbmblock_wordptr(&rgb->args, 0);
+		error = xfbmp_copyin(rgb, wordoff, p, wordcnt);
+		if (error)
+			goto out_rele;
+
+		trace_xrep_rgbitmap_load_words(mp, rgb->group_rbmoff, wordoff,
+				wordcnt);
+		wordoff += wordcnt;
+	}
+
+	/*
+	 * Compute the bit position of the first rtextent of this group.  If
+	 * the bit position is zero, we don't have to RMW a partial word and
+	 * move to the next step.
+	 */
+	bit = group_rtx & XREP_RTBMP_WORDMASK;
+	if (bit == 0)
+		goto out_rele;
+
+	/*
+	 * Create a mask of the bits that we want to load from disk.  These
+	 * bits track space in a different rtgroup, which is why we must
+	 * preserve them even as we replace parts of the bitmap.
+	 */
+	mask = ~((((xfs_rtword_t)1 << (XFS_NBWORD - bit)) - 1) << bit);
+
+	error = xfbmp_load(rgb, wordoff, &xfile_word);
+	if (error)
+		goto out_rele;
+	ondisk_word = xfs_rtbitmap_getword(&rgb->args, wordcnt);
+
+	trace_xrep_rgbitmap_load_word(mp, wordoff, bit, ondisk_word,
+			xfile_word, mask);
+
+	xfile_word &= ~mask;
+	xfile_word |= (ondisk_word & mask);
+
+	error = xfbmp_store(rgb, wordoff, xfile_word);
+	if (error)
+		goto out_rele;
+
+out_rele:
+	xfs_rtbuf_cache_relse(&rgb->args);
+	return error;
+}
+
+/*
+ * Preserve the portions of the rtbitmap block for the end of this rtgroup
+ * that map to the next rtgroup.
+ */
+STATIC int
+xrep_rgbitmap_load_after(
+	struct xchk_rgbitmap	*rgb)
+{
+	struct xfs_scrub	*sc = rgb->sc;
+	struct xfs_mount	*mp = rgb->sc->mp;
+	struct xfs_rtgroup	*rtg = rgb->sc->sr.rtg;
+	xrep_wordoff_t		wordoff;
+	xfs_rtblock_t		last_rtbno;
+	xfs_rtxnum_t		last_group_rtx, last_rbmblock_rtx;
+	xfs_fileoff_t		last_group_rbmoff;
+	xfs_rtword_t		ondisk_word;
+	xfs_rtword_t		xfile_word;
+	xfs_rtword_t		mask;
+	xrep_wordcnt_t		wordcnt;
+	unsigned int		last_group_word;
+	int			bit;
+	int			error;
+
+	last_rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno,
+					rtg->rtg_blockcount - 1);
+	last_group_rtx = xfs_rtb_to_rtx(mp, last_rtbno);
+
+	last_group_rbmoff = xfs_rtx_to_rbmblock(mp, last_group_rtx);
+	rgb->group_rbmlen = last_group_rbmoff - rgb->group_rbmoff + 1;
+	last_rbmblock_rtx = xfs_rbmblock_to_rtx(mp, last_group_rbmoff + 1) - 1;
+
+	trace_xrep_rgbitmap_load(rtg, last_group_rbmoff, last_group_rtx + 1,
+			last_rbmblock_rtx);
+
+	if (last_rbmblock_rtx == last_group_rtx ||
+	    rtg->rtg_rgno == mp->m_sb.sb_rgcount - 1)
+		return 0;
+
+	rgb->args.mp = sc->mp;
+	rgb->args.tp = sc->tp;
+	error = xfs_rtbitmap_read_buf(&rgb->args, last_group_rbmoff);
+	if (error) {
+		/*
+		 * Reading the existing rbmblock failed, and we must deal with
+		 * the part of the rtbitmap block that corresponds to the
+		 * previous group.  The most conservative option is to fill
+		 * that part of the bitmap with zeroes so that it won't get
+		 * allocated.  The xfile contains zeroes already, so we can
+		 * return.
+		 */
+		return 0;
+	}
+
+	/*
+	 * Compute the bit position of the first rtextent of the next group.
+	 * If the bit position is zero, we don't have to RMW a partial word
+	 * and move to the next step.
+	 */
+	wordoff = rtx_to_wordoff(mp, last_group_rtx);
+	bit = (last_group_rtx + 1) & XREP_RTBMP_WORDMASK;
+	if (bit == 0)
+		goto copy_words;
+
+	/*
+	 * Create a mask of the bits that we want to load from disk.  These
+	 * bits track space in a different rtgroup, which is why we must
+	 * preserve them even as we replace parts of the bitmap.
+	 */
+	mask = (((xfs_rtword_t)1 << (XFS_NBWORD - bit)) - 1) << bit;
+
+	error = xfbmp_load(rgb, wordoff, &xfile_word);
+	if (error)
+		goto out_rele;
+	last_group_word = xfs_rtx_to_rbmword(mp, last_group_rtx);
+	ondisk_word = xfs_rtbitmap_getword(&rgb->args, last_group_word);
+
+	trace_xrep_rgbitmap_load_word(mp, wordoff, bit, ondisk_word,
+			xfile_word, mask);
+
+	xfile_word &= ~mask;
+	xfile_word |= (ondisk_word & mask);
+
+	error = xfbmp_store(rgb, wordoff, xfile_word);
+	if (error)
+		goto out_rele;
+
+copy_words:
+	/* Copy as many full words as we can. */
+	wordoff++;
+	wordcnt = rtxlen_to_wordcnt(last_rbmblock_rtx - last_group_rtx);
+	if (wordcnt > 0) {
+		union xfs_rtword_raw	*p;
+
+		p = xfs_rbmblock_wordptr(&rgb->args,
+				mp->m_blockwsize - wordcnt);
+		error = xfbmp_copyin(rgb, wordoff, p, wordcnt);
+		if (error)
+			goto out_rele;
+
+		trace_xrep_rgbitmap_load_words(mp, last_group_rbmoff, wordoff,
+				wordcnt);
+	}
+
+out_rele:
+	xfs_rtbuf_cache_relse(&rgb->args);
+	return error;
+}
+
+/* Perform a logical OR operation on an rtword in the incore bitmap. */
+static int
+xrep_rgbitmap_or(
+	struct xchk_rgbitmap	*rgb,
+	xrep_wordoff_t		wordoff,
+	xfs_rtword_t		mask)
+{
+	xfs_rtword_t		word;
+	int			error;
+
+	error = xfbmp_load(rgb, wordoff, &word);
+	if (error)
+		return error;
+
+	trace_xrep_rgbitmap_or(rgb->sc->mp, wordoff, mask, word);
+
+	return xfbmp_store(rgb, wordoff, word | mask);
+}
+
+/*
+ * Mark as free every rt extent between the next rt block we expected to see
+ * in the rtrmap records and the given rt block.
+ */
+STATIC int
+xrep_rgbitmap_mark_free(
+	struct xchk_rgbitmap	*rgb,
+	xfs_rgblock_t		rgbno)
+{
+	struct xfs_mount	*mp = rgb->sc->mp;
+	struct xfs_rtgroup	*rtg = rgb->sc->sr.rtg;
+	xfs_rtblock_t		rtbno;
+	xfs_rtxnum_t		startrtx;
+	xfs_rtxnum_t		nextrtx;
+	xrep_wordoff_t		wordoff, nextwordoff;
+	unsigned int		bit;
+	unsigned int		bufwsize;
+	xfs_extlen_t		mod;
+	xfs_rtword_t		mask;
+	int			error;
+
+	if (!xfs_verify_rgbext(rtg, rgb->next_rgbno, rgbno - rgb->next_rgbno))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Convert rt blocks to rt extents  The block range we find must be
+	 * aligned to an rtextent boundary on both ends.
+	 */
+	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, rgb->next_rgbno);
+	startrtx = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+	if (mod)
+		return -EFSCORRUPTED;
+
+	rtbno = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, rgbno - 1);
+	nextrtx = xfs_rtb_to_rtxrem(mp, rtbno, &mod) + 1;
+	if (mod != mp->m_sb.sb_rextsize - 1)
+		return -EFSCORRUPTED;
+
+	trace_xrep_rgbitmap_record_free(mp, startrtx, nextrtx - 1);
+
+	/* Set bits as needed to round startrtx up to the nearest word. */
+	bit = startrtx & XREP_RTBMP_WORDMASK;
+	if (bit) {
+		xfs_rtblock_t	len = nextrtx - startrtx;
+		unsigned int	lastbit;
+
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+
+		error = xrep_rgbitmap_or(rgb, rtx_to_wordoff(mp, startrtx),
+				mask);
+		if (error || lastbit - bit == len)
+			return error;
+		startrtx += XFS_NBWORD - bit;
+	}
+
+	/* Set bits as needed to round nextrtx down to the nearest word. */
+	bit = nextrtx & XREP_RTBMP_WORDMASK;
+	if (bit) {
+		mask = ((xfs_rtword_t)1 << bit) - 1;
+
+		error = xrep_rgbitmap_or(rgb, rtx_to_wordoff(mp, nextrtx),
+				mask);
+		if (error || startrtx + bit == nextrtx)
+			return error;
+		nextrtx -= bit;
+	}
+
+	trace_xrep_rgbitmap_record_free_bulk(mp, startrtx, nextrtx - 1);
+
+	/* Set all the words in between, up to a whole fs block at once. */
+	wordoff = rtx_to_wordoff(mp, startrtx);
+	nextwordoff = rtx_to_wordoff(mp, nextrtx);
+	bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+
+	while (wordoff < nextwordoff) {
+		xrep_wordoff_t	rem;
+		xrep_wordcnt_t	wordcnt;
+
+		wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff,
+				bufwsize);
+
+		/*
+		 * Try to keep us aligned to the rtwords buffer to reduce the
+		 * number of xfile writes.
+		 */
+		rem = wordoff & (bufwsize - 1);
+		if (rem)
+			wordcnt = min_t(xrep_wordcnt_t, wordcnt,
+					bufwsize - rem);
+
+		error = xfbmp_copyin(rgb, wordoff, rgb->words, wordcnt);
+		if (error)
+			return error;
+
+		wordoff += wordcnt;
+	}
+
+	return 0;
+}
+
+/* Set free space in the rtbitmap based on rtrmapbt records. */
+STATIC int
+xrep_rgbitmap_walk_rtrmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xchk_rgbitmap		*rgb = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(rgb->sc, &error))
+		return error;
+
+	if (rgb->next_rgbno < rec->rm_startblock) {
+		error = xrep_rgbitmap_mark_free(rgb, rec->rm_startblock);
+		if (error)
+			return error;
+	}
+
+	rgb->next_rgbno = max(rgb->next_rgbno,
+			      rec->rm_startblock + rec->rm_blockcount);
+	return 0;
+}
+
+/*
+ * Walk the rtrmapbt to find all the gaps between records, and mark the gaps
+ * in the realtime bitmap that we're computing.
+ */
+STATIC int
+xrep_rgbitmap_find_freespace(
+	struct xchk_rgbitmap	*rgb)
+{
+	struct xfs_scrub	*sc = rgb->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
+	int			error;
+
+	/* Prepare a buffer of ones so that we can accelerate bulk setting. */
+	memset(rgb->words, 0xFF, mp->m_sb.sb_blocksize);
+
+	xrep_rtgroup_btcur_init(sc, &sc->sr);
+	error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rgbitmap_walk_rtrmap,
+			rgb);
+	if (error)
+		goto out;
+
+	/*
+	 * Mark as free every possible rt extent from the last one we saw to
+	 * the end of the rt group.
+	 */
+	if (rgb->next_rgbno < rtg->rtg_blockcount) {
+		error = xrep_rgbitmap_mark_free(rgb, rtg->rtg_blockcount);
+		if (error)
+			goto out;
+	}
+
+out:
+	xchk_rtgroup_btcur_free(&sc->sr);
+	return error;
+}
+
+static int
+xrep_rgbitmap_prep_buf(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*bp,
+	void			*data)
+{
+	struct xchk_rgbitmap	*rgb = data;
+	struct xfs_mount	*mp = sc->mp;
+	union xfs_rtword_raw	*ondisk;
+	int			error;
+
+	rgb->args.mp = sc->mp;
+	rgb->args.tp = sc->tp;
+	rgb->args.rbmbp = bp;
+	ondisk = xfs_rbmblock_wordptr(&rgb->args, 0);
+	rgb->args.rbmbp = NULL;
+
+	error = xfbmp_copyout(rgb, rgb->prep_wordoff, ondisk,
+			mp->m_blockwsize);
+	if (error)
+		return error;
+
+	if (xfs_has_rtgroups(sc->mp)) {
+		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+		hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+		hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+		hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+		hdr->rt_lsn = 0;
+		uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+		bp->b_ops = &xfs_rtbitmap_buf_ops;
+	} else {
+		bp->b_ops = &xfs_rtbuf_ops;
+	}
+
+	rgb->prep_wordoff += mp->m_blockwsize;
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF);
+	return 0;
+}
+
+/* Repair the realtime bitmap for this rt group. */
+int
+xrep_rgbitmap(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_rgbitmap	*rgb = sc->buf;
+	int			error;
+
+	/*
+	 * We require the realtime rmapbt (and atomic file updates) to rebuild
+	 * anything.
+	 */
+	if (!xfs_has_rtrmapbt(sc->mp))
+		return -EOPNOTSUPP;
+
+	/*
+	 * If the start or end of this rt group happens to be in the middle of
+	 * an rtbitmap block, try to read in the parts of the bitmap that are
+	 * from some other group.
+	 */
+	error = xrep_rgbitmap_load_before(rgb);
+	if (error)
+		return error;
+	error = xrep_rgbitmap_load_after(rgb);
+	if (error)
+		return error;
+
+	/*
+	 * Generate the new rtbitmap data.  We don't need the rtbmp information
+	 * once this call is finished.
+	 */
+	error = xrep_rgbitmap_find_freespace(rgb);
+	if (error)
+		return error;
+
+	/*
+	 * Try to take ILOCK_EXCL of the temporary file.  We had better be the
+	 * only ones holding onto this inode, but we can't block while holding
+	 * the rtbitmap file's ILOCK_EXCL.
+	 */
+	while (!xrep_tempfile_ilock_nowait(sc)) {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+		delay(1);
+	}
+
+	/*
+	 * Make sure we have space allocated for the part of the bitmap
+	 * file that corresponds to this group.
+	 */
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+	error = xrep_tempfile_prealloc(sc, rgb->group_rbmoff, rgb->group_rbmlen);
+	if (error)
+		return error;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	/* Copy the bitmap file that we generated. */
+	error = xrep_tempfile_copyin(sc, rgb->group_rbmoff, rgb->group_rbmlen,
+			xrep_rgbitmap_prep_buf, rgb);
+	if (error)
+		return error;
+	error = xrep_tempfile_set_isize(sc,
+			XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks));
+	if (error)
+		return error;
+
+	/*
+	 * Now swap the extents.  We're done with the temporary buffer, so
+	 * we can reuse it for the tempfile swapext information.
+	 */
+	error = xrep_tempswap_trans_reserve(sc, XFS_DATA_FORK,
+			rgb->group_rbmoff, rgb->group_rbmlen, &rgb->tempswap);
+	if (error)
+		return error;
+
+	error = xrep_tempswap_contents(sc, &rgb->tempswap);
+	if (error)
+		return error;
+
+	/* Free the old bitmap blocks if they are free. */
+	return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+}
+
+/* rt bitmap file repairs */
+
 /* Set up to repair the realtime bitmap file metadata. */
 int
 xrep_setup_rtbitmap(
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index c66373eec436d..390c6403e8908 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -168,7 +168,8 @@ xrep_rtsummary(
 	 * Now swap the extents.  Nothing in repair uses the temporary buffer,
 	 * so we can reuse it for the tempfile swapext information.
 	 */
-	error = xrep_tempswap_trans_reserve(sc, XFS_DATA_FORK, &rts->tempswap);
+	error = xrep_tempswap_trans_reserve(sc, XFS_DATA_FORK, 0, rsumblocks,
+			&rts->tempswap);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 428624d4f0c45..435003e5a1e92 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -472,7 +472,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_rgbitmap,
 		.scrub	= xchk_rgbitmap,
 		.has	= xfs_has_rtgroups,
-		.repair = xrep_notsupported,
+		.repair = xrep_rgbitmap,
 	},
 	[XFS_SCRUB_TYPE_RTRMAPBT] = {	/* realtime group rmapbt */
 		.type	= ST_RTGROUP,
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 1dd7c4c5cff0f..e4d1c703c3195 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -608,6 +608,8 @@ STATIC int
 xrep_tempswap_prep_request(
 	struct xfs_scrub	*sc,
 	int			whichfork,
+	xfs_fileoff_t		off,
+	xfs_filblks_t		len,
 	struct xrep_tempswap	*tx)
 {
 	struct xfs_swapext_req	*req = &tx->req;
@@ -631,10 +633,10 @@ xrep_tempswap_prep_request(
 	/* Swap all mappings in both forks. */
 	req->ip1 = sc->tempip;
 	req->ip2 = sc->ip;
-	req->startoff1 = 0;
-	req->startoff2 = 0;
+	req->startoff1 = off;
+	req->startoff2 = off;
 	req->whichfork = whichfork;
-	req->blockcount = XFS_MAX_FILEOFF;
+	req->blockcount = len;
 	req->req_flags = XFS_SWAP_REQ_LOGGED;
 
 	/* Always swap sizes when we're swapping data fork mappings. */
@@ -801,6 +803,8 @@ int
 xrep_tempswap_trans_reserve(
 	struct xfs_scrub	*sc,
 	int			whichfork,
+	xfs_fileoff_t		off,
+	xfs_filblks_t		len,
 	struct xrep_tempswap	*tx)
 {
 	int			error;
@@ -809,7 +813,7 @@ xrep_tempswap_trans_reserve(
 	ASSERT(xfs_isilocked(sc->ip, XFS_ILOCK_EXCL));
 	ASSERT(xfs_isilocked(sc->tempip, XFS_ILOCK_EXCL));
 
-	error = xrep_tempswap_prep_request(sc, whichfork, tx);
+	error = xrep_tempswap_prep_request(sc, whichfork, off, len, tx);
 	if (error)
 		return error;
 
@@ -846,7 +850,8 @@ xrep_tempswap_trans_alloc(
 
 	ASSERT(sc->tp == NULL);
 
-	error = xrep_tempswap_prep_request(sc, whichfork, tx);
+	error = xrep_tempswap_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF,
+			tx);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/scrub/tempswap.h b/fs/xfs/scrub/tempswap.h
index 83900eef8cfc5..0be14f5f6382e 100644
--- a/fs/xfs/scrub/tempswap.h
+++ b/fs/xfs/scrub/tempswap.h
@@ -13,7 +13,7 @@ struct xrep_tempswap {
 
 int xrep_tempswap_grab_log_assist(struct xfs_scrub *sc);
 int xrep_tempswap_trans_reserve(struct xfs_scrub *sc, int whichfork,
-		struct xrep_tempswap *ti);
+		xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempswap *ti);
 int xrep_tempswap_trans_alloc(struct xfs_scrub *sc, int whichfork,
 		struct xrep_tempswap *ti);
 
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 06f7b89920e94..4d0a6dceaa6c6 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,6 +22,7 @@
 #include "xfs_rmap.h"
 #include "xfs_parent.h"
 #include "xfs_imeta.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index f02914f129605..c90324ca86579 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -3752,6 +3752,155 @@ DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink);
 DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink);
 DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xrep_rgbitmap_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end),
+	TP_ARGS(mp, start, end),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rtxnum_t, start)
+		__field(xfs_rtxnum_t, end)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rtdev = mp->m_rtdev_targp->bt_dev;
+		__entry->start = start;
+		__entry->end = end;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->start,
+		  __entry->end)
+);
+#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \
+DEFINE_EVENT(xrep_rgbitmap_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \
+		 xfs_rtxnum_t end), \
+	TP_ARGS(mp, start, end))
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rgbitmap_record_free);
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rgbitmap_record_free_bulk);
+
+TRACE_EVENT(xrep_rgbitmap_or,
+	TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+		 xfs_rtword_t mask, xfs_rtword_t word),
+	TP_ARGS(mp, wordoff, mask, word),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(unsigned long long, wordoff)
+		__field(unsigned int, mask)
+		__field(unsigned int, word)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rtdev = mp->m_rtdev_targp->bt_dev;
+		__entry->wordoff = wordoff;
+		__entry->mask = mask;
+		__entry->word = word;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->wordoff,
+		  __entry->mask,
+		  __entry->word)
+);
+
+TRACE_EVENT(xrep_rgbitmap_load,
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff,
+		 xfs_rtxnum_t rtx, xfs_rtxnum_t len),
+	TP_ARGS(rtg, rbmoff, rtx, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_fileoff_t, rbmoff)
+		__field(xfs_rtxnum_t, rtx)
+		__field(xfs_rtxnum_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->rbmoff = rbmoff;
+		__entry->rtx = rtx;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rgno,
+		  __entry->rbmoff,
+		  __entry->rtx,
+		  __entry->len)
+);
+
+TRACE_EVENT(xrep_rgbitmap_load_words,
+	TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff,
+		 unsigned long long wordoff, unsigned int wordcnt),
+	TP_ARGS(mp, rbmoff, wordoff, wordcnt),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_fileoff_t, rbmoff)
+		__field(unsigned long long, wordoff)
+		__field(unsigned int, wordcnt)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rtdev = mp->m_rtdev_targp->bt_dev;
+		__entry->rbmoff = rbmoff;
+		__entry->wordoff = wordoff;
+		__entry->wordcnt = wordcnt;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rbmoff,
+		  __entry->wordoff,
+		  __entry->wordcnt)
+);
+
+TRACE_EVENT(xrep_rgbitmap_load_word,
+	TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+		 unsigned int bit, xfs_rtword_t ondisk_word,
+		 xfs_rtword_t xfile_word, xfs_rtword_t word_mask),
+	TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(unsigned long long, wordoff)
+		__field(unsigned int, bit)
+		__field(xfs_rtword_t, ondisk_word)
+		__field(xfs_rtword_t, xfile_word)
+		__field(xfs_rtword_t, word_mask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rtdev = mp->m_rtdev_targp->bt_dev;
+		__entry->wordoff = wordoff;
+		__entry->bit = bit;
+		__entry->ondisk_word = ondisk_word;
+		__entry->xfile_word = xfile_word;
+		__entry->word_mask = word_mask;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->wordoff,
+		  __entry->bit,
+		  __entry->ondisk_word,
+		  __entry->ondisk_word & __entry->word_mask,
+		  __entry->xfile_word,
+		  __entry->xfile_word & ~__entry->word_mask,
+		  (__entry->xfile_word & ~__entry->word_mask) |
+		  (__entry->ondisk_word & __entry->word_mask),
+		  __entry->word_mask)
+);
+#endif /* CONFIG_XFS_RT */
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/39] xfs: support repairing metadata btrees rooted in metadir inodes
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-31 21:40   ` [PATCH 34/39] xfs: online repair of realtime bitmaps for a realtime group Darrick J. Wong
@ 2023-12-31 21:40   ` Darrick J. Wong
  2023-12-31 21:41   ` [PATCH 36/39] xfs: online repair of the realtime rmap btree Darrick J. Wong
                     ` (3 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:40 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Adapt the repair code so that we can stage a new btree in the data fork
area of a metadir inode and reap the old blocks.  We already have nearly
all of the infrastructure; the only parts that were missing were the
metadata inode reservation handling.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/newbt.c |   43 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/newbt.h |    1 +
 fs/xfs/scrub/reap.c  |   41 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/reap.h  |    2 ++
 4 files changed, 87 insertions(+)


diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 8375c8b9752a7..66a780f4e0176 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -19,6 +19,8 @@
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_defer.h"
+#include "xfs_imeta.h"
+#include "xfs_quota.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -120,6 +122,44 @@ xrep_newbt_init_inode(
 	return 0;
 }
 
+/*
+ * Initialize accounting resources for staging a new metadata inode btree.
+ * If the inode has an imeta space reservation, the caller must adjust the
+ * imeta reservation at btree commit.
+ */
+int
+xrep_newbt_init_metadir_inode(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc)
+{
+	struct xfs_owner_info		oinfo;
+	struct xfs_ifork		*ifp;
+
+	ASSERT(xfs_is_metadir_inode(sc->ip));
+	ASSERT(XFS_IS_DQDETACHED(sc->mp, sc->ip));
+
+	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+	if (!ifp)
+		return -ENOMEM;
+
+	/*
+	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
+	 * inode metadata space reservations can only account allocated space
+	 * to the i_nblocks.  We do not want to change the inode core fields
+	 * until we're ready to commit the new tree, so we allocate the blocks
+	 * as if they were regular file blocks.  This exposes us to a higher
+	 * risk of the repair being cancelled due to ENOSPC.
+	 */
+	xrep_newbt_init_ag(xnr, sc, &oinfo,
+			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+			XFS_AG_RESV_NONE);
+	xnr->ifake.if_fork = ifp;
+	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
+	return 0;
+}
+
 /*
  * Initialize accounting resources for staging a new btree.  Callers are
  * expected to add their own reservations (and clean them up) manually.
@@ -225,6 +265,7 @@ xrep_newbt_alloc_ag_blocks(
 	int			error = 0;
 
 	ASSERT(sc->sa.pag != NULL);
+	ASSERT(xnr->resv != XFS_AG_RESV_IMETA);
 
 	while (nr_blocks > 0) {
 		struct xfs_alloc_arg	args = {
@@ -299,6 +340,8 @@ xrep_newbt_alloc_file_blocks(
 	struct xfs_mount	*mp = sc->mp;
 	int			error = 0;
 
+	ASSERT(xnr->resv != XFS_AG_RESV_IMETA);
+
 	while (nr_blocks > 0) {
 		struct xfs_alloc_arg	args = {
 			.tp		= sc->tp,
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 3d804d31af24a..5ce785599287b 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -63,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
 		enum xfs_ag_resv_type resv);
 int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc);
 int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
 int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
 		xfs_agblock_t agbno, xfs_extlen_t len);
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 37eb61906be18..b8c48e36d2a8d 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -33,6 +33,7 @@
 #include "xfs_attr.h"
 #include "xfs_attr_remote.h"
 #include "xfs_defer.h"
+#include "xfs_imeta.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -391,6 +392,8 @@ xreap_agextent_iter(
 	xfs_fsblock_t		fsbno;
 	int			error = 0;
 
+	ASSERT(rs->resv != XFS_AG_RESV_IMETA);
+
 	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
 
 	/*
@@ -676,6 +679,44 @@ xrep_reap_fsblocks(
 	return 0;
 }
 
+/*
+ * Dispose of every block of an old metadata btree that used to be rooted in a
+ * metadata directory file.
+ */
+int
+xrep_reap_metadir_fsblocks(
+	struct xfs_scrub		*sc,
+	struct xfsb_bitmap		*bitmap)
+{
+	/*
+	 * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old
+	 * blocks are no longer mapped by the inode, and inode metadata space
+	 * reservations can only account freed space to the i_nblocks.
+	 */
+	struct xfs_owner_info		oinfo;
+	struct xreap_state		rs = {
+		.sc			= sc,
+		.oinfo			= &oinfo,
+		.resv			= XFS_AG_RESV_NONE,
+	};
+	int				error;
+
+	ASSERT(xfs_has_rmapbt(sc->mp));
+	ASSERT(sc->ip != NULL);
+	ASSERT(xfs_is_metadir_inode(sc->ip));
+
+	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+	if (error)
+		return error;
+
+	if (xreap_dirty(&rs))
+		return xrep_defer_finish(sc);
+
+	return 0;
+}
+
 /*
  * Metadata files are not supposed to share blocks with anything else.
  * If blocks are shared, we remove the reverse mapping (thus reducing the
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 3f2f1775e29db..70e5e6bbb8d38 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -14,6 +14,8 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
 int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
 		const struct xfs_owner_info *oinfo);
 int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
+int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc,
+		struct xfsb_bitmap *bitmap);
 
 /* Buffer cache scan context. */
 struct xrep_bufscan {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/39] xfs: online repair of the realtime rmap btree
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-31 21:40   ` [PATCH 35/39] xfs: support repairing metadata btrees rooted in metadir inodes Darrick J. Wong
@ 2023-12-31 21:41   ` Darrick J. Wong
  2023-12-31 21:41   ` [PATCH 37/39] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
                     ` (2 subsequent siblings)
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:41 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Repair the realtime rmap btree while mounted.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile                  |    1 
 fs/xfs/libxfs/xfs_rtrmap_btree.c |    2 
 fs/xfs/libxfs/xfs_rtrmap_btree.h |    3 
 fs/xfs/scrub/common.c            |    5 
 fs/xfs/scrub/repair.c            |  135 +++++++
 fs/xfs/scrub/repair.h            |   13 +
 fs/xfs/scrub/rtrmap.c            |    7 
 fs/xfs/scrub/rtrmap_repair.c     |  749 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c             |    2 
 fs/xfs/scrub/trace.h             |   57 +++
 10 files changed, 971 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/scrub/rtrmap_repair.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 69ca5cb7e4000..f9092ae77e684 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -235,6 +235,7 @@ xfs-y				+= $(addprefix scrub/, \
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 				   rgsuper_repair.o \
 				   rtbitmap_repair.o \
+				   rtrmap_repair.o \
 				   rtsummary_repair.o \
 				   )
 
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 355c50196e986..51a8ffff1c755 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -705,7 +705,7 @@ xfs_rtrmapbt_create_path(
 }
 
 /* Calculate the rtrmap btree size for some records. */
-static unsigned long long
+unsigned long long
 xfs_rtrmapbt_calc_size(
 	struct xfs_mount	*mp,
 	unsigned long long	len)
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 108ab8c0aea44..5aec719be053f 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -202,4 +202,7 @@ struct xfs_imeta_update;
 
 int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
 
+unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index e16185e9ddd9b..558e267924399 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -993,7 +993,10 @@ int
 xchk_setup_rt(
 	struct xfs_scrub	*sc)
 {
-	return xchk_trans_alloc(sc, 0);
+	uint			resblks;
+
+	resblks = xrep_calc_rtgroup_resblks(sc);
+	return xchk_trans_alloc(sc, resblks);
 }
 
 /* Set us up with AG headers and btree cursors. */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4789014be4a36..c9c538083c722 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -39,6 +39,8 @@
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_imeta.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -382,6 +384,39 @@ xrep_calc_ag_resblks(
 	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 }
 
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out how many blocks to reserve for a rtgroup repair.  We calculate
+ * the worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-rtgroup btree.
+ */
+xfs_extlen_t
+xrep_calc_rtgroup_resblks(
+	struct xfs_scrub		*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_rtgroup		*rtg;
+	xfs_extlen_t			usedlen;
+	xfs_extlen_t			rmapbt_sz = 0;
+
+	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return 0;
+
+	rtg = xfs_rtgroup_get(mp, sm->sm_agno);
+	usedlen = rtg->rtg_blockcount;
+	xfs_rtgroup_put(rtg);
+
+	if (xfs_has_rmapbt(mp))
+		rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen);
+
+	trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen,
+			rmapbt_sz);
+
+	return rmapbt_sz;
+}
+#endif /* CONFIG_XFS_RT */
+
 /*
  * Reconstructing per-AG Btrees
  *
@@ -1302,3 +1337,103 @@ xrep_is_rtmeta_ino(
 
 	return false;
 }
+
+/* Check the sanity of a rmap record for a metadata btree inode. */
+int
+xrep_check_ino_btree_mapping(
+	struct xfs_scrub		*sc,
+	const struct xfs_rmap_irec	*rec)
+{
+	enum xbtree_recpacking		outcome;
+	int				error;
+
+	/*
+	 * Metadata btree inodes never have extended attributes, and all blocks
+	 * should have the bmbt block flag set.
+	 */
+	if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) ||
+	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+		return -EFSCORRUPTED;
+
+	/* Make sure the block is within the AG. */
+	if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock,
+				rec->rm_blockcount))
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space. */
+	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+			rec->rm_blockcount, &outcome);
+	if (error)
+		return error;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+
+/*
+ * Reset the block count of the inode being repaired, and adjust the dquot
+ * block usage to match.  The inode must not have an xattr fork.
+ */
+void
+xrep_inode_set_nblocks(
+	struct xfs_scrub	*sc,
+	int64_t			new_blocks)
+{
+	int64_t			delta;
+
+	delta = new_blocks - sc->ip->i_nblocks;
+	sc->ip->i_nblocks = new_blocks;
+
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	if (delta != 0)
+		xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT,
+				delta);
+}
+
+/* Reset the block reservation for a metadata inode. */
+int
+xrep_reset_imeta_reservation(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_inode	*ip = sc->ip;
+	int64_t			delta;
+	int			error;
+
+	delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
+	if (delta == 0)
+		return 0;
+
+	if (delta > 0) {
+		int64_t		give_back;
+
+		/* Too many blocks, free from the incore reservation. */
+		give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
+		if (give_back > 0) {
+			xfs_mod_delalloc(ip->i_mount, -give_back);
+			xfs_mod_fdblocks(ip->i_mount, give_back, true);
+			ip->i_delayed_blks -= give_back;
+		}
+
+		return 0;
+	}
+
+	/* Not enough reservation, try to add more.  @delta is negative here. */
+	error = xfs_mod_fdblocks(sc->mp, delta, true);
+	while (error == -ENOSPC) {
+		delta++;
+		if (delta == 0) {
+			xfs_warn(sc->mp,
+"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
+					ip->i_ino);
+			return 0;
+		}
+		error = xfs_mod_fdblocks(sc->mp, delta, true);
+	}
+	if (error)
+		return error;
+
+	xfs_mod_delalloc(sc->mp, -delta);
+	ip->i_delayed_blks += -delta;
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index b66e0b5331394..a382ba0478aa0 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -96,6 +96,7 @@ int xrep_setup_parent(struct xfs_scrub *sc);
 int xrep_setup_nlinks(struct xfs_scrub *sc);
 int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
 int xrep_setup_dirtree(struct xfs_scrub *sc);
+int xrep_setup_rtrmapbt(struct xfs_scrub *sc);
 
 /* Repair setup functions */
 int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -112,12 +113,16 @@ int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
 void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr);
 int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
 		xfs_filblks_t len);
+xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc);
 #else
 # define xrep_rtgroup_init(sc, rtg, sr, lockflags)	(-ENOSYS)
+# define xrep_calc_rtgroup_resblks(sc)			(0)
 #endif /* CONFIG_XFS_RT */
 
 bool xrep_is_rtmeta_ino(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
 		xfs_ino_t ino);
+int xrep_check_ino_btree_mapping(struct xfs_scrub *sc,
+		const struct xfs_rmap_irec *rec);
 
 /* Metadata revalidators */
 
@@ -153,11 +158,13 @@ int xrep_rtbitmap(struct xfs_scrub *sc);
 int xrep_rtsummary(struct xfs_scrub *sc);
 int xrep_rgsuperblock(struct xfs_scrub *sc);
 int xrep_rgbitmap(struct xfs_scrub *sc);
+int xrep_rtrmapbt(struct xfs_scrub *sc);
 #else
 # define xrep_rtbitmap			xrep_notsupported
 # define xrep_rtsummary			xrep_notsupported
 # define xrep_rgsuperblock		xrep_notsupported
 # define xrep_rgbitmap			xrep_notsupported
+# define xrep_rtrmapbt			xrep_notsupported
 #endif /* CONFIG_XFS_RT */
 
 #ifdef CONFIG_XFS_QUOTA
@@ -176,6 +183,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
 void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
 
 bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
+int xrep_reset_imeta_reservation(struct xfs_scrub *sc);
 
 #else
 
@@ -199,6 +208,8 @@ xrep_calc_ag_resblks(
 	return 0;
 }
 
+#define xrep_calc_rtgroup_resblks	xrep_calc_ag_resblks
+
 static inline int
 xrep_reset_perag_resv(
 	struct xfs_scrub	*sc)
@@ -226,6 +237,7 @@ xrep_setup_nothing(
 #define xrep_setup_nlinks		xrep_setup_nothing
 #define xrep_setup_dirtree		xrep_setup_nothing
 #define xrep_setup_metapath		xrep_setup_nothing
+#define xrep_setup_rtrmapbt		xrep_setup_nothing
 
 #define xrep_setup_inode(sc, imap)	((void)0)
 
@@ -264,6 +276,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
 #define xrep_metapath			xrep_notsupported
 #define xrep_rgsuperblock		xrep_notsupported
 #define xrep_rgbitmap			xrep_notsupported
+#define xrep_rtrmapbt			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index c3e1cee81b6d2..ce21fa95a5da7 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -27,6 +27,7 @@
 #include "scrub/common.h"
 #include "scrub/btree.h"
 #include "scrub/trace.h"
+#include "scrub/repair.h"
 
 /* Set us up with the realtime metadata locked. */
 int
@@ -44,6 +45,12 @@ xchk_setup_rtrmapbt(
 	if (!rtg)
 		return -ENOENT;
 
+	if (xchk_could_repair(sc)) {
+		error = xrep_setup_rtrmapbt(sc);
+		if (error)
+			return error;
+	}
+
 	error = xchk_setup_rt(sc);
 	if (error)
 		goto out_rtg;
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
new file mode 100644
index 0000000000000..c56558ce919ab
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -0,0 +1,749 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Realtime Reverse Mapping Btree Repair
+ * =====================================
+ *
+ * This isn't quite as difficult as repairing the rmap btree on the data
+ * device, since we only store the data fork extents of realtime files on the
+ * realtime device.  We still have to freeze the filesystem and stop the
+ * background threads like we do for the rmap repair, but we only have to scan
+ * realtime inodes.
+ *
+ * Collecting entries for the new realtime rmap btree is easy -- all we have
+ * to do is generate rtrmap entries from the data fork mappings of all realtime
+ * files in the filesystem.  We then scan the rmap btrees of the data device
+ * looking for extents belonging to the old btree and note them in a bitmap.
+ *
+ * To rebuild the realtime rmap btree, we bulk-load the collected mappings into
+ * a new btree cursor and atomically swap that into the realtime inode.  Then
+ * we can free the blocks from the old btree.
+ *
+ * We use the 'xrep_rtrmap' prefix for all the rmap functions.
+ */
+
+/*
+ * Packed rmap record.  The UNWRITTEN flags are hidden in the upper bits of
+ * offset, just like the on-disk record.
+ */
+struct xrep_rtrmap_extent {
+	xfs_rgblock_t	startblock;
+	xfs_extlen_t	blockcount;
+	uint64_t	owner;
+	uint64_t	offset;
+} __packed;
+
+/* Context for collecting rmaps */
+struct xrep_rtrmap {
+	/* new rtrmapbt information */
+	struct xrep_newbt	new_btree;
+
+	/* rmap records generated from primary metadata */
+	struct xfarray		*rtrmap_records;
+
+	struct xfs_scrub	*sc;
+
+	/* bitmap of old rtrmapbt blocks */
+	struct xfsb_bitmap	old_rtrmapbt_blocks;
+
+	/* inode scan cursor */
+	struct xchk_iscan	iscan;
+
+	/* get_records()'s position in the free space record array. */
+	xfarray_idx_t		array_cur;
+};
+
+/* Set us up to repair rt reverse mapping btrees. */
+int
+xrep_setup_rtrmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rtrmap	*rr;
+
+	rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS);
+	if (!rr)
+		return -ENOMEM;
+
+	rr->sc = sc;
+	sc->buf = rr;
+	return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rtrmap_check_mapping(
+	struct xfs_scrub	*sc,
+	const struct xfs_rmap_irec *rec)
+{
+	xfs_rtblock_t		rtbno;
+
+	if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL)
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space. */
+	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno,
+			rec->rm_startblock);
+	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount);
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rtrmap_stash(
+	struct xrep_rtrmap	*rr,
+	xfs_rgblock_t		startblock,
+	xfs_extlen_t		blockcount,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	struct xrep_rtrmap_extent	rre = {
+		.startblock	= startblock,
+		.blockcount	= blockcount,
+		.owner		= owner,
+	};
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= startblock,
+		.rm_blockcount	= blockcount,
+		.rm_owner	= owner,
+		.rm_offset	= offset,
+		.rm_flags	= flags,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error = 0;
+
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	trace_xrep_rtrmap_found(sc->mp, &rmap);
+
+	rre.offset = xfs_rmap_irec_offset_pack(&rmap);
+	return xfarray_append(rr->rtrmap_records, &rre);
+}
+
+/* Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rtrmap_ifork {
+	/*
+	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
+	 * single rmap.
+	 */
+	struct xfs_rmap_irec	accum;
+
+	struct xrep_rtrmap	*rr;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rtrmap_stash_accumulated(
+	struct xrep_rtrmap_ifork	*rf)
+{
+	if (rf->accum.rm_blockcount == 0)
+		return 0;
+
+	return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock,
+			rf->accum.rm_blockcount, rf->accum.rm_owner,
+			rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rtrmap_visit_bmbt(
+	struct xfs_btree_cur	*cur,
+	struct xfs_bmbt_irec	*rec,
+	void			*priv)
+{
+	struct xrep_rtrmap_ifork *rf = priv;
+	struct xfs_rmap_irec	*accum = &rf->accum;
+	struct xfs_mount	*mp = rf->rr->sc->mp;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno;
+	unsigned int		rmap_flags = 0;
+	int			error;
+
+	rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock, &rgno);
+	if (rgno != rf->rr->sc->sr.rtg->rtg_rgno)
+		return 0;
+
+	if (rec->br_state == XFS_EXT_UNWRITTEN)
+		rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+	/* If this bmap is adjacent to the previous one, just add it. */
+	if (accum->rm_blockcount > 0 &&
+	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+	    rgbno == accum->rm_startblock + accum->rm_blockcount &&
+	    rmap_flags == accum->rm_flags) {
+		accum->rm_blockcount += rec->br_blockcount;
+		return 0;
+	}
+
+	/* Otherwise stash the old rmap and start accumulating a new one. */
+	error = xrep_rtrmap_stash_accumulated(rf);
+	if (error)
+		return error;
+
+	accum->rm_startblock = rgbno;
+	accum->rm_blockcount = rec->br_blockcount;
+	accum->rm_offset = rec->br_startoff;
+	accum->rm_flags = rmap_flags;
+	return 0;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that maps to the rt volume.  Sets @mappings_done to true if we've
+ * scanned the block mappings in this fork.
+ */
+STATIC int
+xrep_rtrmap_scan_bmbt(
+	struct xrep_rtrmap_ifork *rf,
+	struct xfs_inode	*ip,
+	bool			*mappings_done)
+{
+	struct xrep_rtrmap	*rr = rf->rr;
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	int			error = 0;
+
+	*mappings_done = false;
+
+	/*
+	 * If the incore extent cache is already loaded, we'll just use the
+	 * incore extent scanner to record mappings.  Don't bother walking the
+	 * ondisk extent tree.
+	 */
+	if (!xfs_need_iread_extents(ifp))
+		return 0;
+
+	/* Accumulate all the mappings in the bmap btree. */
+	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK);
+	error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf);
+	xfs_btree_del_cursor(cur, error);
+	if (error)
+		return error;
+
+	/* Stash any remaining accumulated rmaps and exit. */
+	*mappings_done = true;
+	return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rtrmap_scan_iext(
+	struct xrep_rtrmap_ifork *rf,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_bmbt_irec	rec;
+	struct xfs_iext_cursor	icur;
+	int			error;
+
+	for_each_xfs_iext(ifp, &icur, &rec) {
+		if (isnullstartblock(rec.br_startblock))
+			continue;
+		error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf);
+		if (error)
+			return error;
+	}
+
+	return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/* Find all the extents on the realtime device mapped by an inode fork. */
+STATIC int
+xrep_rtrmap_scan_dfork(
+	struct xrep_rtrmap	*rr,
+	struct xfs_inode	*ip)
+{
+	struct xrep_rtrmap_ifork rf = {
+		.accum		= { .rm_owner = ip->i_ino, },
+		.rr		= rr,
+	};
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	int			error = 0;
+
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+		bool		mappings_done;
+
+		/*
+		 * Scan the bmbt for mappings.  If the incore extent tree is
+		 * loaded, we want to scan the cached mappings since that's
+		 * faster when the extent counts are very high.
+		 */
+		error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done);
+		if (error || mappings_done)
+			return error;
+	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+		/* realtime data forks should only be extents or btree */
+		return -EFSCORRUPTED;
+	}
+
+	/* Scan incore extent cache. */
+	return xrep_rtrmap_scan_iext(&rf, ifp);
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rtrmap_scan_inode(
+	struct xrep_rtrmap	*rr,
+	struct xfs_inode	*ip)
+{
+	unsigned int		lock_mode;
+	int			error = 0;
+
+	/* Skip the rt rmap btree inode. */
+	if (rr->sc->ip == ip)
+		return 0;
+
+	lock_mode = xfs_ilock_data_map_shared(ip);
+
+	/* Check the data fork if it's on the realtime device. */
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		error = xrep_rtrmap_scan_dfork(rr, ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+	xfs_iunlock(ip, lock_mode);
+	return error;
+}
+
+/* Record extents that belong to the realtime rmap inode. */
+STATIC int
+xrep_rtrmap_walk_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rtrmap		*rr = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_fsblock_t			fsbno;
+	int				error = 0;
+
+	if (xchk_should_terminate(rr->sc, &error))
+		return error;
+
+	/* Skip extents which are not owned by this inode and fork. */
+	if (rec->rm_owner != rr->sc->ip->i_ino)
+		return 0;
+
+	error = xrep_check_ino_btree_mapping(rr->sc, rec);
+	if (error)
+		return error;
+
+	fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+			rec->rm_startblock);
+
+	return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks, fsbno,
+			rec->rm_blockcount);
+}
+
+/* Scan one AG for reverse mappings for the realtime rmap btree. */
+STATIC int
+xrep_rtrmap_scan_ag(
+	struct xrep_rtrmap	*rr,
+	struct xfs_perag	*pag)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	error = xrep_ag_init(sc, pag, &sc->sa);
+	if (error)
+		return error;
+
+	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr);
+	xchk_ag_free(sc, &sc->sa);
+	return error;
+}
+
+STATIC int
+xrep_rtrmap_find_super_rmaps(
+	struct xrep_rtrmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	/* Create a record for the rtgroup superblock. */
+	return xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize,
+			XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate all the reverse-mappings for the realtime device. */
+STATIC int
+xrep_rtrmap_find_rmaps(
+	struct xrep_rtrmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_perag	*pag;
+	struct xfs_inode	*ip;
+	xfs_agnumber_t		agno;
+	int			error;
+
+	/* Generate rmaps for the rtgroup superblock */
+	error = xrep_rtrmap_find_super_rmaps(rr);
+	if (error)
+		return error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Unlock the realtime metadata inodes and cancel the transaction to
+	 * release the log grant space while we scan the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done even though
+	 * we take the IOLOCK to quiesce the file because empty transactions
+	 * do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	xchk_rtgroup_unlock(sc, &sc->sr);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+		error = xrep_rtrmap_scan_inode(rr, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&rr->iscan);
+	if (error)
+		return error;
+
+	/*
+	 * Switch out for a real transaction and lock the RT metadata in
+	 * preparation for building a new tree.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_setup_rt(sc);
+	if (error)
+		return error;
+	error = xchk_rtgroup_drain_and_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+	if (error)
+		return error;
+
+	/* Scan for old rtrmap blocks. */
+	for_each_perag(sc->mp, agno, pag) {
+		error = xrep_rtrmap_scan_ag(rr, pag);
+		if (error) {
+			xfs_perag_rele(pag);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Building the new rtrmap btree. */
+
+/* Retrieve rtrmapbt data for bulk load. */
+STATIC int
+xrep_rtrmap_get_records(
+	struct xfs_btree_cur		*cur,
+	unsigned int			idx,
+	struct xfs_btree_block		*block,
+	unsigned int			nr_wanted,
+	void				*priv)
+{
+	struct xrep_rtrmap_extent	rec;
+	struct xfs_rmap_irec		*irec = &cur->bc_rec.r;
+	struct xrep_rtrmap		*rr = priv;
+	union xfs_btree_rec		*block_rec;
+	unsigned int			loaded;
+	int				error;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		error = xfarray_load_next(rr->rtrmap_records, &rr->array_cur,
+				&rec);
+		if (error)
+			return error;
+
+		irec->rm_startblock = rec.startblock;
+		irec->rm_blockcount = rec.blockcount;
+		irec->rm_owner = rec.owner;
+
+		if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL)
+			return -EFSCORRUPTED;
+
+		error = xrep_rtrmap_check_mapping(rr->sc, irec);
+		if (error)
+			return error;
+
+		block_rec = xfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rtrmap	*rr = priv;
+
+	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrmap_iroot_size(
+	struct xfs_btree_cur	*cur,
+	unsigned int		level,
+	unsigned int		nr_this_level,
+	void			*priv)
+{
+	return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
+}
+
+/*
+ * Use the collected rmap information to stage a new rmap btree.  If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed.  This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rtrmap_build_new_tree(
+	struct xrep_rtrmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
+	struct xfs_btree_cur	*rmap_cur;
+	uint64_t		nr_records;
+	int			error;
+
+	/*
+	 * Prepare to construct the new btree by reserving disk space for the
+	 * new btree and setting up all the accounting information we'll need
+	 * to root the new btree while it's under construction and before we
+	 * attach it to the realtime rmapbt inode.
+	 */
+	error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+	if (error)
+		return error;
+
+	rr->new_btree.bload.get_records = xrep_rtrmap_get_records;
+	rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block;
+	rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size;
+
+	rmap_cur = xfs_rtrmapbt_stage_cursor(sc->mp, rtg, rtg->rtg_rmapip,
+			&rr->new_btree.ifake);
+
+	nr_records = xfarray_length(rr->rtrmap_records);
+
+	/* Compute how many blocks we'll need for the rmaps collected. */
+	error = xfs_btree_bload_compute_geometry(rmap_cur,
+			&rr->new_btree.bload, nr_records);
+	if (error)
+		goto err_cur;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		goto err_cur;
+
+	/*
+	 * Guess how many blocks we're going to need to rebuild an entire
+	 * rtrmapbt from the number of extents we found, and pump up our
+	 * transaction to have sufficient block reservation.  We're allowed
+	 * to exceed quota to repair inconsistent metadata, though this is
+	 * unlikely.
+	 */
+	error = xfs_trans_reserve_more_inode(sc->tp, rtg->rtg_rmapip,
+			rr->new_btree.bload.nr_blocks, 0, true);
+	if (error)
+		goto err_cur;
+
+	/* Reserve the space we'll need for the new btree. */
+	error = xrep_newbt_alloc_blocks(&rr->new_btree,
+			rr->new_btree.bload.nr_blocks);
+	if (error)
+		goto err_cur;
+
+	/* Add all observed rmap records. */
+	rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_RMAP;
+	rr->array_cur = XFARRAY_CURSOR_INIT;
+	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Install the new rtrmap btree in the inode.  After this point the old
+	 * btree is no longer accessible, the new tree is live, and we can
+	 * delete the cursor.
+	 */
+	xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp);
+	xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+	xfs_btree_del_cursor(rmap_cur, 0);
+
+	/* Dispose of any unused blocks and the accounting information. */
+	error = xrep_newbt_commit(&rr->new_btree);
+	if (error)
+		return error;
+
+	return xrep_roll_trans(sc);
+
+err_cur:
+	xfs_btree_del_cursor(rmap_cur, error);
+	xrep_newbt_cancel(&rr->new_btree);
+	return error;
+}
+
+/* Reaping the old btree. */
+
+/* Reap the old rtrmapbt blocks. */
+STATIC int
+xrep_rtrmap_remove_old_tree(
+	struct xrep_rtrmap	*rr)
+{
+	int			error;
+
+	/*
+	 * Free all the extents that were allocated to the former rtrmapbt and
+	 * aren't cross-linked with something else.
+	 */
+	error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
+	if (error)
+		return error;
+
+	/*
+	 * Ensure the proper reservation for the rtrmap inode so that we don't
+	 * fail to expand the new btree.
+	 */
+	return xrep_reset_imeta_reservation(rr->sc);
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rtrmap_setup_scan(
+	struct xrep_rtrmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	char			*descr;
+	int			error;
+
+	xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);
+
+	/* Set up some storage */
+	descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
+	error = xfarray_create(descr, 0, sizeof(struct xrep_rtrmap_extent),
+			&rr->rtrmap_records);
+	kfree(descr);
+	if (error)
+		goto out_bitmap;
+
+	/* Retry iget every tenth of a second for up to 30 seconds. */
+	xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+	return 0;
+
+out_bitmap:
+	xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+	return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rtrmap_teardown(
+	struct xrep_rtrmap	*rr)
+{
+	xchk_iscan_teardown(&rr->iscan);
+	xfarray_destroy(rr->rtrmap_records);
+	xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+}
+
+/* Repair the realtime rmap btree. */
+int
+xrep_rtrmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rtrmap	*rr = sc->buf;
+	int			error;
+
+	/* Functionality is not yet complete. */
+	return xrep_notsupported(sc);
+
+	/* Make sure any problems with the fork are fixed. */
+	error = xrep_metadata_inode_forks(sc);
+	if (error)
+		return error;
+
+	error = xrep_rtrmap_setup_scan(rr);
+	if (error)
+		return error;
+
+	/* Collect rmaps for realtime files. */
+	error = xrep_rtrmap_find_rmaps(rr);
+	if (error)
+		goto out_records;
+
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	/* Rebuild the rtrmap information. */
+	error = xrep_rtrmap_build_new_tree(rr);
+	if (error)
+		goto out_records;
+
+	/* Kill the old tree. */
+	error = xrep_rtrmap_remove_old_tree(rr);
+	if (error)
+		goto out_records;
+
+out_records:
+	xrep_rtrmap_teardown(rr);
+	return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 435003e5a1e92..8193ad6702b4d 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -479,7 +479,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_rtrmapbt,
 		.scrub	= xchk_rtrmapbt,
 		.has	= xfs_has_rtrmapbt,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_rtrmapbt,
 	},
 };
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index c90324ca86579..95fdb82660dc3 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2314,6 +2314,32 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
 		  __entry->rmapbt_sz,
 		  __entry->refcbt_sz)
 )
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize,
+	TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		 xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz),
+	TP_ARGS(mp, rgno, usedlen, rmapbt_sz),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, usedlen)
+		__field(xfs_rgblock_t, rmapbt_sz)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rgno = rgno;
+		__entry->usedlen = usedlen;
+		__entry->rmapbt_sz = rmapbt_sz;
+	),
+	TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->usedlen,
+		  __entry->rmapbt_sz)
+);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xrep_reset_counters,
 	TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
 	TP_ARGS(mp, fsc),
@@ -3899,6 +3925,37 @@ TRACE_EVENT(xrep_rgbitmap_load_word,
 		  (__entry->ondisk_word & __entry->word_mask),
 		  __entry->word_mask)
 );
+
+TRACE_EVENT(xrep_rtrmap_found,
+	TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec),
+	TP_ARGS(mp, rec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+		__field(uint64_t, owner)
+		__field(uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rtdev = mp->m_rtdev_targp->bt_dev;
+		__entry->rgbno = rec->rm_startblock;
+		__entry->len = rec->rm_blockcount;
+		__entry->owner = rec->rm_owner;
+		__entry->offset = rec->rm_offset;
+		__entry->flags = rec->rm_flags;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rgbno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+);
 #endif /* CONFIG_XFS_RT */
 
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/39] xfs: create a shadow rmap btree during realtime rmap repair
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-31 21:41   ` [PATCH 36/39] xfs: online repair of the realtime rmap btree Darrick J. Wong
@ 2023-12-31 21:41   ` Darrick J. Wong
  2023-12-31 21:41   ` [PATCH 38/39] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
  2023-12-31 21:41   ` [PATCH 39/39] xfs: enable realtime rmap btree Darrick J. Wong
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:41 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an in-memory btree of rmap records instead of an array.  This
enables us to do live record collection instead of freezing the fs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c        |    2 
 fs/xfs/libxfs/xfs_btree.h        |    1 
 fs/xfs/libxfs/xfs_rmap.c         |    5 +
 fs/xfs/libxfs/xfs_rtrmap_btree.c |  123 ++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrmap_btree.h |    9 ++
 fs/xfs/scrub/rtrmap_repair.c     |  158 +++++++++++++++++++++++++++-----------
 fs/xfs/scrub/xfbtree.c           |    3 +
 7 files changed, 255 insertions(+), 46 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a294641d91832..2a181cf30299f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -491,6 +491,8 @@ xfs_btree_del_cursor(
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE) {
 		if (cur->bc_mem.pag)
 			xfs_perag_put(cur->bc_mem.pag);
+		if (cur->bc_mem.rtg)
+			xfs_rtgroup_put(cur->bc_mem.rtg);
 	}
 	kmem_cache_free(cur->bc_cache, cur);
 }
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3559cf5d3a653..4753a5c847616 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -269,6 +269,7 @@ struct xfs_btree_cur_mem {
 	struct xfbtree			*xfbtree;
 	struct xfs_buf			*head_bp;
 	struct xfs_perag		*pag;
+	struct xfs_rtgroup		*rtg;
 };
 
 struct xfs_btree_level {
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 8766805ed1343..d100e03f9560f 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -329,8 +329,11 @@ xfs_rmap_check_btrec(
 	struct xfs_btree_cur		*cur,
 	const struct xfs_rmap_irec	*irec)
 {
-	if (cur->bc_btnum == XFS_BTNUM_RTRMAP)
+	if (cur->bc_btnum == XFS_BTNUM_RTRMAP) {
+		if (cur->bc_flags & XFS_BTREE_IN_XFILE)
+			return xfs_rtrmap_check_irec(cur->bc_mem.rtg, irec);
 		return xfs_rtrmap_check_irec(cur->bc_ino.rtg, irec);
+	}
 
 	if (cur->bc_flags & XFS_BTREE_IN_XFILE)
 		return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 51a8ffff1c755..3084153af3a43 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -28,6 +28,9 @@
 #include "xfs_rtgroup.h"
 #include "xfs_bmap.h"
 #include "xfs_health.h"
+#include "scrub/xfile.h"
+#include "scrub/xfbtree.h"
+#include "xfs_btree_mem.h"
 
 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
 
@@ -557,6 +560,126 @@ xfs_rtrmapbt_stage_cursor(
 	return cur;
 }
 
+#ifdef CONFIG_XFS_BTREE_IN_XFILE
+/*
+ * Validate an in-memory realtime rmap btree block.  Callers are allowed to
+ * generate an in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rtrmapbt_mem_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	unsigned int		level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	level = be16_to_cpu(block->bb_level);
+	if (xfs_has_rmapbt(mp)) {
+		if (level >= mp->m_rtrmap_maxlevels)
+			return __this_address;
+	} else {
+		if (level >= xfs_rtrmapbt_maxlevels_ondisk())
+			return __this_address;
+	}
+
+	return xfbtree_lblock_verify(bp,
+			xfs_rtrmapbt_maxrecs(mp, xfo_to_b(1), level == 0));
+}
+
+static void
+xfs_rtrmapbt_mem_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa = xfs_rtrmapbt_mem_verify(bp);
+
+	if (fa)
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
+	.name			= "xfs_rtrmapbt_mem",
+	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+	.verify_read		= xfs_rtrmapbt_mem_rw_verify,
+	.verify_write		= xfs_rtrmapbt_mem_rw_verify,
+	.verify_struct		= xfs_rtrmapbt_mem_verify,
+};
+
+static const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
+	.rec_len		= sizeof(struct xfs_rmap_rec),
+	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.geom_flags		= XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING |
+				  XFS_BTREE_LONG_PTRS | XFS_BTREE_IN_XFILE,
+
+	.dup_cursor		= xfbtree_dup_cursor,
+	.set_root		= xfbtree_set_root,
+	.alloc_block		= xfbtree_alloc_block,
+	.free_block		= xfbtree_free_block,
+	.get_minrecs		= xfbtree_get_minrecs,
+	.get_maxrecs		= xfbtree_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+	.key_diff		= xfs_rtrmapbt_key_diff,
+	.buf_ops		= &xfs_rtrmapbt_mem_buf_ops,
+	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
+	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_mem_cursor(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*head_bp,
+	struct xfbtree		*xfbtree)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	/* Overlapping btree; 2 keys per pointer. */
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTRMAP,
+			&xfs_rtrmapbt_mem_ops, mp->m_rtrmap_maxlevels,
+			xfs_rtrmapbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+	cur->bc_mem.xfbtree = xfbtree;
+	cur->bc_mem.head_bp = head_bp;
+	cur->bc_nlevels = xfs_btree_mem_head_nlevels(head_bp);
+
+	cur->bc_mem.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+int
+xfs_rtrmapbt_mem_create(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_buftarg	*target,
+	struct xfbtree		**xfbtreep)
+{
+	struct xfbtree_config	cfg = {
+		.btree_ops	= &xfs_rtrmapbt_mem_ops,
+		.target		= target,
+		.flags		= XFBTREE_DIRECT_MAP,
+		.owner		= rgno,
+	};
+
+	return xfbtree_create(mp, &cfg, xfbtreep);
+}
+#endif /* CONFIG_XFS_BTREE_IN_XFILE */
+
 /*
  * Install a new rt reverse mapping btree root.  Caller is responsible for
  * invalidating and freeing the old btree blocks.
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 5aec719be053f..b0a8e8d89f9eb 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -205,4 +205,13 @@ int xfs_rtrmapbt_create(struct xfs_imeta_update *upd, struct xfs_inode **ipp);
 unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+#ifdef CONFIG_XFS_BTREE_IN_XFILE
+struct xfbtree;
+struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
+		struct xfs_trans *tp, struct xfs_buf *mhead_bp,
+		struct xfbtree *xfbtree);
+int xfs_rtrmapbt_mem_create(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_buftarg *target, struct xfbtree **xfbtreep);
+#endif /* CONFIG_XFS_BTREE_IN_XFILE */
+
 #endif	/* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index c56558ce919ab..00e606dfc6842 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -12,6 +12,7 @@
 #include "xfs_defer.h"
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
+#include "xfs_btree_mem.h"
 #include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
@@ -41,6 +42,7 @@
 #include "scrub/iscan.h"
 #include "scrub/newbt.h"
 #include "scrub/reap.h"
+#include "scrub/xfbtree.h"
 
 /*
  * Realtime Reverse Mapping Btree Repair
@@ -64,24 +66,13 @@
  * We use the 'xrep_rtrmap' prefix for all the rmap functions.
  */
 
-/*
- * Packed rmap record.  The UNWRITTEN flags are hidden in the upper bits of
- * offset, just like the on-disk record.
- */
-struct xrep_rtrmap_extent {
-	xfs_rgblock_t	startblock;
-	xfs_extlen_t	blockcount;
-	uint64_t	owner;
-	uint64_t	offset;
-} __packed;
-
 /* Context for collecting rmaps */
 struct xrep_rtrmap {
 	/* new rtrmapbt information */
 	struct xrep_newbt	new_btree;
 
 	/* rmap records generated from primary metadata */
-	struct xfarray		*rtrmap_records;
+	struct xfbtree		*rtrmap_btree;
 
 	struct xfs_scrub	*sc;
 
@@ -91,8 +82,11 @@ struct xrep_rtrmap {
 	/* inode scan cursor */
 	struct xchk_iscan	iscan;
 
-	/* get_records()'s position in the free space record array. */
-	xfarray_idx_t		array_cur;
+	/* in-memory btree cursor for the ->get_blocks walk */
+	struct xfs_btree_cur	*mcur;
+
+	/* Number of records we're staging in the new btree. */
+	uint64_t		nr_records;
 };
 
 /* Set us up to repair rt reverse mapping btrees. */
@@ -101,6 +95,14 @@ xrep_setup_rtrmapbt(
 	struct xfs_scrub	*sc)
 {
 	struct xrep_rtrmap	*rr;
+	char			*descr;
+	int			error;
+
+	descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
+	error = xrep_setup_buftarg(sc, descr);
+	kfree(descr);
+	if (error)
+		return error;
 
 	rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS);
 	if (!rr)
@@ -138,11 +140,6 @@ xrep_rtrmap_stash(
 	uint64_t		offset,
 	unsigned int		flags)
 {
-	struct xrep_rtrmap_extent	rre = {
-		.startblock	= startblock,
-		.blockcount	= blockcount,
-		.owner		= owner,
-	};
 	struct xfs_rmap_irec	rmap = {
 		.rm_startblock	= startblock,
 		.rm_blockcount	= blockcount,
@@ -151,6 +148,8 @@ xrep_rtrmap_stash(
 		.rm_flags	= flags,
 	};
 	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_btree_cur	*mcur;
+	struct xfs_buf		*mhead_bp;
 	int			error = 0;
 
 	if (xchk_should_terminate(sc, &error))
@@ -158,8 +157,23 @@ xrep_rtrmap_stash(
 
 	trace_xrep_rtrmap_found(sc->mp, &rmap);
 
-	rre.offset = xfs_rmap_irec_offset_pack(&rmap);
-	return xfarray_append(rr->rtrmap_records, &rre);
+	/* Add entry to in-memory btree. */
+	error = xfbtree_head_read_buf(rr->rtrmap_btree, sc->tp, &mhead_bp);
+	if (error)
+		return error;
+
+	mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, mhead_bp,
+			rr->rtrmap_btree);
+	error = xfs_rmap_map_raw(mcur, &rmap);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	return xfbtree_trans_commit(rr->rtrmap_btree, sc->tp);
+
+out_cancel:
+	xfbtree_trans_cancel(rr->rtrmap_btree, sc->tp);
+	return error;
 }
 
 /* Finding all file and bmbt extents. */
@@ -402,6 +416,24 @@ xrep_rtrmap_scan_ag(
 	return error;
 }
 
+/* Count and check all collected records. */
+STATIC int
+xrep_rtrmap_check_record(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rtrmap		*rr = priv;
+	int				error;
+
+	error = xrep_rtrmap_check_mapping(rr->sc, rec);
+	if (error)
+		return error;
+
+	rr->nr_records++;
+	return 0;
+}
+
 STATIC int
 xrep_rtrmap_find_super_rmaps(
 	struct xrep_rtrmap	*rr)
@@ -421,6 +453,8 @@ xrep_rtrmap_find_rmaps(
 	struct xfs_scrub	*sc = rr->sc;
 	struct xfs_perag	*pag;
 	struct xfs_inode	*ip;
+	struct xfs_buf		*mhead_bp;
+	struct xfs_btree_cur	*mcur;
 	xfs_agnumber_t		agno;
 	int			error;
 
@@ -484,7 +518,25 @@ xrep_rtrmap_find_rmaps(
 		}
 	}
 
-	return 0;
+	/*
+	 * Now that we have everything locked again, we need to count the
+	 * number of rmap records stashed in the btree.  This should reflect
+	 * all actively-owned rt files in the filesystem.  At the same time,
+	 * check all our records before we start building a new btree, which
+	 * requires the rtbitmap lock.
+	 */
+	error = xfbtree_head_read_buf(rr->rtrmap_btree, NULL, &mhead_bp);
+	if (error)
+		return error;
+
+	mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, mhead_bp,
+			rr->rtrmap_btree);
+	rr->nr_records = 0;
+	error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr);
+	xfs_btree_del_cursor(mcur, error);
+	xfs_buf_relse(mhead_bp);
+
+	return error;
 }
 
 /* Building the new rtrmap btree. */
@@ -498,29 +550,25 @@ xrep_rtrmap_get_records(
 	unsigned int			nr_wanted,
 	void				*priv)
 {
-	struct xrep_rtrmap_extent	rec;
-	struct xfs_rmap_irec		*irec = &cur->bc_rec.r;
 	struct xrep_rtrmap		*rr = priv;
 	union xfs_btree_rec		*block_rec;
 	unsigned int			loaded;
 	int				error;
 
 	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
-		error = xfarray_load_next(rr->rtrmap_records, &rr->array_cur,
-				&rec);
+		int			stat = 0;
+
+		error = xfs_btree_increment(rr->mcur, 0, &stat);
 		if (error)
 			return error;
-
-		irec->rm_startblock = rec.startblock;
-		irec->rm_blockcount = rec.blockcount;
-		irec->rm_owner = rec.owner;
-
-		if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL)
+		if (!stat)
 			return -EFSCORRUPTED;
 
-		error = xrep_rtrmap_check_mapping(rr->sc, irec);
+		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
 		if (error)
 			return error;
+		if (!stat)
+			return -EFSCORRUPTED;
 
 		block_rec = xfs_btree_rec_addr(cur, idx, block);
 		cur->bc_ops->init_rec_from_cur(cur, block_rec);
@@ -565,7 +613,7 @@ xrep_rtrmap_build_new_tree(
 	struct xfs_scrub	*sc = rr->sc;
 	struct xfs_rtgroup	*rtg = sc->sr.rtg;
 	struct xfs_btree_cur	*rmap_cur;
-	uint64_t		nr_records;
+	struct xfs_buf		*mhead_bp;
 	int			error;
 
 	/*
@@ -585,11 +633,9 @@ xrep_rtrmap_build_new_tree(
 	rmap_cur = xfs_rtrmapbt_stage_cursor(sc->mp, rtg, rtg->rtg_rmapip,
 			&rr->new_btree.ifake);
 
-	nr_records = xfarray_length(rr->rtrmap_records);
-
 	/* Compute how many blocks we'll need for the rmaps collected. */
 	error = xfs_btree_bload_compute_geometry(rmap_cur,
-			&rr->new_btree.bload, nr_records);
+			&rr->new_btree.bload, rr->nr_records);
 	if (error)
 		goto err_cur;
 
@@ -615,12 +661,25 @@ xrep_rtrmap_build_new_tree(
 	if (error)
 		goto err_cur;
 
+	/*
+	 * Create a cursor to the in-memory btree so that we can bulk load the
+	 * new btree.
+	 */
+	error = xfbtree_head_read_buf(rr->rtrmap_btree, NULL, &mhead_bp);
+	if (error)
+		goto err_cur;
+
+	rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, mhead_bp,
+			rr->rtrmap_btree);
+	error = xfs_btree_goto_left_edge(rr->mcur);
+	if (error)
+		goto err_mcur;
+
 	/* Add all observed rmap records. */
 	rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_RMAP;
-	rr->array_cur = XFARRAY_CURSOR_INIT;
 	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
 	if (error)
-		goto err_cur;
+		goto err_mcur;
 
 	/*
 	 * Install the new rtrmap btree in the inode.  After this point the old
@@ -630,6 +689,15 @@ xrep_rtrmap_build_new_tree(
 	xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp);
 	xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
 	xfs_btree_del_cursor(rmap_cur, 0);
+	xfs_btree_del_cursor(rr->mcur, 0);
+	rr->mcur = NULL;
+	xfs_buf_relse(mhead_bp);
+
+	/*
+	 * Now that we've written the new btree to disk, we don't need to keep
+	 * updating the in-memory btree.  Abort the scan to stop live updates.
+	 */
+	xchk_iscan_abort(&rr->iscan);
 
 	/* Dispose of any unused blocks and the accounting information. */
 	error = xrep_newbt_commit(&rr->new_btree);
@@ -638,6 +706,9 @@ xrep_rtrmap_build_new_tree(
 
 	return xrep_roll_trans(sc);
 
+err_mcur:
+	xfs_btree_del_cursor(rr->mcur, error);
+	xfs_buf_relse(mhead_bp);
 err_cur:
 	xfs_btree_del_cursor(rmap_cur, error);
 	xrep_newbt_cancel(&rr->new_btree);
@@ -674,16 +745,13 @@ xrep_rtrmap_setup_scan(
 	struct xrep_rtrmap	*rr)
 {
 	struct xfs_scrub	*sc = rr->sc;
-	char			*descr;
 	int			error;
 
 	xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);
 
 	/* Set up some storage */
-	descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
-	error = xfarray_create(descr, 0, sizeof(struct xrep_rtrmap_extent),
-			&rr->rtrmap_records);
-	kfree(descr);
+	error = xfs_rtrmapbt_mem_create(sc->mp, sc->sr.rtg->rtg_rgno,
+			sc->xfile_buftarg, &rr->rtrmap_btree);
 	if (error)
 		goto out_bitmap;
 
@@ -702,7 +770,7 @@ xrep_rtrmap_teardown(
 	struct xrep_rtrmap	*rr)
 {
 	xchk_iscan_teardown(&rr->iscan);
-	xfarray_destroy(rr->rtrmap_records);
+	xfbtree_destroy(rr->rtrmap_btree);
 	xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
 }
 
diff --git a/fs/xfs/scrub/xfbtree.c b/fs/xfs/scrub/xfbtree.c
index 9e557d87d1c9c..7c035ad1f696a 100644
--- a/fs/xfs/scrub/xfbtree.c
+++ b/fs/xfs/scrub/xfbtree.c
@@ -17,6 +17,7 @@
 #include "xfs_error.h"
 #include "xfs_btree_mem.h"
 #include "xfs_ag.h"
+#include "xfs_rtgroup.h"
 #include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfbtree.h"
@@ -298,6 +299,8 @@ xfbtree_dup_cursor(
 
 	if (cur->bc_mem.pag)
 		ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+	if (cur->bc_mem.rtg)
+		ncur->bc_mem.rtg = xfs_rtgroup_hold(cur->bc_mem.rtg);
 
 	return ncur;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/39] xfs: hook live realtime rmap operations during a repair operation
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-31 21:41   ` [PATCH 37/39] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
@ 2023-12-31 21:41   ` Darrick J. Wong
  2023-12-31 21:41   ` [PATCH 39/39] xfs: enable realtime rmap btree Darrick J. Wong
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:41 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Hook the regular realtime rmap code when an rtrmapbt repair operation is
running so that we can unlock the AGF buffer to scan the filesystem and
keep the in-memory btree up to date during the scan.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c     |   56 ++++++++++++++++-
 fs/xfs/libxfs/xfs_rmap.h     |    6 ++
 fs/xfs/libxfs/xfs_rtgroup.c  |    1 
 fs/xfs/libxfs/xfs_rtgroup.h  |    3 +
 fs/xfs/scrub/rtrmap_repair.c |  139 ++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/trace.h         |   36 +++++++++++
 6 files changed, 233 insertions(+), 8 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index d100e03f9560f..43108a195004c 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -920,8 +920,7 @@ xfs_rmap_update_hook(
 			.oinfo		= *oinfo, /* struct copy */
 		};
 
-		if (pag)
-			xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+		xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
 	}
 }
 
@@ -946,6 +945,50 @@ xfs_rmap_hook_del(
 # define xfs_rmap_update_hook(t, p, o, s, b, u, oi)	do { } while (0)
 #endif /* CONFIG_XFS_LIVE_HOOKS */
 
+# if defined(CONFIG_XFS_LIVE_HOOKS) && defined(CONFIG_XFS_RT)
+static inline void
+xfs_rtrmap_update_hook(
+	struct xfs_trans		*tp,
+	struct xfs_rtgroup		*rtg,
+	enum xfs_rmap_intent_type	op,
+	xfs_rgblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
+{
+	if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+		struct xfs_rmap_update_params	p = {
+			.startblock	= startblock,
+			.blockcount	= blockcount,
+			.unwritten	= unwritten,
+			.oinfo		= *oinfo, /* struct copy */
+		};
+
+		xfs_hooks_call(&rtg->rtg_rmap_update_hooks, op, &p);
+	}
+}
+
+/* Call the specified function during a rt reverse mapping update. */
+int
+xfs_rtrmap_hook_add(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rmap_hook	*hook)
+{
+	return xfs_hooks_add(&rtg->rtg_rmap_update_hooks, &hook->update_hook);
+}
+
+/* Stop calling the specified function during a rt reverse mapping update. */
+void
+xfs_rtrmap_hook_del(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rmap_hook	*hook)
+{
+	xfs_hooks_del(&rtg->rtg_rmap_update_hooks, &hook->update_hook);
+}
+#else
+# define xfs_rtrmap_update_hook(t, r, o, s, b, u, oi)	do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS && CONFIG_XFS_RT */
+
 /*
  * Remove a reference to an extent in the rmap btree.
  */
@@ -2702,6 +2745,7 @@ xfs_rtrmap_finish_one(
 	xfs_rgnumber_t			rgno;
 	xfs_rgblock_t			bno;
 	bool				unwritten;
+	int				error;
 
 	trace_xfs_rmap_deferred(mp, ri);
 
@@ -2727,8 +2771,14 @@ xfs_rtrmap_finish_one(
 	unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
 	bno = xfs_rtb_to_rgbno(mp, ri->ri_bmap.br_startblock, &rgno);
 
-	return __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+	error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
 			ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+	if (error)
+		return error;
+
+	xfs_rtrmap_update_hook(tp, ri->ri_rtg, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3719fc4cbc26b..9e19e657eeff6 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -275,6 +275,12 @@ void xfs_rmap_hook_enable(void);
 
 int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
 void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+
+# ifdef CONFIG_XFS_RT
+int xfs_rtrmap_hook_add(struct xfs_rtgroup *rtg, struct xfs_rmap_hook *hook);
+void xfs_rtrmap_hook_del(struct xfs_rtgroup *rtg, struct xfs_rmap_hook *hook);
+# endif /* CONFIG_XFS_RT */
+
 #endif
 
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 7b031f4917349..8bc97c9aa4c9c 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -163,6 +163,7 @@ xfs_initialize_rtgroups(
 		spin_lock_init(&rtg->rtg_state_lock);
 		init_waitqueue_head(&rtg->rtg_active_wq);
 		xfs_defer_drain_init(&rtg->rtg_intents_drain);
+		xfs_hooks_init(&rtg->rtg_rmap_update_hooks);
 #endif /* __KERNEL__ */
 
 		/* Active ref owned by mount indicates rtgroup is online. */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 9487c2e00478b..3522527e553b8 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -48,6 +48,9 @@ struct xfs_rtgroup {
 	 * inconsistencies.
 	 */
 	struct xfs_defer_drain	rtg_intents_drain;
+
+	/* Hook to feed rt rmapbt updates to an active online repair. */
+	struct xfs_hooks	rtg_rmap_update_hooks;
 #endif /* __KERNEL__ */
 };
 
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index 00e606dfc6842..42df1cf45ae0b 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -71,6 +71,9 @@ struct xrep_rtrmap {
 	/* new rtrmapbt information */
 	struct xrep_newbt	new_btree;
 
+	/* lock for the xfbtree and xfile */
+	struct mutex		lock;
+
 	/* rmap records generated from primary metadata */
 	struct xfbtree		*rtrmap_btree;
 
@@ -79,6 +82,9 @@ struct xrep_rtrmap {
 	/* bitmap of old rtrmapbt blocks */
 	struct xfsb_bitmap	old_rtrmapbt_blocks;
 
+	/* Hooks into rtrmap update code. */
+	struct xfs_rmap_hook	hooks;
+
 	/* inode scan cursor */
 	struct xchk_iscan	iscan;
 
@@ -98,6 +104,8 @@ xrep_setup_rtrmapbt(
 	char			*descr;
 	int			error;
 
+	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
 	descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
 	error = xrep_setup_buftarg(sc, descr);
 	kfree(descr);
@@ -155,12 +163,16 @@ xrep_rtrmap_stash(
 	if (xchk_should_terminate(sc, &error))
 		return error;
 
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
 	trace_xrep_rtrmap_found(sc->mp, &rmap);
 
 	/* Add entry to in-memory btree. */
+	mutex_lock(&rr->lock);
 	error = xfbtree_head_read_buf(rr->rtrmap_btree, sc->tp, &mhead_bp);
 	if (error)
-		return error;
+		goto out_abort;
 
 	mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, mhead_bp,
 			rr->rtrmap_btree);
@@ -169,10 +181,18 @@ xrep_rtrmap_stash(
 	if (error)
 		goto out_cancel;
 
-	return xfbtree_trans_commit(rr->rtrmap_btree, sc->tp);
+	error = xfbtree_trans_commit(rr->rtrmap_btree, sc->tp);
+	if (error)
+		goto out_abort;
+
+	mutex_unlock(&rr->lock);
+	return 0;
 
 out_cancel:
 	xfbtree_trans_cancel(rr->rtrmap_btree, sc->tp);
+out_abort:
+	xchk_iscan_abort(&rr->iscan);
+	mutex_unlock(&rr->lock);
 	return error;
 }
 
@@ -509,6 +529,13 @@ xrep_rtrmap_find_rmaps(
 	if (error)
 		return error;
 
+	/*
+	 * If a hook failed to update the in-memory btree, we lack the data to
+	 * continue the repair.
+	 */
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
 	/* Scan for old rtrmap blocks. */
 	for_each_perag(sc->mp, agno, pag) {
 		error = xrep_rtrmap_scan_ag(rr, pag);
@@ -739,6 +766,89 @@ xrep_rtrmap_remove_old_tree(
 	return xrep_reset_imeta_reservation(rr->sc);
 }
 
+static inline bool
+xrep_rtrmapbt_want_live_update(
+	struct xchk_iscan		*iscan,
+	const struct xfs_owner_info	*oi)
+{
+	if (xchk_iscan_aborted(iscan))
+		return false;
+
+	/*
+	 * We scanned the CoW staging extents before we started the iscan, so
+	 * we need all the updates.
+	 */
+	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+		return true;
+
+	/* Ignore updates to files that the scanner hasn't visited yet. */
+	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rtrmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the rtrmap ILOCK and is generating
+ * the update, so we must be careful about which parts of the struct
+ * xrep_rtrmap that we change.
+ */
+static int
+xrep_rtrmapbt_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_rmap_update_params	*p = data;
+	struct xrep_rtrmap		*rr;
+	struct xfs_mount		*mp;
+	struct xfs_btree_cur		*mcur;
+	struct xfs_buf			*mhead_bp;
+	struct xfs_trans		*tp;
+	void				*txcookie;
+	int				error;
+
+	rr = container_of(nb, struct xrep_rtrmap, hooks.update_hook.nb);
+	mp = rr->sc->mp;
+
+	if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo))
+		goto out_unlock;
+
+	trace_xrep_rtrmap_live_update(mp, rr->sc->sr.rtg->rtg_rgno, action, p);
+
+	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+	if (error)
+		goto out_abort;
+
+	mutex_lock(&rr->lock);
+	error = xfbtree_head_read_buf(rr->rtrmap_btree, tp, &mhead_bp);
+	if (error)
+		goto out_cancel;
+
+	mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, mhead_bp,
+			rr->rtrmap_btree);
+	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+			p->blockcount, &p->oinfo, p->unwritten);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(rr->rtrmap_btree, tp);
+	if (error)
+		goto out_cancel;
+
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+	return NOTIFY_DONE;
+
+out_cancel:
+	xfbtree_trans_cancel(rr->rtrmap_btree, tp);
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+	xchk_iscan_abort(&rr->iscan);
+	mutex_unlock(&rr->lock);
+out_unlock:
+	return NOTIFY_DONE;
+}
+
 /* Set up the filesystem scan components. */
 STATIC int
 xrep_rtrmap_setup_scan(
@@ -747,6 +857,7 @@ xrep_rtrmap_setup_scan(
 	struct xfs_scrub	*sc = rr->sc;
 	int			error;
 
+	mutex_init(&rr->lock);
 	xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);
 
 	/* Set up some storage */
@@ -757,10 +868,26 @@ xrep_rtrmap_setup_scan(
 
 	/* Retry iget every tenth of a second for up to 30 seconds. */
 	xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+	/*
+	 * Hook into live rtrmap operations so that we can update our in-memory
+	 * btree to reflect live changes on the filesystem.  Since we drop the
+	 * rtrmap ILOCK to scan all the inodes, we need this piece to avoid
+	 * installing a stale btree.
+	 */
+	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+	xfs_hook_setup(&rr->hooks.update_hook, xrep_rtrmapbt_live_update);
+	error = xfs_rtrmap_hook_add(sc->sr.rtg, &rr->hooks);
+	if (error)
+		goto out_iscan;
 	return 0;
 
+out_iscan:
+	xchk_iscan_teardown(&rr->iscan);
+	xfbtree_destroy(rr->rtrmap_btree);
 out_bitmap:
 	xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+	mutex_destroy(&rr->lock);
 	return error;
 }
 
@@ -769,9 +896,14 @@ STATIC void
 xrep_rtrmap_teardown(
 	struct xrep_rtrmap	*rr)
 {
+	struct xfs_scrub	*sc = rr->sc;
+
+	xchk_iscan_abort(&rr->iscan);
+	xfs_rtrmap_hook_del(sc->sr.rtg, &rr->hooks);
 	xchk_iscan_teardown(&rr->iscan);
 	xfbtree_destroy(rr->rtrmap_btree);
 	xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+	mutex_destroy(&rr->lock);
 }
 
 /* Repair the realtime rmap btree. */
@@ -782,9 +914,6 @@ xrep_rtrmapbt(
 	struct xrep_rtrmap	*rr = sc->buf;
 	int			error;
 
-	/* Functionality is not yet complete. */
-	return xrep_notsupported(sc);
-
 	/* Make sure any problems with the fork are fixed. */
 	error = xrep_metadata_inode_forks(sc);
 	if (error)
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 95fdb82660dc3..65e0872792e1f 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -3956,6 +3956,42 @@ TRACE_EVENT(xrep_rtrmap_found,
 		  __entry->offset,
 		  __entry->flags)
 );
+
+TRACE_EVENT(xrep_rtrmap_live_update,
+	TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno, unsigned int op,
+		 const struct xfs_rmap_update_params *p),
+	TP_ARGS(mp, rgno, op, p),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(unsigned int, op)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+		__field(uint64_t, owner)
+		__field(uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->rgno = rgno;
+		__entry->op = op;
+		__entry->rgbno = p->startblock;
+		__entry->len = p->blockcount;
+		xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+				&__entry->offset, &__entry->flags);
+		if (p->unwritten)
+			__entry->flags |= XFS_RMAP_UNWRITTEN;
+	),
+	TP_printk("dev %d:%d rgno 0x%x op %s rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS),
+		  __entry->rgbno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+);
 #endif /* CONFIG_XFS_RT */
 
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/39] xfs: enable realtime rmap btree
  2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-31 21:41   ` [PATCH 38/39] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
@ 2023-12-31 21:41   ` Darrick J. Wong
  38 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:41 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_super.c |    6 ------
 1 file changed, 6 deletions(-)


diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4a3119a48ef08..fbfd2963b2e2c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1741,12 +1741,6 @@ xfs_fs_fill_super(
 		}
 	}
 
-	if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) {
-		xfs_alert(mp,
-	"reverse mapping btree not compatible with realtime device!");
-		error = -EINVAL;
-		goto out_filestream_unmount;
-	}
 
 	if (xfs_has_parent(mp))
 		xfs_warn(mp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/10] xfs: give refcount btree cursor error tracepoints their own class
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
@ 2023-12-31 21:42   ` Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 02/10] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:42 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert all the refcount tracepoints to use the btree error tracepoint
class.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   42 ++++++++++++++----------------------------
 fs/xfs/xfs_trace.h           |   26 +++++++++++++-------------
 2 files changed, 27 insertions(+), 41 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 7f4433b2a5dd3..b7fe286589063 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -211,8 +211,7 @@ xfs_refcount_update(
 
 	error = xfs_btree_update(cur, &rec);
 	if (error)
-		trace_xfs_refcount_update_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_update_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -247,8 +246,7 @@ xfs_refcount_insert(
 
 out_error:
 	if (error)
-		trace_xfs_refcount_insert_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_insert_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -288,8 +286,7 @@ xfs_refcount_delete(
 			&found_rec);
 out_error:
 	if (error)
-		trace_xfs_refcount_delete_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_delete_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -438,8 +435,7 @@ xfs_refcount_split_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_split_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -522,8 +518,7 @@ xfs_refcount_merge_center_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -589,8 +584,7 @@ xfs_refcount_merge_left_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -658,8 +652,7 @@ xfs_refcount_merge_right_extent(
 	return error;
 
 out_error:
-	trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -753,8 +746,7 @@ xfs_refcount_find_left_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -848,8 +840,7 @@ xfs_refcount_find_right_extents(
 	return error;
 
 out_error:
-	trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1254,8 +1245,7 @@ xfs_refcount_adjust_extents(
 
 	return error;
 out_error:
-	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1315,8 +1305,7 @@ xfs_refcount_adjust(
 	return 0;
 
 out_error:
-	trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			error, _RET_IP_);
+	trace_xfs_refcount_adjust_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1630,8 +1619,7 @@ xfs_refcount_find_shared(
 
 out_error:
 	if (error)
-		trace_xfs_refcount_find_shared_error(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+		trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1786,8 +1774,7 @@ xfs_refcount_adjust_cow_extents(
 
 	return error;
 out_error:
-	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+	trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
 	return error;
 }
 
@@ -1833,8 +1820,7 @@ xfs_refcount_adjust_cow(
 	return 0;
 
 out_error:
-	trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			error, _RET_IP_);
+	trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 05d8ff68b09e2..925e87772f3d0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3517,9 +3517,9 @@ DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error);
 
 /* refcount adjustment tracepoints */
 DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
@@ -3534,20 +3534,20 @@ DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
 DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
 DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
 DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error);
 
 /* reflink helpers */
 DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
 DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error);
 
 DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/10] xfs: create specialized classes for refcount tracepoints
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 01/10] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
@ 2023-12-31 21:42   ` Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 03/10] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:42 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The only user of the "ag" tracepoint event classes is the refcount
btree, so rename them to make that obvious and make them take the btree
cursor to simplify the arguments.  This will save us a lot of trouble
later on.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   24 ++++++-----------
 fs/xfs/xfs_trace.h           |   61 +++++++++++++++++++++++++++---------------
 2 files changed, 48 insertions(+), 37 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index b7fe286589063..199199da3f210 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -51,7 +51,7 @@ xfs_refcount_lookup_le(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_LE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -71,7 +71,7 @@ xfs_refcount_lookup_ge(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_GE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -91,7 +91,7 @@ xfs_refcount_lookup_eq(
 	xfs_agblock_t		bno,
 	int			*stat)
 {
-	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+	trace_xfs_refcount_lookup(cur,
 			xfs_refcount_encode_startblock(bno, domain),
 			XFS_LOOKUP_LE);
 	cur->bc_rec.rc.rc_startblock = bno;
@@ -1262,11 +1262,9 @@ xfs_refcount_adjust(
 	int			error;
 
 	if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
-		trace_xfs_refcount_increase(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+		trace_xfs_refcount_increase(cur, *agbno, *aglen);
 	else
-		trace_xfs_refcount_decrease(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+		trace_xfs_refcount_decrease(cur, *agbno, *aglen);
 
 	/*
 	 * Ensure that no rcextents cross the boundary of the adjustment range.
@@ -1526,8 +1524,7 @@ xfs_refcount_find_shared(
 	int				have;
 	int				error;
 
-	trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_find_shared(cur, agbno, aglen);
 
 	/* By default, skip the whole range */
 	*fbno = NULLAGBLOCK;
@@ -1614,8 +1611,7 @@ xfs_refcount_find_shared(
 	}
 
 done:
-	trace_xfs_refcount_find_shared_result(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, *fbno, *flen);
+	trace_xfs_refcount_find_shared_result(cur, *fbno, *flen);
 
 out_error:
 	if (error)
@@ -1833,8 +1829,7 @@ __xfs_refcount_cow_alloc(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		aglen)
 {
-	trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_cow_increase(rcur, agbno, aglen);
 
 	/* Add refcount btree reservation */
 	return xfs_refcount_adjust_cow(rcur, agbno, aglen,
@@ -1850,8 +1845,7 @@ __xfs_refcount_cow_free(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		aglen)
 {
-	trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
-			agbno, aglen);
+	trace_xfs_refcount_cow_decrease(rcur, agbno, aglen);
 
 	/* Remove refcount btree reservation */
 	return xfs_refcount_adjust_cow(rcur, agbno, aglen,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 925e87772f3d0..c4589285c6ec2 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3232,17 +3232,41 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 
 /* refcount tracepoint classes */
 
-/* reuse the discard trace class for agbno/aglen-based traces */
-#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+DECLARE_EVENT_CLASS(xfs_refcount_class,
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+		xfs_extlen_t len),
+	TP_ARGS(cur, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+);
+#define DEFINE_REFCOUNT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_class, name, \
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \
+		xfs_extlen_t len), \
+	TP_ARGS(cur, agbno, len))
 
-/* ag btree lookup tracepoint class */
 TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
 TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
 TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
-DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 xfs_agblock_t agbno, xfs_lookup_t dir),
-	TP_ARGS(mp, agno, agbno, dir),
+TRACE_EVENT(xfs_refcount_lookup,
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+		xfs_lookup_t dir),
+	TP_ARGS(cur, agbno, dir),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3250,8 +3274,8 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
 		__field(xfs_lookup_t, dir)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->agbno = agbno;
 		__entry->dir = dir;
 	),
@@ -3263,12 +3287,6 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
 		  __entry->dir)
 )
 
-#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
-DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 xfs_agblock_t agbno, xfs_lookup_t dir), \
-	TP_ARGS(mp, agno, agbno, dir))
-
 /* single-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -3512,7 +3530,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
 	TP_ARGS(mp, agno, i1, i2, i3))
 
 /* refcount btree tracepoints */
-DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
@@ -3522,10 +3539,10 @@ DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error);
 DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error);
 
 /* refcount adjustment tracepoints */
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_increase);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_decrease);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease);
 DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
@@ -3545,8 +3562,8 @@ DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error);
 DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error);
 
 /* reflink helpers */
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result);
 DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error);
 
 DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/10] xfs: prepare refcount btree tracepoints for widening
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 01/10] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
  2023-12-31 21:42   ` [PATCH 02/10] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
@ 2023-12-31 21:42   ` Darrick J. Wong
  2023-12-31 21:43   ` [PATCH 04/10] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:42 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the rest of refcount btree tracepoints for use with realtime
reflink by making them take the btree cursor object as a parameter.
This will save us a lot of trouble later on.

Remove the xfs_refcount_recover_extent tracepoint since it's already
covered by other refcount tracepoints.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   42 ++++++++-------------
 fs/xfs/xfs_trace.h           |   83 +++++++++++++++++++-----------------------
 2 files changed, 53 insertions(+), 72 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 199199da3f210..afb68349cb95d 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -183,7 +183,7 @@ xfs_refcount_get_rec(
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, irec);
 
-	trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_get(cur, irec);
 	return 0;
 }
 
@@ -201,7 +201,7 @@ xfs_refcount_update(
 	uint32_t		start;
 	int			error;
 
-	trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_update(cur, irec);
 
 	start = xfs_refcount_encode_startblock(irec->rc_startblock,
 			irec->rc_domain);
@@ -228,7 +228,7 @@ xfs_refcount_insert(
 {
 	int				error;
 
-	trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_insert(cur, irec);
 
 	cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
 	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
@@ -273,7 +273,7 @@ xfs_refcount_delete(
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
-	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
+	trace_xfs_refcount_delete(cur, &irec);
 	error = xfs_btree_delete(cur, i);
 	if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
 		xfs_btree_mark_sick(cur);
@@ -410,8 +410,7 @@ xfs_refcount_split_extent(
 		return 0;
 
 	*shape_changed = true;
-	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			&rcext, agbno);
+	trace_xfs_refcount_split_extent(cur, &rcext, agbno);
 
 	/* Establish the right extent. */
 	tmp = rcext;
@@ -454,8 +453,7 @@ xfs_refcount_merge_center_extents(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_center_extents(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, left, center, right);
+	trace_xfs_refcount_merge_center_extents(cur, left, center, right);
 
 	ASSERT(left->rc_domain == center->rc_domain);
 	ASSERT(right->rc_domain == center->rc_domain);
@@ -536,8 +534,7 @@ xfs_refcount_merge_left_extent(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_left_extent(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, left, cleft);
+	trace_xfs_refcount_merge_left_extent(cur, left, cleft);
 
 	ASSERT(left->rc_domain == cleft->rc_domain);
 
@@ -601,8 +598,7 @@ xfs_refcount_merge_right_extent(
 	int				error;
 	int				found_rec;
 
-	trace_xfs_refcount_merge_right_extent(cur->bc_mp,
-			cur->bc_ag.pag->pag_agno, cright, right);
+	trace_xfs_refcount_merge_right_extent(cur, cright, right);
 
 	ASSERT(right->rc_domain == cright->rc_domain);
 
@@ -741,8 +737,7 @@ xfs_refcount_find_left_extents(
 		cleft->rc_refcount = 1;
 		cleft->rc_domain = domain;
 	}
-	trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			left, cleft, agbno);
+	trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno);
 	return error;
 
 out_error:
@@ -835,8 +830,8 @@ xfs_refcount_find_right_extents(
 		cright->rc_refcount = 1;
 		cright->rc_domain = domain;
 	}
-	trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			cright, right, agbno + aglen);
+	trace_xfs_refcount_find_right_extent(cur, cright, right,
+			agbno + aglen);
 	return error;
 
 out_error:
@@ -1139,8 +1134,7 @@ xfs_refcount_adjust_extents(
 			tmp.rc_refcount = 1 + adj;
 			tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
 
-			trace_xfs_refcount_modify_extent(cur->bc_mp,
-					cur->bc_ag.pag->pag_agno, &tmp);
+			trace_xfs_refcount_modify_extent(cur, &tmp);
 
 			/*
 			 * Either cover the hole (increment) or
@@ -1205,8 +1199,7 @@ xfs_refcount_adjust_extents(
 		if (ext.rc_refcount == MAXREFCOUNT)
 			goto skip;
 		ext.rc_refcount += adj;
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &ext);
+		trace_xfs_refcount_modify_extent(cur, &ext);
 		cur->bc_ag.refc.nr_ops++;
 		if (ext.rc_refcount > 1) {
 			error = xfs_refcount_update(cur, &ext);
@@ -1721,8 +1714,7 @@ xfs_refcount_adjust_cow_extents(
 		tmp.rc_refcount = 1;
 		tmp.rc_domain = XFS_REFC_DOMAIN_COW;
 
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &tmp);
+		trace_xfs_refcount_modify_extent(cur, &tmp);
 
 		error = xfs_refcount_insert(cur, &tmp,
 				&found_tmp);
@@ -1753,8 +1745,7 @@ xfs_refcount_adjust_cow_extents(
 		}
 
 		ext.rc_refcount = 0;
-		trace_xfs_refcount_modify_extent(cur->bc_mp,
-				cur->bc_ag.pag->pag_agno, &ext);
+		trace_xfs_refcount_modify_extent(cur, &ext);
 		error = xfs_refcount_delete(cur, &found_rec);
 		if (error)
 			goto out_error;
@@ -1988,9 +1979,6 @@ xfs_refcount_recover_cow_leftovers(
 		if (error)
 			goto out_free;
 
-		trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
-				&rr->rr_rrec);
-
 		/* Free the orphan record */
 		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
 				rr->rr_rrec.rc_startblock);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c4589285c6ec2..095d7f8a0e707 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3289,9 +3289,8 @@ TRACE_EVENT(xfs_refcount_lookup,
 
 /* single-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 struct xfs_refcount_irec *irec),
-	TP_ARGS(mp, agno, irec),
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec),
+	TP_ARGS(cur, irec),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3301,8 +3300,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 		__field(xfs_nlink_t, refcount)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->domain = irec->rc_domain;
 		__entry->startblock = irec->rc_startblock;
 		__entry->blockcount = irec->rc_blockcount;
@@ -3319,15 +3318,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 
 #define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_extent_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 struct xfs_refcount_irec *irec), \
-	TP_ARGS(mp, agno, irec))
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), \
+	TP_ARGS(cur, irec))
 
 /* single-rcext and an agbno tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
-	TP_ARGS(mp, agno, irec, agbno),
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec,
+		 xfs_agblock_t agbno),
+	TP_ARGS(cur, irec, agbno),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3338,8 +3336,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
 		__field(xfs_agblock_t, agbno)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->domain = irec->rc_domain;
 		__entry->startblock = irec->rc_startblock;
 		__entry->blockcount = irec->rc_blockcount;
@@ -3358,15 +3356,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
 
 #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
-	TP_ARGS(mp, agno, irec, agbno))
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \
+		 xfs_agblock_t agbno), \
+	TP_ARGS(cur, irec, agbno))
 
 /* double-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
-	TP_ARGS(mp, agno, i1, i2),
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+		struct xfs_refcount_irec *i2),
+	TP_ARGS(cur, i1, i2),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3380,8 +3378,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 		__field(xfs_nlink_t, i2_refcount)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3407,16 +3405,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 
 #define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
-	TP_ARGS(mp, agno, i1, i2))
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+		 struct xfs_refcount_irec *i2), \
+	TP_ARGS(cur, i1, i2))
 
 /* double-rcext and an agbno tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
-		 xfs_agblock_t agbno),
-	TP_ARGS(mp, agno, i1, i2, agbno),
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+		 struct xfs_refcount_irec *i2, xfs_agblock_t agbno),
+	TP_ARGS(cur, i1, i2, agbno),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3431,8 +3428,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 		__field(xfs_agblock_t, agbno)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3460,17 +3457,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 
 #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
-		 xfs_agblock_t agbno), \
-	TP_ARGS(mp, agno, i1, i2, agbno))
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+		struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \
+	TP_ARGS(cur, i1, i2, agbno))
 
 /* triple-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
-		 struct xfs_refcount_irec *i3),
-	TP_ARGS(mp, agno, i1, i2, i3),
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+		struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3),
+	TP_ARGS(cur, i1, i2, i3),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
@@ -3488,8 +3483,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 		__field(xfs_nlink_t, i3_refcount)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3524,10 +3519,9 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 
 #define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
-		 struct xfs_refcount_irec *i3), \
-	TP_ARGS(mp, agno, i1, i2, i3))
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+		struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), \
+	TP_ARGS(cur, i1, i2, i3))
 
 /* refcount btree tracepoints */
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
@@ -3545,7 +3539,6 @@ DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase);
 DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease);
 DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
-DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
 DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
 DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
 DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/10] xfs: clean up refcount log intent item tracepoint callsites
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:42   ` [PATCH 03/10] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
@ 2023-12-31 21:43   ` Darrick J. Wong
  2023-12-31 21:43   ` [PATCH 05/10] xfs: remove xfs_trans_set_refcount_flags Darrick J. Wong
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:43 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass the incore refcount intent structure to the tracepoints instead of
open-coding the argument passing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   14 +++-------
 fs/xfs/libxfs/xfs_refcount.h |    6 ++++
 fs/xfs/xfs_trace.c           |    1 +
 fs/xfs/xfs_trace.h           |   59 +++++++++++++-----------------------------
 4 files changed, 29 insertions(+), 51 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index afb68349cb95d..ad0ad30a12c29 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1367,9 +1367,7 @@ xfs_refcount_finish_one(
 
 	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
 
-	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
-			ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
-			ri->ri_blockcount);
+	trace_xfs_refcount_deferred(mp, ri);
 
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
 		return -EIO;
@@ -1432,8 +1430,7 @@ xfs_refcount_finish_one(
 		return -EFSCORRUPTED;
 	}
 	if (!error && ri->ri_blockcount > 0)
-		trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno,
-				ri->ri_type, bno, ri->ri_blockcount);
+		trace_xfs_refcount_finish_one_leftover(mp, ri);
 	return error;
 }
 
@@ -1449,11 +1446,6 @@ __xfs_refcount_add(
 {
 	struct xfs_refcount_intent	*ri;
 
-	trace_xfs_refcount_defer(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, startblock),
-			type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
-			blockcount);
-
 	ri = kmem_cache_alloc(xfs_refcount_intent_cache,
 			GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
@@ -1461,6 +1453,8 @@ __xfs_refcount_add(
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
+	trace_xfs_refcount_defer(tp->t_mountp, ri);
+
 	xfs_refcount_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
 }
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9b56768a590c0..01a20621192ed 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -48,6 +48,12 @@ enum xfs_refcount_intent_type {
 	XFS_REFCOUNT_FREE_COW,
 };
 
+#define XFS_REFCOUNT_INTENT_STRINGS \
+	{ XFS_REFCOUNT_INCREASE,	"incr" }, \
+	{ XFS_REFCOUNT_DECREASE,	"decr" }, \
+	{ XFS_REFCOUNT_ALLOC_COW,	"alloc_cow" }, \
+	{ XFS_REFCOUNT_FREE_COW,	"free_cow" }
+
 struct xfs_refcount_intent {
 	struct list_head			ri_list;
 	struct xfs_perag			*ri_pag;
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index f35927ebe2450..fe34d7c3882f3 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -46,6 +46,7 @@
 #include "xfs_imeta.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rmap.h"
+#include "xfs_refcount.h"
 
 static inline void
 xfs_rmapbt_crack_agno_opdev(
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 095d7f8a0e707..c8227f44a97c0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -94,6 +94,7 @@ struct xfs_imeta_update;
 struct xfs_rtgroup;
 struct xfs_extent_free_item;
 struct xfs_rmap_intent;
+struct xfs_refcount_intent;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -3559,66 +3560,42 @@ DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared);
 DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result);
 DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error);
 
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_INCREASE);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_DECREASE);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_ALLOC_COW);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_FREE_COW);
+
 DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
-	TP_ARGS(mp, agno, type, agbno, len),
+	TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc),
+	TP_ARGS(mp, refc),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, type)
+		__field(int, op)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
-		__entry->type = type;
-		__entry->agbno = agbno;
-		__entry->len = len;
+		__entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock);
+		__entry->op = refc->ri_type;
+		__entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
+		__entry->len = refc->ri_blockcount;
 	),
-	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+	TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
+		  __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len)
 );
 #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_deferred_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 int type, \
-		 xfs_agblock_t bno, \
-		 xfs_extlen_t len), \
-	TP_ARGS(mp, agno, type, bno, len))
+	TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), \
+	TP_ARGS(mp, refc))
 DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
 DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
-
-TRACE_EVENT(xfs_refcount_finish_one_leftover,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
-	TP_ARGS(mp, agno, type, agbno, len),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_agnumber_t, agno)
-		__field(int, type)
-		__field(xfs_agblock_t, agbno)
-		__field(xfs_extlen_t, len)
-	),
-	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
-		__entry->type = type;
-		__entry->agbno = agbno;
-		__entry->len = len;
-	),
-	TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
-		  __entry->agno,
-		  __entry->agbno,
-		  __entry->len)
-);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_finish_one_leftover);
 
 /* simple inode-based error/%ip tracepoint class */
 DECLARE_EVENT_CLASS(xfs_inode_error_class,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/10] xfs: remove xfs_trans_set_refcount_flags
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:43   ` [PATCH 04/10] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
@ 2023-12-31 21:43   ` Darrick J. Wong
  2023-12-31 21:43   ` [PATCH 06/10] xfs: add a ci_entry helper Darrick J. Wong
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:43 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Remove this single-use helper.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_refcount_item.c |   32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)


diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index ff1fdb759a8d2..b5087efa2ef28 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -243,25 +243,6 @@ xfs_refcount_update_diff_items(
 	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
 
-/* Set the phys extent flags for this reverse mapping. */
-static void
-xfs_trans_set_refcount_flags(
-	struct xfs_phys_extent		*pmap,
-	enum xfs_refcount_intent_type	type)
-{
-	pmap->pe_flags = 0;
-	switch (type) {
-	case XFS_REFCOUNT_INCREASE:
-	case XFS_REFCOUNT_DECREASE:
-	case XFS_REFCOUNT_ALLOC_COW:
-	case XFS_REFCOUNT_FREE_COW:
-		pmap->pe_flags |= type;
-		break;
-	default:
-		ASSERT(0);
-	}
-}
-
 /* Log refcount updates in the intent item. */
 STATIC void
 xfs_refcount_update_log_item(
@@ -282,7 +263,18 @@ xfs_refcount_update_log_item(
 	pmap = &cuip->cui_format.cui_extents[next_extent];
 	pmap->pe_startblock = ri->ri_startblock;
 	pmap->pe_len = ri->ri_blockcount;
-	xfs_trans_set_refcount_flags(pmap, ri->ri_type);
+
+	pmap->pe_flags = 0;
+	switch (ri->ri_type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		pmap->pe_flags |= ri->ri_type;
+		break;
+	default:
+		ASSERT(0);
+	}
 }
 
 static struct xfs_log_item *


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/10] xfs: add a ci_entry helper
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:43   ` [PATCH 05/10] xfs: remove xfs_trans_set_refcount_flags Darrick J. Wong
@ 2023-12-31 21:43   ` Darrick J. Wong
  2023-12-31 21:43   ` [PATCH 07/10] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:43 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a helper to translate from the item list head to the
refcount_intent_item structure and use it so shorten assignments and
avoid the need for extra local variables.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_refcount_item.c |   20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)


diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index b5087efa2ef28..652507c61573b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -227,6 +227,11 @@ static const struct xfs_item_ops xfs_cud_item_ops = {
 	.iop_intent	= xfs_cud_item_intent,
 };
 
+static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_refcount_intent, ri_list);
+}
+
 /* Sort refcount intents by AG. */
 static int
 xfs_refcount_update_diff_items(
@@ -234,11 +239,8 @@ xfs_refcount_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_refcount_intent	*ra;
-	struct xfs_refcount_intent	*rb;
-
-	ra = container_of(a, struct xfs_refcount_intent, ri_list);
-	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	struct xfs_refcount_intent	*ra = ci_entry(a);
+	struct xfs_refcount_intent	*rb = ci_entry(b);
 
 	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
@@ -341,11 +343,9 @@ xfs_refcount_update_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_refcount_intent	*ri;
+	struct xfs_refcount_intent	*ri = ci_entry(item);
 	int				error;
 
-	ri = container_of(item, struct xfs_refcount_intent, ri_list);
-
 	/* Did we run out of reservation?  Requeue what we didn't finish. */
 	error = xfs_refcount_finish_one(tp, ri, state);
 	if (!error && ri->ri_blockcount > 0) {
@@ -372,9 +372,7 @@ STATIC void
 xfs_refcount_update_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_refcount_intent	*ri;
-
-	ri = container_of(item, struct xfs_refcount_intent, ri_list);
+	struct xfs_refcount_intent	*ri = ci_entry(item);
 
 	xfs_refcount_update_put_group(ri);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/10] xfs: reuse xfs_refcount_update_cancel_item
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:43   ` [PATCH 06/10] xfs: add a ci_entry helper Darrick J. Wong
@ 2023-12-31 21:43   ` Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 08/10] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:43 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reuse xfs_refcount_update_cancel_item to put the AG/RTG and free the
item in a few places that currently open code the logic.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_refcount_item.c |   25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 652507c61573b..4113100dd1e5b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -335,6 +335,17 @@ xfs_refcount_update_put_group(
 	xfs_perag_intent_put(ri->ri_pag);
 }
 
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+
+	xfs_refcount_update_put_group(ri);
+	kmem_cache_free(xfs_refcount_intent_cache, ri);
+}
+
 /* Process a deferred refcount update. */
 STATIC int
 xfs_refcount_update_finish_item(
@@ -354,8 +365,7 @@ xfs_refcount_update_finish_item(
 		return -EAGAIN;
 	}
 
-	xfs_refcount_update_put_group(ri);
-	kmem_cache_free(xfs_refcount_intent_cache, ri);
+	xfs_refcount_update_cancel_item(item);
 	return error;
 }
 
@@ -367,17 +377,6 @@ xfs_refcount_update_abort_intent(
 	xfs_cui_release(CUI_ITEM(intent));
 }
 
-/* Cancel a deferred refcount update. */
-STATIC void
-xfs_refcount_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_refcount_intent	*ri = ci_entry(item);
-
-	xfs_refcount_update_put_group(ri);
-	kmem_cache_free(xfs_refcount_intent_cache, ri);
-}
-
 /* Is this recovered CUI ok? */
 static inline bool
 xfs_cui_validate_phys(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/10] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:43   ` [PATCH 07/10] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
@ 2023-12-31 21:44   ` Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 10/10] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:44 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

In xfs_refcount_finish_one we know the cursor is non-zero when calling
xfs_refcount_finish_one_cleanup and we pass a 0 error variable.  This
means xfs_refcount_finish_one_cleanup is just doing a
xfs_btree_del_cursor.

Open code that and move xfs_refcount_finish_one_cleanup to
fs/xfs/xfs_refcount_item.c.

Inspired-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   19 +------------------
 fs/xfs/libxfs/xfs_refcount.h |    2 --
 fs/xfs/xfs_refcount_item.c   |   18 ++++++++++++++++++
 3 files changed, 19 insertions(+), 20 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index ad0ad30a12c29..0d1b17a4df6a1 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1300,23 +1300,6 @@ xfs_refcount_adjust(
 	return error;
 }
 
-/* Clean up after calling xfs_refcount_finish_one. */
-void
-xfs_refcount_finish_one_cleanup(
-	struct xfs_trans	*tp,
-	struct xfs_btree_cur	*rcur,
-	int			error)
-{
-	struct xfs_buf		*agbp;
-
-	if (rcur == NULL)
-		return;
-	agbp = rcur->bc_ag.agbp;
-	xfs_btree_del_cursor(rcur, error);
-	if (error)
-		xfs_trans_brelse(tp, agbp);
-}
-
 /*
  * Set up a continuation a deferred refcount operation by updating the intent.
  * Checks to make sure we're not going to run off the end of the AG.
@@ -1380,7 +1363,7 @@ xfs_refcount_finish_one(
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		nr_ops = rcur->bc_ag.refc.nr_ops;
 		shape_changes = rcur->bc_ag.refc.shape_changes;
-		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
 	}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 01a20621192ed..c94b8f71d407b 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -82,8 +82,6 @@ void xfs_refcount_increase_extent(struct xfs_trans *tp,
 void xfs_refcount_decrease_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 
-extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
-		struct xfs_btree_cur *rcur, int error);
 extern int xfs_refcount_finish_one(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
 
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 4113100dd1e5b..b36d8433098c8 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -21,6 +21,7 @@
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
+#include "xfs_btree.h"
 
 struct kmem_cache	*xfs_cui_cache;
 struct kmem_cache	*xfs_cud_cache;
@@ -369,6 +370,23 @@ xfs_refcount_update_finish_item(
 	return error;
 }
 
+/* Clean up after calling xfs_refcount_finish_one. */
+STATIC void
+xfs_refcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp;
+
+	if (rcur == NULL)
+		return;
+	agbp = rcur->bc_ag.agbp;
+	xfs_btree_del_cursor(rcur, error);
+	if (error)
+		xfs_trans_brelse(tp, agbp);
+}
+
 /* Abort all pending CUIs. */
 STATIC void
 xfs_refcount_update_abort_intent(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/10] xfs: simplify usage of the rcur local variable in xfs_refcount_finish_one
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:44   ` [PATCH 08/10] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
@ 2023-12-31 21:44   ` Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 10/10] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:44 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Only update rcur when we know the final *pcur value.

Inspired-by: Christoph Hellwig <hch@lst.de>
[djwong: don't leave the caller with a dangling ref]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 0d1b17a4df6a1..ccb2a1fa5f589 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1341,7 +1341,7 @@ xfs_refcount_finish_one(
 	struct xfs_btree_cur		**pcur)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_btree_cur		*rcur;
+	struct xfs_btree_cur		*rcur = *pcur;
 	struct xfs_buf			*agbp = NULL;
 	int				error = 0;
 	xfs_agblock_t			bno;
@@ -1359,7 +1359,6 @@ xfs_refcount_finish_one(
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
-	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		nr_ops = rcur->bc_ag.refc.nr_ops;
 		shape_changes = rcur->bc_ag.refc.shape_changes;
@@ -1373,11 +1372,11 @@ xfs_refcount_finish_one(
 		if (error)
 			return error;
 
-		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+		*pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
+							  ri->ri_pag);
 		rcur->bc_ag.refc.nr_ops = nr_ops;
 		rcur->bc_ag.refc.shape_changes = shape_changes;
 	}
-	*pcur = rcur;
 
 	switch (ri->ri_type) {
 	case XFS_REFCOUNT_INCREASE:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/10] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c
  2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:44   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
@ 2023-12-31 21:44   ` Darrick J. Wong
  9 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:44 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the code that adds the incore xfs_refcount_update_item deferred
work data to a transaction live with the CUI log item code.  This means
that the refcount code no longer has to know about the inner workings of
the CUI log items.

As a consequence, we can get rid of the _{get,put}_group helpers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |    6 ++----
 fs/xfs/libxfs/xfs_refcount.h |    3 ---
 fs/xfs/xfs_refcount_item.c   |   24 +++++++++++-------------
 fs/xfs/xfs_refcount_item.h   |    5 +++++
 4 files changed, 18 insertions(+), 20 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index ccb2a1fa5f589..edabdf1a97d4b 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -24,6 +24,7 @@
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
+#include "xfs_refcount_item.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -1435,10 +1436,7 @@ __xfs_refcount_add(
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
-	trace_xfs_refcount_defer(tp->t_mountp, ri);
-
-	xfs_refcount_update_get_group(tp->t_mountp, ri);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+	xfs_refcount_defer_add(tp, ri);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index c94b8f71d407b..68acb0b1b4a87 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -74,9 +74,6 @@ xfs_refcount_check_domain(
 	return true;
 }
 
-void xfs_refcount_update_get_group(struct xfs_mount *mp,
-		struct xfs_refcount_intent *ri);
-
 void xfs_refcount_increase_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 void xfs_refcount_decrease_extent(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index b36d8433098c8..bec3b91e826a4 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -22,6 +22,7 @@
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
 #include "xfs_btree.h"
+#include "xfs_trace.h"
 
 struct kmem_cache	*xfs_cui_cache;
 struct kmem_cache	*xfs_cud_cache;
@@ -319,21 +320,18 @@ xfs_refcount_update_create_done(
 	return &cudp->cud_item;
 }
 
-/* Take a passive ref to the AG containing the space we're refcounting. */
+/* Add this deferred CUI to the transaction. */
 void
-xfs_refcount_update_get_group(
-	struct xfs_mount		*mp,
+xfs_refcount_defer_add(
+	struct xfs_trans		*tp,
 	struct xfs_refcount_intent	*ri)
 {
+	struct xfs_mount		*mp = tp->t_mountp;
+
+	trace_xfs_refcount_defer(mp, ri);
+
 	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
-}
-
-/* Release a passive AG ref after finishing refcounting work. */
-static inline void
-xfs_refcount_update_put_group(
-	struct xfs_refcount_intent	*ri)
-{
-	xfs_perag_intent_put(ri->ri_pag);
+	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
 }
 
 /* Cancel a deferred refcount update. */
@@ -343,7 +341,7 @@ xfs_refcount_update_cancel_item(
 {
 	struct xfs_refcount_intent	*ri = ci_entry(item);
 
-	xfs_refcount_update_put_group(ri);
+	xfs_perag_intent_put(ri->ri_pag);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);
 }
 
@@ -433,7 +431,7 @@ xfs_cui_recover_work(
 	ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
 	ri->ri_startblock = pmap->pe_startblock;
 	ri->ri_blockcount = pmap->pe_len;
-	xfs_refcount_update_get_group(mp, ri);
+	ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock);
 
 	xfs_defer_add_item(dfp, &ri->ri_list);
 }
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index eb0ab13682d0b..bfee8f30c63ce 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -71,4 +71,9 @@ struct xfs_cud_log_item {
 extern struct kmem_cache	*xfs_cui_cache;
 extern struct kmem_cache	*xfs_cud_cache;
 
+struct xfs_refcount_intent;
+
+void xfs_refcount_defer_add(struct xfs_trans *tp,
+		struct xfs_refcount_intent *ri);
+
 #endif	/* __XFS_REFCOUNT_ITEM_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/44] xfs: prepare refcount btree cursor tracepoints for realtime
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
@ 2023-12-31 21:44   ` Darrick J. Wong
  2023-12-31 21:45   ` [PATCH 02/44] xfs: introduce realtime refcount btree definitions Darrick J. Wong
                     ` (42 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:44 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rework the refcount btree cursor tracepoints in preparation to handle the
realtime refcount btree cursor.  Mostly this involves renaming the field to
"refcbno" and extracting the group number from the cursor when possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_trace.c |    9 ++++
 fs/xfs/xfs_trace.h |  114 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 74 insertions(+), 49 deletions(-)


diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index fe34d7c3882f3..cb712873ce927 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -66,6 +66,15 @@ xfs_rmapbt_crack_agno_opdev(
 	}
 }
 
+static inline void
+xfs_refcountbt_crack_agno_opdev(
+	struct xfs_btree_cur	*cur,
+	xfs_agnumber_t		*agno,
+	dev_t			*opdev)
+{
+	return xfs_rmapbt_crack_agno_opdev(cur, agno, opdev);
+}
+
 /*
  * We include this last to have the helpers above available for the trace
  * event implementations.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8227f44a97c0..fd4170a6aea43 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -22,6 +22,8 @@
  *
  * rmapbno: physical block number for a reverse mapping.  This is an agbno for
  *          per-AG rmap btrees or a rgbno for realtime rmap btrees.
+ * refcbno: physical block number for a refcount record.  This is an agbno for
+ *          per-AG refcount btrees or a rgbno for realtime refcount btrees.
  *
  * daddr: physical block number in 512b blocks
  * bbcount: number of blocks in a physical extent, in 512b blocks
@@ -3234,56 +3236,60 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 /* refcount tracepoint classes */
 
 DECLARE_EVENT_CLASS(xfs_refcount_class,
-	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t refcbno,
 		xfs_extlen_t len),
-	TP_ARGS(cur, agbno, len),
+	TP_ARGS(cur, refcbno, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, refcbno)
 		__field(xfs_extlen_t, len)
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
-		__entry->agbno = agbno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
+		__entry->refcbno = refcbno;
 		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x refcbno 0x%x fsbcount 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->refcbno,
 		  __entry->len)
 );
 #define DEFINE_REFCOUNT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_class, name, \
-	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t refcbno, \
 		xfs_extlen_t len), \
-	TP_ARGS(cur, agbno, len))
+	TP_ARGS(cur, refcbno, len))
 
 TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
 TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
 TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
 TRACE_EVENT(xfs_refcount_lookup,
-	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t refcbno,
 		xfs_lookup_t dir),
-	TP_ARGS(cur, agbno, dir),
+	TP_ARGS(cur, refcbno, dir),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, refcbno)
 		__field(xfs_lookup_t, dir)
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
-		__entry->agbno = agbno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
+		__entry->refcbno = refcbno;
 		__entry->dir = dir;
 	),
-	TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x refcbno 0x%x cmp %s(%d)",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->refcbno,
 		  __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
 		  __entry->dir)
 )
@@ -3294,6 +3300,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 	TP_ARGS(cur, irec),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
 		__field(enum xfs_refc_domain, domain)
 		__field(xfs_agblock_t, startblock)
@@ -3302,14 +3309,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->domain = irec->rc_domain;
 		__entry->startblock = irec->rc_startblock;
 		__entry->blockcount = irec->rc_blockcount;
 		__entry->refcount = irec->rc_refcount;
 	),
-	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x dom %s refcbno 0x%x fsbcount 0x%x refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
 		  __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
 		  __entry->startblock,
@@ -3325,49 +3333,52 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \
 /* single-rcext and an agbno tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
 	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec,
-		 xfs_agblock_t agbno),
-	TP_ARGS(cur, irec, agbno),
+		 xfs_agblock_t refcbno),
+	TP_ARGS(cur, irec, refcbno),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
 		__field(enum xfs_refc_domain, domain)
 		__field(xfs_agblock_t, startblock)
 		__field(xfs_extlen_t, blockcount)
 		__field(xfs_nlink_t, refcount)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, refcbno)
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->domain = irec->rc_domain;
 		__entry->startblock = irec->rc_startblock;
 		__entry->blockcount = irec->rc_blockcount;
 		__entry->refcount = irec->rc_refcount;
-		__entry->agbno = agbno;
+		__entry->refcbno = refcbno;
 	),
-	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x dom %s refcbno 0x%x fsbcount 0x%x refcount %u @ refcbno 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
 		  __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
 		  __entry->startblock,
 		  __entry->blockcount,
 		  __entry->refcount,
-		  __entry->agbno)
+		  __entry->refcbno)
 )
 
 #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
 	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \
-		 xfs_agblock_t agbno), \
-	TP_ARGS(cur, irec, agbno))
+		 xfs_agblock_t refcbno), \
+	TP_ARGS(cur, irec, refcbno))
 
 /* double-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
-		struct xfs_refcount_irec *i2),
+		 struct xfs_refcount_irec *i2),
 	TP_ARGS(cur, i1, i2),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
 		__field(enum xfs_refc_domain, i1_domain)
 		__field(xfs_agblock_t, i1_startblock)
@@ -3380,7 +3391,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3390,9 +3401,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 		__entry->i2_blockcount = i2->rc_blockcount;
 		__entry->i2_refcount = i2->rc_refcount;
 	),
-	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
-		  "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x dom %s refcbno 0x%x fsbcount 0x%x refcount %u -- "
+		  "dom %s refcbno 0x%x fsbcount 0x%x refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
 		  __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
 		  __entry->i1_startblock,
@@ -3413,10 +3425,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
 /* double-rcext and an agbno tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
-		 struct xfs_refcount_irec *i2, xfs_agblock_t agbno),
-	TP_ARGS(cur, i1, i2, agbno),
+		 struct xfs_refcount_irec *i2, xfs_agblock_t refcbno),
+	TP_ARGS(cur, i1, i2, refcbno),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
 		__field(enum xfs_refc_domain, i1_domain)
 		__field(xfs_agblock_t, i1_startblock)
@@ -3426,11 +3439,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 		__field(xfs_agblock_t, i2_startblock)
 		__field(xfs_extlen_t, i2_blockcount)
 		__field(xfs_nlink_t, i2_refcount)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, refcbno)
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3439,11 +3452,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 		__entry->i2_startblock = i2->rc_startblock;
 		__entry->i2_blockcount = i2->rc_blockcount;
 		__entry->i2_refcount = i2->rc_refcount;
-		__entry->agbno = agbno;
+		__entry->refcbno = refcbno;
 	),
-	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
-		  "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x dom %s refcbno 0x%x fsbcount 0x%x refcount %u -- "
+		  "dom %s refcbno 0x%x fsbcount 0x%x refcount %u @ refcbno 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
 		  __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
 		  __entry->i1_startblock,
@@ -3453,14 +3467,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 		  __entry->i2_startblock,
 		  __entry->i2_blockcount,
 		  __entry->i2_refcount,
-		  __entry->agbno)
+		  __entry->refcbno)
 )
 
 #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
 DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
 	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
-		struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \
-	TP_ARGS(cur, i1, i2, agbno))
+		struct xfs_refcount_irec *i2, xfs_agblock_t refcbno), \
+	TP_ARGS(cur, i1, i2, refcbno))
 
 /* triple-rcext tracepoint class */
 DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
@@ -3469,6 +3483,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 	TP_ARGS(cur, i1, i2, i3),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(dev_t, opdev)
 		__field(xfs_agnumber_t, agno)
 		__field(enum xfs_refc_domain, i1_domain)
 		__field(xfs_agblock_t, i1_startblock)
@@ -3485,7 +3500,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->agno = cur->bc_ag.pag->pag_agno;
+		xfs_refcountbt_crack_agno_opdev(cur, &__entry->agno, &__entry->opdev);
 		__entry->i1_domain = i1->rc_domain;
 		__entry->i1_startblock = i1->rc_startblock;
 		__entry->i1_blockcount = i1->rc_blockcount;
@@ -3499,10 +3514,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 		__entry->i3_blockcount = i3->rc_blockcount;
 		__entry->i3_refcount = i3->rc_refcount;
 	),
-	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
-		  "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
-		  "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+	TP_printk("dev %d:%d opdev %d:%d agno 0x%x dom %s refcbno 0x%x fsbcount 0x%x refcount %u -- "
+		  "dom %s refcbno 0x%x fsbcount 0x%x refcount %u -- "
+		  "dom %s refcbno 0x%x fsbcount 0x%x refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
 		  __entry->agno,
 		  __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
 		  __entry->i1_startblock,
@@ -3572,21 +3588,21 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(int, op)
-		__field(xfs_agblock_t, agbno)
+		__field(xfs_agblock_t, refcbno)
 		__field(xfs_extlen_t, len)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock);
 		__entry->op = refc->ri_type;
-		__entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
+		__entry->refcbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
 		__entry->len = refc->ri_blockcount;
 	),
-	TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x",
+	TP_printk("dev %d:%d op %s agno 0x%x refcbno 0x%x fsbcount 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS),
 		  __entry->agno,
-		  __entry->agbno,
+		  __entry->refcbno,
 		  __entry->len)
 );
 #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/44] xfs: introduce realtime refcount btree definitions
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 01/44] xfs: prepare refcount btree cursor tracepoints for realtime Darrick J. Wong
@ 2023-12-31 21:45   ` Darrick J. Wong
  2023-12-31 21:45   ` [PATCH 03/44] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
                     ` (41 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:45 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add new realtime refcount btree definitions. The realtime refcount btree
will be rooted from a hidden inode, but has its own shape and therefore
needs to have most of its own separate types.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.h  |    1 +
 fs/xfs/libxfs/xfs_format.h |    6 ++++++
 fs/xfs/libxfs/xfs_types.h  |    5 +++--
 fs/xfs/scrub/trace.h       |    1 +
 fs/xfs/xfs_trace.h         |    1 +
 5 files changed, 12 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4753a5c847616..f58240adda6f4 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -65,6 +65,7 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 #define	XFS_BTNUM_RCBAG	((xfs_btnum_t)XFS_BTNUM_RCBAGi)
 #define	XFS_BTNUM_RTRMAP ((xfs_btnum_t)XFS_BTNUM_RTRMAPi)
+#define	XFS_BTNUM_RTREFC ((xfs_btnum_t)XFS_BTNUM_RTREFCi)
 
 struct xfs_btree_ops;
 uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 1c1910256a927..0dc169fde2e3d 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1815,6 +1815,12 @@ struct xfs_refcount_key {
 /* btree pointer type */
 typedef __be32 xfs_refcount_ptr_t;
 
+/*
+ * Realtime Reference Count btree format definitions
+ *
+ * This is a btree for reference count records for realtime volumes
+ */
+#define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
 /*
  * BMAP Btree format definitions
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b3edc57dc65bd..4147ba288ec18 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -126,7 +126,7 @@ typedef enum {
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
 	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_RCBAGi,
-	XFS_BTNUM_RTRMAPi, XFS_BTNUM_MAX
+	XFS_BTNUM_RTRMAPi, XFS_BTNUM_RTREFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 #define XFS_BTNUM_STRINGS \
@@ -138,7 +138,8 @@ typedef enum {
 	{ XFS_BTNUM_FINOi,	"finobt" }, \
 	{ XFS_BTNUM_REFCi,	"refcbt" }, \
 	{ XFS_BTNUM_RCBAGi,	"rcbagbt" }, \
-	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }
+	{ XFS_BTNUM_RTRMAPi,	"rtrmapbt" }, \
+	{ XFS_BTNUM_RTREFCi,	"rtrefcbt" }
 
 struct xfs_name {
 	const unsigned char	*name;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 65e0872792e1f..72b5277f4ba6d 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RCBAGi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RTRMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RTREFCi);
 
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fd4170a6aea43..86e0aa946aa00 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2555,6 +2555,7 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RCBAGi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RTRMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RTREFCi);
 
 DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/44] xfs: namespace the maximum length/refcount symbols
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
  2023-12-31 21:44   ` [PATCH 01/44] xfs: prepare refcount btree cursor tracepoints for realtime Darrick J. Wong
  2023-12-31 21:45   ` [PATCH 02/44] xfs: introduce realtime refcount btree definitions Darrick J. Wong
@ 2023-12-31 21:45   ` Darrick J. Wong
  2023-12-31 21:45   ` [PATCH 04/44] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
                     ` (40 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:45 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Actually namespace these variables properly, so that readers can tell
that this is an XFS symbol, and that it's for the refcount
functionality.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h     |    4 ++--
 fs/xfs/libxfs/xfs_refcount.c   |   18 +++++++++---------
 fs/xfs/scrub/refcount.c        |    2 +-
 fs/xfs/scrub/refcount_repair.c |    4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 0dc169fde2e3d..473bdc2a1ad10 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1809,8 +1809,8 @@ struct xfs_refcount_key {
 	__be32		rc_startblock;	/* starting block number */
 };
 
-#define MAXREFCOUNT	((xfs_nlink_t)~0U)
-#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)
+#define XFS_REFC_REFCOUNT_MAX	((xfs_nlink_t)~0U)
+#define XFS_REFC_LEN_MAX	((xfs_extlen_t)~0U)
 
 /* btree pointer type */
 typedef __be32 xfs_refcount_ptr_t;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index edabdf1a97d4b..b29a718737c59 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -128,7 +128,7 @@ xfs_refcount_check_irec(
 	struct xfs_perag		*pag,
 	const struct xfs_refcount_irec	*irec)
 {
-	if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+	if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
 		return __this_address;
 
 	if (!xfs_refcount_check_domain(irec))
@@ -138,7 +138,7 @@ xfs_refcount_check_irec(
 	if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
 		return __this_address;
 
-	if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+	if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
 		return __this_address;
 
 	return NULL;
@@ -853,9 +853,9 @@ xfs_refc_merge_refcount(
 	const struct xfs_refcount_irec	*irec,
 	enum xfs_refc_adjust_op		adjust)
 {
-	/* Once a record hits MAXREFCOUNT, it is pinned there forever */
-	if (irec->rc_refcount == MAXREFCOUNT)
-		return MAXREFCOUNT;
+	/* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */
+	if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX)
+		return XFS_REFC_REFCOUNT_MAX;
 	return irec->rc_refcount + adjust;
 }
 
@@ -898,7 +898,7 @@ xfs_refc_want_merge_center(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cleft->rc_blockcount + right->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	*ulenp = ulen;
@@ -933,7 +933,7 @@ xfs_refc_want_merge_left(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cleft->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	return true;
@@ -967,7 +967,7 @@ xfs_refc_want_merge_right(
 	 * hence we need to catch u32 addition overflows here.
 	 */
 	ulen += cright->rc_blockcount;
-	if (ulen >= MAXREFCEXTLEN)
+	if (ulen >= XFS_REFC_LEN_MAX)
 		return false;
 
 	return true;
@@ -1197,7 +1197,7 @@ xfs_refcount_adjust_extents(
 		 * Adjust the reference count and either update the tree
 		 * (incr) or free the blocks (decr).
 		 */
-		if (ext.rc_refcount == MAXREFCOUNT)
+		if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX)
 			goto skip;
 		ext.rc_refcount += adj;
 		trace_xfs_refcount_modify_extent(cur, &ext);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index d0c7d4a29c0fe..31a0461e2a0e3 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -421,7 +421,7 @@ xchk_refcount_mergeable(
 	if (r1->rc_refcount != r2->rc_refcount)
 		return false;
 	if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
-			MAXREFCEXTLEN)
+			XFS_REFC_LEN_MAX)
 		return false;
 
 	return true;
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index b485c5cc9f290..d4a42f8b7e95b 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -183,7 +183,7 @@ xrep_refc_stash(
 	if (xchk_should_terminate(sc, &error))
 		return error;
 
-	irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+	irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
 
 	error = xrep_refc_check_ext(rr->sc, &irec);
 	if (error)
@@ -422,7 +422,7 @@ xrep_refc_find_refcounts(
 	/*
 	 * Set up a bag to store all the rmap records that we're tracking to
 	 * generate a reference count record.  If the size of the bag exceeds
-	 * MAXREFCOUNT, we clamp rc_refcount.
+	 * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
 	 */
 	error = rcbag_init(sc->mp, sc->xfile_buftarg, &rcstack);
 	if (error)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/44] xfs: define the on-disk realtime refcount btree format
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:45   ` [PATCH 03/44] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
@ 2023-12-31 21:45   ` Darrick J. Wong
  2023-12-31 21:45   ` [PATCH 05/44] xfs: realtime refcount btree transaction reservations Darrick J. Wong
                     ` (39 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:45 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Start filling out the rtrefcount btree implementation. Start with the
on-disk btree format; add everything needed to read, write and
manipulate refcount btree blocks. This prepares the way for connecting
the btree operations implementation.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile                      |    1 
 fs/xfs/libxfs/xfs_btree.c            |    5 +
 fs/xfs/libxfs/xfs_btree.h            |   11 +
 fs/xfs/libxfs/xfs_format.h           |    3 
 fs/xfs/libxfs/xfs_ondisk.h           |    1 
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |  312 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |   71 ++++++++
 fs/xfs/libxfs/xfs_sb.c               |    8 +
 fs/xfs/libxfs/xfs_shared.h           |    2 
 fs/xfs/xfs_mount.c                   |    7 +
 fs/xfs/xfs_mount.h                   |    9 +
 11 files changed, 425 insertions(+), 5 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_rtrefcount_btree.h


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f9092ae77e684..783fc053d7be9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -48,6 +48,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_rmap_btree.o \
 				   xfs_refcount.o \
 				   xfs_refcount_btree.o \
+				   xfs_rtrefcount_btree.o \
 				   xfs_rtrmap_btree.o \
 				   xfs_sb.o \
 				   xfs_swapext.o \
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2a181cf30299f..be484f86da985 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -37,6 +37,7 @@
 #include "xfs_rmap.h"
 #include "xfs_quota.h"
 #include "xfs_imeta.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Btree magic numbers.
@@ -5538,6 +5539,9 @@ xfs_btree_init_cur_caches(void)
 	if (error)
 		goto err;
 	error = xfs_rtrmapbt_init_cur_cache();
+	if (error)
+		goto err;
+	error = xfs_rtrefcountbt_init_cur_cache();
 	if (error)
 		goto err;
 
@@ -5557,6 +5561,7 @@ xfs_btree_destroy_cur_caches(void)
 	xfs_rmapbt_destroy_cur_cache();
 	xfs_refcountbt_destroy_cur_cache();
 	xfs_rtrmapbt_destroy_cur_cache();
+	xfs_rtrefcountbt_destroy_cur_cache();
 }
 
 /* Move the btree cursor before the first record. */
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index f58240adda6f4..64e37a0ffb78e 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -229,6 +229,11 @@ union xfs_btree_irec {
 	struct xfs_refcount_irec	rc;
 };
 
+struct xbtree_refc {
+	unsigned int	nr_ops;		/* # record updates */
+	unsigned int	shape_changes;	/* # of extent splits */
+};
+
 /* Per-AG btree information. */
 struct xfs_btree_cur_ag {
 	struct xfs_perag		*pag;
@@ -237,10 +242,7 @@ struct xfs_btree_cur_ag {
 		struct xbtree_afakeroot	*afake;	/* for staging cursor */
 	};
 	union {
-		struct {
-			unsigned int	nr_ops;	/* # record updates */
-			unsigned int	shape_changes;	/* # of extent splits */
-		} refc;
+		struct xbtree_refc	refc;
 		struct {
 			bool		active;	/* allocation cursor state */
 		} abt;
@@ -261,6 +263,7 @@ struct xfs_btree_cur_ino {
 
 /* For extent swap, ignore owner check in verifier */
 #define	XFS_BTCUR_BMBT_INVALID_OWNER	(1 << 1)
+	struct xbtree_refc		refc;
 };
 
 /* In-memory btree information */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 473bdc2a1ad10..c938b814c430d 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1822,6 +1822,9 @@ typedef __be32 xfs_refcount_ptr_t;
  */
 #define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
+/* inode-rooted btree pointer type */
+typedef __be64 xfs_rtrefcount_ptr_t;
+
 /*
  * BMAP Btree format definitions
  *
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 102a3574fc682..242b683125662 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -79,6 +79,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
+	XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t,		8);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
new file mode 100644
index 0000000000000..f99c7167183e5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+
+static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
+
+/*
+ * Realtime Reference Count btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device.  See the comments in xfs_refcount_btree.c for more information.
+ *
+ * This tree is basically the same as the regular refcount btree except that
+ * it's rooted in an inode.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_rtrefcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_ino.rtg, cur->bc_ino.ip);
+
+	/* Copy the flags values since init cursor doesn't get them. */
+	new->bc_ino.flags = cur->bc_ino.flags;
+
+	return new;
+}
+
+static xfs_failaddr_t
+xfs_rtrefcountbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	int			level;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (!xfs_has_reflink(mp))
+		return __this_address;
+	fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+	level = be16_to_cpu(block->bb_level);
+	if (level > mp->m_rtrefc_maxlevels)
+		return __this_address;
+
+	return xfs_btree_lblock_verify(bp, mp->m_rtrefc_mxr[level != 0]);
+}
+
+static void
+xfs_rtrefcountbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtrefcountbt_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+
+	if (bp->b_error)
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrefcountbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	fa = xfs_rtrefcountbt_verify(bp);
+	if (fa) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
+	.name			= "xfs_rtrefcountbt",
+	.magic			= { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) },
+	.verify_read		= xfs_rtrefcountbt_read_verify,
+	.verify_write		= xfs_rtrefcountbt_write_verify,
+	.verify_struct		= xfs_rtrefcountbt_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
+	.rec_len		= sizeof(struct xfs_refcount_rec),
+	.key_len		= sizeof(struct xfs_refcount_key),
+	.lru_refs		= XFS_REFC_BTREE_REF,
+	.geom_flags		= XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE |
+				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_IROOT_RECORDS,
+
+	.dup_cursor		= xfs_rtrefcountbt_dup_cursor,
+	.buf_ops		= &xfs_rtrefcountbt_buf_ops,
+};
+
+/* Initialize a new rt refcount btree cursor. */
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RTREFC,
+			&xfs_rtrefcountbt_ops, mp->m_rtrefc_maxlevels,
+			xfs_rtrefcountbt_cur_cache);
+	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
+
+	cur->bc_ino.ip = ip;
+	cur->bc_ino.allocated = 0;
+	cur->bc_ino.flags = 0;
+	cur->bc_ino.refc.nr_ops = 0;
+	cur->bc_ino.refc.shape_changes = 0;
+
+	cur->bc_ino.rtg = xfs_rtgroup_hold(rtg);
+	return cur;
+}
+
+/* Allocate a new rt refcount btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	cur = xfs_rtrefcountbt_init_common(mp, tp, rtg, ip);
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+	cur->bc_ino.whichfork = XFS_DATA_FORK;
+	return cur;
+}
+
+/* Create a new rt reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	struct xbtree_ifakeroot	*ifake)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_rtrefcountbt_init_common(mp, NULL, rtg, ip);
+	cur->bc_nlevels = ifake->if_levels;
+	cur->bc_ino.forksize = ifake->if_fork_size;
+	cur->bc_ino.whichfork = -1;
+	xfs_btree_stage_ifakeroot(cur, ifake, NULL);
+	return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrefcountbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp)
+{
+	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
+	struct xfs_ifork	*ifp;
+	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	/*
+	 * Free any resources hanging off the real fork, then shallow-copy the
+	 * staging fork's contents into the real fork to transfer everything
+	 * we just built.
+	 */
+	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+	xfs_idestroy_fork(ifp);
+	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK,
+			&xfs_rtrefcountbt_ops);
+}
+
+/* Calculate number of records in a realtime refcount btree block. */
+static inline unsigned int
+xfs_rtrefcountbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_refcount_rec);
+	return blocklen / (sizeof(struct xfs_refcount_key) +
+			   sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount btree block.
+ */
+unsigned int
+xfs_rtrefcountbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= XFS_RTREFCOUNT_BLOCK_LEN;
+	return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime refcount btrees. */
+unsigned int
+xfs_rtrefcountbt_maxlevels_ondisk(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2;
+
+	/* We need at most one record for every block in an rt group. */
+	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrefcountbt_init_cur_cache(void)
+{
+	xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur",
+			xfs_btree_cur_sizeof(
+					xfs_rtrefcountbt_maxlevels_ondisk()),
+			0, 0, NULL);
+
+	if (!xfs_rtrefcountbt_cur_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+xfs_rtrefcountbt_destroy_cur_cache(void)
+{
+	kmem_cache_destroy(xfs_rtrefcountbt_cur_cache);
+	xfs_rtrefcountbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of a realtime refcount btree. */
+void
+xfs_rtrefcountbt_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	unsigned int		d_maxlevels, r_maxlevels;
+
+	if (!xfs_has_rtreflink(mp)) {
+		mp->m_rtrefc_maxlevels = 0;
+		return;
+	}
+
+	/*
+	 * The realtime refcountbt lives on the data device, which means that
+	 * its maximum height is constrained by the size of the data device and
+	 * the height required to store one refcount record for each rtextent
+	 * in an rt group.
+	 */
+	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr,
+				mp->m_sb.sb_dblocks);
+	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr,
+				xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
+
+	/* Add one level to handle the inode root level. */
+	mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
new file mode 100644
index 0000000000000..6d23ab3a9ad41
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTREFCOUNT_BTREE_H__
+#define __XFS_RTREFCOUNT_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* refcounts only exist on crc enabled filesystems */
+#define XFS_RTREFCOUNT_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		struct xfs_inode *ip);
+struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+		struct xbtree_ifakeroot *ifake);
+void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_trans *tp);
+unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
+		unsigned int blocklen, bool leaf);
+void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_rec_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_refcount_rec *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_refcount_key *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_ptr_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_rtrefcount_ptr_t *)
+		((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+		 maxrecs * sizeof(struct xfs_refcount_key) +
+		 (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
+int __init xfs_rtrefcountbt_init_cur_cache(void);
+void xfs_rtrefcountbt_destroy_cur_cache(void);
+
+#endif	/* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a5ca8f4f8699f..b75a5bcbdf19e 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -29,6 +29,7 @@
 #include "xfs_swapext.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1136,6 +1137,13 @@ xfs_sb_mount_common(
 	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
 	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
 
+	mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+			true);
+	mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+			false);
+	mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2;
+	mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2;
+
 	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 	mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index adb742267c9d0..3a7f92a9ac761 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
@@ -56,6 +57,7 @@ extern const struct xfs_btree_ops xfs_bmbt_ops;
 extern const struct xfs_btree_ops xfs_refcountbt_ops;
 extern const struct xfs_btree_ops xfs_rmapbt_ops;
 extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrefcountbt_ops;
 
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 30d2d0c5e5e53..b20e0e6410512 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -37,6 +37,7 @@
 #include "xfs_imeta.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -665,7 +666,10 @@ static inline void
 xfs_rtbtree_compute_maxlevels(
 	struct xfs_mount	*mp)
 {
-	mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels;
+	unsigned int		levels;
+
+	levels = max(mp->m_rtrmap_maxlevels, mp->m_rtrefc_maxlevels);
+	mp->m_rtbtree_maxlevels = levels;
 }
 
 /*
@@ -739,6 +743,7 @@ xfs_mountfs(
 	xfs_rmapbt_compute_maxlevels(mp);
 	xfs_rtrmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
+	xfs_rtrefcountbt_compute_maxlevels(mp);
 
 	xfs_agbtree_compute_maxlevels(mp);
 	xfs_rtbtree_compute_maxlevels(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 21c889a723820..1c99d0630364f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,11 +138,14 @@ typedef struct xfs_mount {
 	uint			m_rtrmap_mnr[2]; /* min rtrmap btree records */
 	uint			m_refc_mxr[2];	/* max refc btree records */
 	uint			m_refc_mnr[2];	/* min refc btree records */
+	uint			m_rtrefc_mxr[2]; /* max rtrefc btree records */
+	uint			m_rtrefc_mnr[2]; /* min rtrefc btree records */
 	uint			m_alloc_maxlevels; /* max alloc btree levels */
 	uint			m_bm_maxlevels[2]; /* max bmap btree levels */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
 	uint			m_rtrmap_maxlevels; /* max rtrmap btree level */
 	uint			m_refc_maxlevels; /* max refcount btree level */
+	uint			m_rtrefc_maxlevels; /* max rtrefc btree level */
 	unsigned int		m_agbtree_maxlevels; /* max level of all AG btrees */
 	unsigned int		m_rtbtree_maxlevels; /* max level of all rt btrees */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -379,6 +382,12 @@ static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
 	       xfs_has_rmapbt(mp);
 }
 
+static inline bool xfs_has_rtreflink(struct xfs_mount *mp)
+{
+	return xfs_has_metadir(mp) && xfs_has_realtime(mp) &&
+	       xfs_has_reflink(mp);
+}
+
 /*
  * Mount features
  *


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/44] xfs: realtime refcount btree transaction reservations
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:45   ` [PATCH 04/44] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
@ 2023-12-31 21:45   ` Darrick J. Wong
  2023-12-31 21:46   ` [PATCH 06/44] xfs: add realtime refcount btree operations Darrick J. Wong
                     ` (38 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:45 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that there's enough log reservation to handle mapping
and unmapping realtime extents.  We have to reserve enough space
to handle a split in the rtrefcountbt to add the record and a second
split in the regular refcountbt to record the rtrefcountbt split.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_trans_resv.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 423b0cede71cb..5b42603de966f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -93,6 +93,14 @@ xfs_refcountbt_block_count(
 	return num_ops * (2 * mp->m_refc_maxlevels - 1);
 }
 
+static unsigned int
+xfs_rtrefcountbt_block_count(
+	struct xfs_mount	*mp,
+	unsigned int		num_ops)
+{
+	return num_ops * (2 * mp->m_rtrefc_maxlevels - 1);
+}
+
 /*
  * Logging inodes is really tricksy. They are logged in memory format,
  * which means that what we write into the log doesn't directly translate into
@@ -260,10 +268,13 @@ xfs_rtalloc_block_count(
  * Compute the log reservation required to handle the refcount update
  * transaction.  Refcount updates are always done via deferred log items.
  *
- * This is calculated as:
+ * This is calculated as the max of:
  * Data device refcount updates (t1):
  *    the agfs of the ags containing the blocks: nr_ops * sector size
  *    the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ * Realtime refcount updates (t2);
+ *    the rt refcount inode
+ *    the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
  */
 static unsigned int
 xfs_calc_refcountbt_reservation(
@@ -271,12 +282,20 @@ xfs_calc_refcountbt_reservation(
 	unsigned int		nr_ops)
 {
 	unsigned int		blksz = XFS_FSB_TO_B(mp, 1);
+	unsigned int		t1, t2 = 0;
 
 	if (!xfs_has_reflink(mp))
 		return 0;
 
-	return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
-	       xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+	t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+	     xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+
+	if (xfs_has_realtime(mp))
+		t2 = xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+				     blksz);
+
+	return max(t1, t2);
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/44] xfs: add realtime refcount btree operations
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:45   ` [PATCH 05/44] xfs: realtime refcount btree transaction reservations Darrick J. Wong
@ 2023-12-31 21:46   ` Darrick J. Wong
  2023-12-31 21:46   ` [PATCH 07/44] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
                     ` (37 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:46 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement the generic btree operations needed to manipulate rtrefcount
btree blocks. This is different from the regular refcountbt in that we
allocate space from the filesystem at large, and are neither constrained
to the free space nor any particular AG.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |  148 ++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index f99c7167183e5..0892c4ddc7adf 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -19,6 +19,7 @@
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_refcount.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_error.h"
@@ -53,6 +54,106 @@ xfs_rtrefcountbt_dup_cursor(
 	return new;
 }
 
+STATIC int
+xfs_rtrefcountbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_rtrefc_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrefcountbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+
+		return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+				level == 0);
+	}
+
+	return cur->bc_mp->m_rtrefc_mxr[level != 0];
+}
+
+STATIC void
+xfs_rtrefcountbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_rtrefcountbt_init_high_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	__u32				x;
+
+	x = be32_to_cpu(rec->refc.rc_startblock);
+	x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+	key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+	uint32_t		start;
+
+	start = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+	rec->refc.rc_startblock = cpu_to_be32(start);
+	rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+	rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	const struct xfs_refcount_key	*kp = &key->refc;
+	const struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
+	uint32_t			start;
+
+	start = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+	return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+			be32_to_cpu(k2->refc.rc_startblock);
+}
+
 static xfs_failaddr_t
 xfs_rtrefcountbt_verify(
 	struct xfs_buf		*bp)
@@ -119,6 +220,40 @@ const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
 	.verify_struct		= xfs_rtrefcountbt_verify,
 };
 
+STATIC int
+xfs_rtrefcountbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->refc.rc_startblock) <
+	       be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_rtrefcountbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	return  be32_to_cpu(r1->refc.rc_startblock) +
+		be32_to_cpu(r1->refc.rc_blockcount) <=
+		be32_to_cpu(r2->refc.rc_startblock);
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrefcountbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+				 be32_to_cpu(key2->refc.rc_startblock));
+}
+
 const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
@@ -127,7 +262,20 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 				  XFS_BTREE_CRC_BLOCKS | XFS_BTREE_IROOT_RECORDS,
 
 	.dup_cursor		= xfs_rtrefcountbt_dup_cursor,
+	.alloc_block		= xfs_btree_alloc_imeta_block,
+	.free_block		= xfs_btree_free_imeta_block,
+	.get_minrecs		= xfs_rtrefcountbt_get_minrecs,
+	.get_maxrecs		= xfs_rtrefcountbt_get_maxrecs,
+	.init_key_from_rec	= xfs_rtrefcountbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rtrefcountbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rtrefcountbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_rtrefcountbt_init_ptr_from_cur,
+	.key_diff		= xfs_rtrefcountbt_key_diff,
 	.buf_ops		= &xfs_rtrefcountbt_buf_ops,
+	.diff_two_keys		= xfs_rtrefcountbt_diff_two_keys,
+	.keys_inorder		= xfs_rtrefcountbt_keys_inorder,
+	.recs_inorder		= xfs_rtrefcountbt_recs_inorder,
+	.keys_contiguous	= xfs_rtrefcountbt_keys_contiguous,
 };
 
 /* Initialize a new rt refcount btree cursor. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/44] xfs: prepare refcount functions to deal with rtrefcountbt
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:46   ` [PATCH 06/44] xfs: add realtime refcount btree operations Darrick J. Wong
@ 2023-12-31 21:46   ` Darrick J. Wong
  2023-12-31 21:46   ` [PATCH 08/44] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
                     ` (36 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:46 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Prepare the high-level refcount functions to deal with the new realtime
refcountbt and its slightly different conventions.  Provide the ability
to talk to either refcountbt or rtrefcountbt formats from the same high
level code.

Note that we leave the _recover_cow_leftovers functions for a separate
patch so that we can convert it all at once.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   93 ++++++++++++++++++++++++++++++++++--------
 fs/xfs/libxfs/xfs_refcount.h |    3 +
 2 files changed, 78 insertions(+), 18 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index b29a718737c59..269b950399071 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -25,6 +25,7 @@
 #include "xfs_ag.h"
 #include "xfs_health.h"
 #include "xfs_refcount_item.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -41,6 +42,16 @@ STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
 STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
 		xfs_agblock_t agbno, xfs_extlen_t aglen);
 
+/* Return the maximum startblock number of the refcountbt. */
+static inline xfs_agblock_t
+xrefc_max_startblock(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return cur->bc_mp->m_sb.sb_rgblocks;
+	return cur->bc_mp->m_sb.sb_agblocks;
+}
+
 /*
  * Look up the first record less than or equal to [bno, len] in the btree
  * given by cur.
@@ -144,6 +155,37 @@ xfs_refcount_check_irec(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_rtrefcount_check_irec(
+	struct xfs_rtgroup		*rtg,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
+		return __this_address;
+
+	if (!xfs_refcount_check_domain(irec))
+		return __this_address;
+
+	/* check for valid extent range, including overflow */
+	if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount))
+		return __this_address;
+
+	if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
+		return __this_address;
+
+	return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_refcount_check_btrec(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return xfs_rtrefcount_check_irec(cur->bc_ino.rtg, irec);
+	return xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+}
+
 static inline int
 xfs_refcount_complain_bad_rec(
 	struct xfs_btree_cur		*cur,
@@ -152,9 +194,15 @@ xfs_refcount_complain_bad_rec(
 {
 	struct xfs_mount		*mp = cur->bc_mp;
 
-	xfs_warn(mp,
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC) {
+		xfs_warn(mp,
+ "RT Refcount BTree record corruption in rtgroup %u detected at %pS!",
+				cur->bc_ino.rtg->rtg_rgno, fa);
+	} else {
+		xfs_warn(mp,
  "Refcount BTree record corruption in AG %d detected at %pS!",
 				cur->bc_ag.pag->pag_agno, fa);
+	}
 	xfs_warn(mp,
 		"Start block 0x%x, block count 0x%x, references 0x%x",
 		irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -180,7 +228,7 @@ xfs_refcount_get_rec(
 		return error;
 
 	xfs_refcount_btrec_to_irec(rec, irec);
-	fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+	fa = xfs_refcount_check_btrec(cur, irec);
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, irec);
 
@@ -1047,6 +1095,15 @@ xfs_refcount_merge_extents(
 	return 0;
 }
 
+static inline struct xbtree_refc *
+xrefc_btree_state(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC)
+		return &cur->bc_ino.refc;
+	return &cur->bc_ag.refc;
+}
+
 /*
  * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
  * true incorrectly is a shutdown FS; the penalty for guessing false
@@ -1064,25 +1121,25 @@ xfs_refcount_still_have_space(
 	 * to handle each of the shape changes to the refcount btree.
 	 */
 	overhead = xfs_allocfree_block_count(cur->bc_mp,
-				cur->bc_ag.refc.shape_changes);
-	overhead += cur->bc_mp->m_refc_maxlevels;
+				xrefc_btree_state(cur)->shape_changes);
+	overhead += cur->bc_maxlevels;
 	overhead *= cur->bc_mp->m_sb.sb_blocksize;
 
 	/*
 	 * Only allow 2 refcount extent updates per transaction if the
 	 * refcount continue update "error" has been injected.
 	 */
-	if (cur->bc_ag.refc.nr_ops > 2 &&
+	if (xrefc_btree_state(cur)->nr_ops > 2 &&
 	    XFS_TEST_ERROR(false, cur->bc_mp,
 			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
 		return false;
 
-	if (cur->bc_ag.refc.nr_ops == 0)
+	if (xrefc_btree_state(cur)->nr_ops == 0)
 		return true;
 	else if (overhead > cur->bc_tp->t_log_res)
 		return false;
 	return  cur->bc_tp->t_log_res - overhead >
-		cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+		xrefc_btree_state(cur)->nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
 /*
@@ -1117,7 +1174,7 @@ xfs_refcount_adjust_extents(
 		if (error)
 			goto out_error;
 		if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
-			ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+			ext.rc_startblock = xrefc_max_startblock(cur);
 			ext.rc_blockcount = 0;
 			ext.rc_refcount = 0;
 			ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
@@ -1141,7 +1198,7 @@ xfs_refcount_adjust_extents(
 			 * Either cover the hole (increment) or
 			 * delete the range (decrement).
 			 */
-			cur->bc_ag.refc.nr_ops++;
+			xrefc_btree_state(cur)->nr_ops++;
 			if (tmp.rc_refcount) {
 				error = xfs_refcount_insert(cur, &tmp,
 						&found_tmp);
@@ -1201,7 +1258,7 @@ xfs_refcount_adjust_extents(
 			goto skip;
 		ext.rc_refcount += adj;
 		trace_xfs_refcount_modify_extent(cur, &ext);
-		cur->bc_ag.refc.nr_ops++;
+		xrefc_btree_state(cur)->nr_ops++;
 		if (ext.rc_refcount > 1) {
 			error = xfs_refcount_update(cur, &ext);
 			if (error)
@@ -1287,7 +1344,7 @@ xfs_refcount_adjust(
 	if (shape_changed)
 		shape_changes++;
 	if (shape_changes)
-		cur->bc_ag.refc.shape_changes++;
+		xrefc_btree_state(cur)->shape_changes++;
 
 	/* Now that we've taken care of the ends, adjust the middle extents */
 	error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1361,8 +1418,8 @@ xfs_refcount_finish_one(
 	 * the startblock, get one now.
 	 */
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
-		nr_ops = rcur->bc_ag.refc.nr_ops;
-		shape_changes = rcur->bc_ag.refc.shape_changes;
+		nr_ops = xrefc_btree_state(rcur)->nr_ops;
+		shape_changes = xrefc_btree_state(rcur)->shape_changes;
 		xfs_btree_del_cursor(rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
@@ -1375,8 +1432,8 @@ xfs_refcount_finish_one(
 
 		*pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
 							  ri->ri_pag);
-		rcur->bc_ag.refc.nr_ops = nr_ops;
-		rcur->bc_ag.refc.shape_changes = shape_changes;
+		xrefc_btree_state(rcur)->nr_ops = nr_ops;
+		xrefc_btree_state(rcur)->shape_changes = shape_changes;
 	}
 
 	switch (ri->ri_type) {
@@ -1667,7 +1724,7 @@ xfs_refcount_adjust_cow_extents(
 		goto out_error;
 	}
 	if (!found_rec) {
-		ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+		ext.rc_startblock = xrefc_max_startblock(cur);
 		ext.rc_blockcount = 0;
 		ext.rc_refcount = 0;
 		ext.rc_domain = XFS_REFC_DOMAIN_COW;
@@ -1878,7 +1935,7 @@ xfs_refcount_recover_extent(
 	INIT_LIST_HEAD(&rr->rr_list);
 	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
 
-	if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
+	if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL ||
 	    XFS_IS_CORRUPT(cur->bc_mp,
 			   rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
 		xfs_btree_mark_sick(cur);
@@ -2027,7 +2084,7 @@ xfs_refcount_query_range_helper(
 	xfs_failaddr_t			fa;
 
 	xfs_refcount_btrec_to_irec(rec, &irec);
-	fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+	fa = xfs_refcount_check_btrec(cur, &irec);
 	if (fa)
 		return xfs_refcount_complain_bad_rec(cur, fa, &irec);
 
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 68acb0b1b4a87..13344b402a72c 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -12,6 +12,7 @@ struct xfs_perag;
 struct xfs_btree_cur;
 struct xfs_bmbt_irec;
 struct xfs_refcount_irec;
+struct xfs_rtgroup;
 
 extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
 		enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
@@ -120,6 +121,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_refcount_irec *irec);
 xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
 		const struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg,
+		const struct xfs_refcount_irec *irec);
 extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
 		struct xfs_refcount_irec *irec, int *stat);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/44] xfs: add a realtime flag to the refcount update log redo items
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:46   ` [PATCH 07/44] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
@ 2023-12-31 21:46   ` Darrick J. Wong
  2023-12-31 21:46   ` [PATCH 09/44] xfs: support recovering refcount intent items targetting realtime extents Darrick J. Wong
                     ` (35 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:46 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Extend the refcount update (CUI) log items with a new realtime flag that
indicates that the updates apply against the realtime refcountbt.  We'll
wire up the actual refcount code later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c        |   10 +-
 fs/xfs/libxfs/xfs_defer.h       |    1 
 fs/xfs/libxfs/xfs_log_format.h  |    6 +
 fs/xfs/libxfs/xfs_log_recover.h |    2 
 fs/xfs/libxfs/xfs_refcount.c    |   72 ++++++++----
 fs/xfs/libxfs/xfs_refcount.h    |   22 ++--
 fs/xfs/scrub/cow_repair.c       |    2 
 fs/xfs/scrub/reap.c             |    5 -
 fs/xfs/xfs_log_recover.c        |    2 
 fs/xfs/xfs_refcount_item.c      |  242 +++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_reflink.c            |   19 ++-
 11 files changed, 330 insertions(+), 53 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 992e492972e76..9a285f38da4cd 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4492,8 +4492,9 @@ xfs_bmapi_write(
 			 * the refcount btree for orphan recovery.
 			 */
 			if (whichfork == XFS_COW_FORK)
-				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
-						bma.length);
+				xfs_refcount_alloc_cow_extent(tp,
+						XFS_IS_REALTIME_INODE(ip),
+						bma.blkno, bma.length);
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4659,7 +4660,8 @@ xfs_bmapi_convert_delalloc(
 	*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
-		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+		xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip),
+				bma.blkno, bma.length);
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -5271,7 +5273,7 @@ xfs_bmap_del_extent_real(
 	 */
 	if (want_free) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			xfs_refcount_decrease_extent(tp, del);
+			xfs_refcount_decrease_extent(tp, isrt, del);
 		} else {
 			unsigned int	efi_flags = 0;
 
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index fddcb4cccbcc2..a351f00e6d78d 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -68,6 +68,7 @@ struct xfs_defer_op_type {
 
 extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index ea4e88d665707..a888e3d98a3de 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -252,6 +252,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_EFD_RT		0x124b	/* realtime extent free done */
 #define	XFS_LI_RUI_RT		0x124c	/* realtime rmap update intent */
 #define	XFS_LI_RUD_RT		0x124d	/* realtime rmap update done */
+#define	XFS_LI_CUI_RT		0x124e	/* realtime refcount update intent */
+#define	XFS_LI_CUD_RT		0x124f	/* realtime refcount update done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -275,7 +277,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_EFI_RT,	"XFS_LI_EFI_RT" }, \
 	{ XFS_LI_EFD_RT,	"XFS_LI_EFD_RT" }, \
 	{ XFS_LI_RUI_RT,	"XFS_LI_RUI_RT" }, \
-	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }
+	{ XFS_LI_RUD_RT,	"XFS_LI_RUD_RT" }, \
+	{ XFS_LI_CUI_RT,	"XFS_LI_CUI_RT" }, \
+	{ XFS_LI_CUD_RT,	"XFS_LI_CUD_RT" }
 
 /*
  * Inode Log Item Format definitions.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 433974693d10b..0ea9a6db24b84 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -81,6 +81,8 @@ extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
 extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
 extern const struct xlog_recover_item_ops xlog_rtrui_item_ops;
 extern const struct xlog_recover_item_ops xlog_rtrud_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcud_item_ops;
 
 /*
  * Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 269b950399071..2ae126d3bd7ff 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -26,6 +26,7 @@
 #include "xfs_health.h"
 #include "xfs_refcount_item.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -1142,6 +1143,28 @@ xfs_refcount_still_have_space(
 		xrefc_btree_state(cur)->nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
+/* Schedule an extent free. */
+static int
+xrefc_free_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*rec)
+{
+	xfs_fsblock_t			fsbno;
+	unsigned int			flags = 0;
+
+	if (cur->bc_btnum == XFS_BTNUM_RTREFC) {
+		flags |= XFS_FREE_EXTENT_REALTIME;
+		fsbno = xfs_rgbno_to_rtb(cur->bc_mp, cur->bc_ino.rtg->rtg_rgno,
+				rec->rc_startblock);
+	} else {
+		fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+				rec->rc_startblock);
+	}
+
+	return xfs_free_extent_later(cur->bc_tp, fsbno, rec->rc_blockcount,
+			NULL, XFS_AG_RESV_NONE, flags);
+}
+
 /*
  * Adjust the refcounts of middle extents.  At this point we should have
  * split extents that crossed the adjustment range; merged with adjacent
@@ -1158,7 +1181,6 @@ xfs_refcount_adjust_extents(
 	struct xfs_refcount_irec	ext, tmp;
 	int				error;
 	int				found_rec, found_tmp;
-	xfs_fsblock_t			fsbno;
 
 	/* Merging did all the work already. */
 	if (*aglen == 0)
@@ -1211,12 +1233,7 @@ xfs_refcount_adjust_extents(
 					goto out_error;
 				}
 			} else {
-				fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
-						cur->bc_ag.pag->pag_agno,
-						tmp.rc_startblock);
-				error = xfs_free_extent_later(cur->bc_tp, fsbno,
-						  tmp.rc_blockcount, NULL,
-						  XFS_AG_RESV_NONE, 0);
+				error = xrefc_free_extent(cur, &tmp);
 				if (error)
 					goto out_error;
 			}
@@ -1274,12 +1291,7 @@ xfs_refcount_adjust_extents(
 			}
 			goto advloop;
 		} else {
-			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
-					cur->bc_ag.pag->pag_agno,
-					ext.rc_startblock);
-			error = xfs_free_extent_later(cur->bc_tp, fsbno,
-					ext.rc_blockcount, NULL,
-					XFS_AG_RESV_NONE, 0);
+			error = xrefc_free_extent(cur, &ext);
 			if (error)
 				goto out_error;
 		}
@@ -1474,6 +1486,20 @@ xfs_refcount_finish_one(
 	return error;
 }
 
+/*
+ * Process one of the deferred realtime refcount operations.  We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ */
+int
+xfs_rtrefcount_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_refcount_intent	*ri,
+	struct xfs_btree_cur		**pcur)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Record a refcount intent for later processing.
  */
@@ -1481,6 +1507,7 @@ static void
 __xfs_refcount_add(
 	struct xfs_trans		*tp,
 	enum xfs_refcount_intent_type	type,
+	bool				isrt,
 	xfs_fsblock_t			startblock,
 	xfs_extlen_t			blockcount)
 {
@@ -1492,6 +1519,7 @@ __xfs_refcount_add(
 	ri->ri_type = type;
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
+	ri->ri_realtime = isrt;
 
 	xfs_refcount_defer_add(tp, ri);
 }
@@ -1502,12 +1530,13 @@ __xfs_refcount_add(
 void
 xfs_refcount_increase_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_has_reflink(tp->t_mountp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock,
 			PREV->br_blockcount);
 }
 
@@ -1517,12 +1546,13 @@ xfs_refcount_increase_extent(
 void
 xfs_refcount_decrease_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_has_reflink(tp->t_mountp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock,
 			PREV->br_blockcount);
 }
 
@@ -1878,6 +1908,7 @@ __xfs_refcount_cow_free(
 void
 xfs_refcount_alloc_cow_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
@@ -1886,16 +1917,17 @@ xfs_refcount_alloc_cow_extent(
 	if (!xfs_has_reflink(mp))
 		return;
 
-	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len);
 
 	/* Add rmap entry */
-	xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
+	xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
 void
 xfs_refcount_free_cow_extent(
 	struct xfs_trans		*tp,
+	bool				isrt,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
@@ -1905,8 +1937,8 @@ xfs_refcount_free_cow_extent(
 		return;
 
 	/* Remove rmap entry */
-	xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
-	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+	xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len);
 }
 
 struct xfs_refcount_recovery {
@@ -2013,7 +2045,7 @@ xfs_refcount_recover_cow_leftovers(
 		/* Free the orphan record */
 		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
 				rr->rr_rrec.rc_startblock);
-		xfs_refcount_free_cow_extent(tp, fsb,
+		xfs_refcount_free_cow_extent(tp, false, fsb,
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 13344b402a72c..56e5834feb624 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -57,10 +57,14 @@ enum xfs_refcount_intent_type {
 
 struct xfs_refcount_intent {
 	struct list_head			ri_list;
-	struct xfs_perag			*ri_pag;
+	union {
+		struct xfs_perag		*ri_pag;
+		struct xfs_rtgroup		*ri_rtg;
+	};
 	enum xfs_refcount_intent_type		ri_type;
 	xfs_extlen_t				ri_blockcount;
 	xfs_fsblock_t				ri_startblock;
+	bool					ri_realtime;
 };
 
 /* Check that the refcount is appropriate for the record domain. */
@@ -75,22 +79,24 @@ xfs_refcount_check_domain(
 	return true;
 }
 
-void xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt,
 		struct xfs_bmbt_irec *irec);
-void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt,
 		struct xfs_bmbt_irec *irec);
 
-extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+int xfs_refcount_finish_one(struct xfs_trans *tp,
+		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
+int xfs_rtrefcount_finish_one(struct xfs_trans *tp,
 		struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
 
 extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
 		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
 		xfs_extlen_t *flen, bool find_end_of_shared);
 
-void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
-		xfs_extlen_t len);
-void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
-		xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
+		xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
+		xfs_fsblock_t fsb, xfs_extlen_t len);
 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 1e82c727af8ed..e48d869986f34 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -344,7 +344,7 @@ xrep_cow_alloc(
 	if (args.fsbno == NULLFSBLOCK)
 		return -ENOSPC;
 
-	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+	xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len);
 
 	repl->fsbno = args.fsbno;
 	repl->len = args.len;
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b8c48e36d2a8d..bb28c2d2b8780 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -420,7 +420,8 @@ xreap_agextent_iter(
 			 * records from the refcountbt, which will remove the
 			 * rmap record as well.
 			 */
-			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+			xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
+					*aglenp);
 			return 0;
 		}
 
@@ -452,7 +453,7 @@ xreap_agextent_iter(
 	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
 		ASSERT(rs->resv == XFS_AG_RESV_NONE);
 
-		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+		xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
 		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
 				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
 		if (error)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1efb69fcadf10..46b4ea4cce15a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1797,6 +1797,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
 	&xlog_rtefd_item_ops,
 	&xlog_rtrui_item_ops,
 	&xlog_rtrud_item_ops,
+	&xlog_rtcui_item_ops,
+	&xlog_rtcud_item_ops,
 };
 
 static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index bec3b91e826a4..4d5941335bc75 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -23,6 +23,7 @@
 #include "xfs_ag.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_rtgroup.h"
 
 struct kmem_cache	*xfs_cui_cache;
 struct kmem_cache	*xfs_cud_cache;
@@ -94,8 +95,9 @@ xfs_cui_item_format(
 
 	ASSERT(atomic_read(&cuip->cui_next_extent) ==
 			cuip->cui_format.cui_nextents);
+	ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
 
-	cuip->cui_format.cui_type = XFS_LI_CUI;
+	cuip->cui_format.cui_type = lip->li_type;
 	cuip->cui_format.cui_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
@@ -138,12 +140,15 @@ xfs_cui_item_release(
 STATIC struct xfs_cui_log_item *
 xfs_cui_init(
 	struct xfs_mount		*mp,
+	unsigned short			item_type,
 	uint				nextents)
 
 {
 	struct xfs_cui_log_item		*cuip;
 
 	ASSERT(nextents > 0);
+	ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT);
+
 	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
 		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
 				0);
@@ -151,7 +156,7 @@ xfs_cui_init(
 		cuip = kmem_cache_zalloc(xfs_cui_cache,
 					 GFP_KERNEL | __GFP_NOFAIL);
 
-	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+	xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops);
 	cuip->cui_format.cui_nextents = nextents;
 	cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
 	atomic_set(&cuip->cui_next_extent, 0);
@@ -190,7 +195,9 @@ xfs_cud_item_format(
 	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
 	struct xfs_log_iovec	*vecp = NULL;
 
-	cudp->cud_format.cud_type = XFS_LI_CUD;
+	ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT);
+
+	cudp->cud_format.cud_type = lip->li_type;
 	cudp->cud_format.cud_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
@@ -234,6 +241,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
 	return list_entry(e, struct xfs_refcount_intent, ri_list);
 }
 
+static inline bool
+xfs_cui_item_isrt(const struct xfs_log_item *lip)
+{
+	ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
+
+	return lip->li_type == XFS_LI_CUI_RT;
+}
+
 /* Sort refcount intents by AG. */
 static int
 xfs_refcount_update_diff_items(
@@ -289,11 +304,12 @@ xfs_refcount_update_create_intent(
 	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_cui_log_item		*cuip = xfs_cui_init(mp, count);
+	struct xfs_cui_log_item		*cuip;
 	struct xfs_refcount_intent	*ri;
 
 	ASSERT(count > 0);
 
+	cuip = xfs_cui_init(mp, XFS_LI_CUI, count);
 	if (sort)
 		list_sort(mp, items, xfs_refcount_update_diff_items);
 	list_for_each_entry(ri, items, ri_list)
@@ -301,6 +317,12 @@ xfs_refcount_update_create_intent(
 	return &cuip->cui_item;
 }
 
+static inline unsigned short
+xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip)
+{
+	return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD;
+}
+
 /* Get an CUD so we can process all the deferred refcount updates. */
 static struct xfs_log_item *
 xfs_refcount_update_create_done(
@@ -312,8 +334,8 @@ xfs_refcount_update_create_done(
 	struct xfs_cud_log_item		*cudp;
 
 	cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
-	xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
-			  &xfs_cud_item_ops);
+	xfs_log_item_init(tp->t_mountp, &cudp->cud_item,
+			xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops);
 	cudp->cud_cuip = cuip;
 	cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
 
@@ -330,8 +352,22 @@ xfs_refcount_defer_add(
 
 	trace_xfs_refcount_defer(mp, ri);
 
-	ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
-	xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+	/*
+	 * Deferred refcount updates for the realtime and data sections must
+	 * use separate transactions to finish deferred work because updates to
+	 * realtime metadata files can lock AGFs to allocate btree blocks and
+	 * we don't want that mixing with the AGF locks taken to finish data
+	 * section updates.
+	 */
+	if (ri->ri_realtime) {
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp, ri->ri_startblock);
+		xfs_defer_add(tp, &ri->ri_list,
+				&xfs_rtrefcount_update_defer_type);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
+		xfs_defer_add(tp, &ri->ri_list,
+				&xfs_refcount_update_defer_type);
+	}
 }
 
 /* Cancel a deferred refcount update. */
@@ -514,10 +550,12 @@ xfs_refcount_relog_intent(
 	struct xfs_phys_extent		*pmap;
 	unsigned int			count;
 
+	ASSERT(intent->li_type == XFS_LI_CUI || intent->li_type == XFS_LI_CUI_RT);
+
 	count = CUI_ITEM(intent)->cui_format.cui_nextents;
 	pmap = CUI_ITEM(intent)->cui_format.cui_extents;
 
-	cuip = xfs_cui_init(tp->t_mountp, count);
+	cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count);
 	memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
 	atomic_set(&cuip->cui_next_extent, count);
 
@@ -537,6 +575,105 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.relog_intent	= xfs_refcount_relog_intent,
 };
 
+#ifdef CONFIG_XFS_RT
+/* Sort refcount intents by rtgroup. */
+static int
+xfs_rtrefcount_update_diff_items(
+	void				*priv,
+	const struct list_head		*a,
+	const struct list_head		*b)
+{
+	struct xfs_refcount_intent	*ra = ci_entry(a);
+	struct xfs_refcount_intent	*rb = ci_entry(b);
+
+	return ra->ri_rtg->rtg_rgno - rb->ri_rtg->rtg_rgno;
+}
+
+static struct xfs_log_item *
+xfs_rtrefcount_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_cui_log_item		*cuip;
+	struct xfs_refcount_intent	*ri;
+
+	ASSERT(count > 0);
+
+	cuip = xfs_cui_init(mp, XFS_LI_CUI_RT, count);
+	if (sort)
+		list_sort(mp, items, xfs_rtrefcount_update_diff_items);
+	list_for_each_entry(ri, items, ri_list)
+		xfs_refcount_update_log_item(tp, cuip, ri);
+	return &cuip->cui_item;
+}
+
+/* Cancel a deferred realtime refcount update. */
+STATIC void
+xfs_rtrefcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+
+	xfs_rtgroup_intent_put(ri->ri_rtg);
+	kmem_cache_free(xfs_refcount_intent_cache, ri);
+}
+
+/* Process a deferred realtime refcount update. */
+STATIC int
+xfs_rtrefcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_refcount_intent	*ri = ci_entry(item);
+	int				error;
+
+	error = xfs_rtrefcount_finish_one(tp, ri, state);
+
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && ri->ri_blockcount > 0) {
+		ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+		       ri->ri_type == XFS_REFCOUNT_DECREASE);
+		return -EAGAIN;
+	}
+
+	xfs_rtrefcount_update_cancel_item(item);
+	return error;
+}
+
+/* Clean up after calling xfs_rtrefcount_finish_one. */
+STATIC void
+xfs_rtrefcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	if (rcur)
+		xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+	.name		= "rtrefcount",
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.create_intent	= xfs_rtrefcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_rtrefcount_update_finish_item,
+	.finish_cleanup = xfs_rtrefcount_finish_one_cleanup,
+	.cancel_item	= xfs_rtrefcount_update_cancel_item,
+	.recover_work	= xfs_refcount_recover_work,
+	.relog_intent	= xfs_refcount_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+	.name		= "rtrefcount",
+};
+#endif /* CONFIG_XFS_RT */
+
 STATIC bool
 xfs_cui_item_match(
 	struct xfs_log_item	*lip,
@@ -602,7 +739,7 @@ xlog_recover_cui_commit_pass2(
 		return -EFSCORRUPTED;
 	}
 
-	cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+	cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
 	xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
 	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
 
@@ -616,6 +753,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = {
 	.commit_pass2		= xlog_recover_cui_commit_pass2,
 };
 
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_cui_log_item		*cuip;
+	struct xfs_cui_log_format	*cui_formatp;
+	size_t				len;
+
+	cui_formatp = item->ri_buf[0].i_addr;
+
+	if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+	if (item->ri_buf[0].i_len != len) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
+	xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
+	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+	xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+			&xfs_rtrefcount_update_defer_type);
+	return 0;
+}
+#else
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+			item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+	return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcui_item_ops = {
+	.item_type		= XFS_LI_CUI_RT,
+	.commit_pass2		= xlog_recover_rtcui_commit_pass2,
+};
+
 /*
  * This routine is called when an CUD format structure is found in a committed
  * transaction in the log. Its purpose is to cancel the corresponding CUI if it
@@ -647,3 +839,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = {
 	.item_type		= XFS_LI_CUD,
 	.commit_pass2		= xlog_recover_cud_commit_pass2,
 };
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcud_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_cud_log_format	*cud_formatp;
+
+	cud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+		return -EFSCORRUPTED;
+	}
+
+	xlog_recover_release_intent(log, XFS_LI_CUI_RT,
+			cud_formatp->cud_cui_id);
+	return 0;
+}
+#else
+# define xlog_recover_rtcud_commit_pass2	xlog_recover_rtcui_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcud_item_ops = {
+	.item_type		= XFS_LI_CUD_RT,
+	.commit_pass2		= xlog_recover_rtcud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7e9273cc16e14..591782ca7d284 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -585,6 +585,7 @@ xfs_reflink_cancel_cow_blocks(
 	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	struct xfs_bmbt_irec		got, del;
 	struct xfs_iext_cursor		icur;
+	bool				isrt = XFS_IS_REALTIME_INODE(ip);
 	int				error = 0;
 
 	if (!xfs_inode_has_cow_data(ip))
@@ -614,12 +615,13 @@ xfs_reflink_cancel_cow_blocks(
 			ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
 
 			/* Free the CoW orphan record. */
-			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
-					del.br_blockcount);
+			xfs_refcount_free_cow_extent(*tpp, isrt,
+					del.br_startblock, del.br_blockcount);
 
 			error = xfs_free_extent_later(*tpp, del.br_startblock,
 					del.br_blockcount, NULL,
-					XFS_AG_RESV_NONE, 0);
+					XFS_AG_RESV_NONE,
+					isrt ? XFS_FREE_EXTENT_REALTIME : 0);
 			if (error)
 				break;
 
@@ -729,6 +731,7 @@ xfs_reflink_end_cow_extent(
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	unsigned int		resblks;
 	int			nmaps;
+	bool			isrt = XFS_IS_REALTIME_INODE(ip);
 	int			error;
 
 	/* No COW extents?  That's easy! */
@@ -807,7 +810,7 @@ xfs_reflink_end_cow_extent(
 		 * or not), unmap the extent and drop its refcount.
 		 */
 		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
-		xfs_refcount_decrease_extent(tp, &data);
+		xfs_refcount_decrease_extent(tp, isrt, &data);
 		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
 				-data.br_blockcount);
 	} else if (data.br_startblock == DELAYSTARTBLOCK) {
@@ -827,7 +830,8 @@ xfs_reflink_end_cow_extent(
 	}
 
 	/* Free the CoW orphan record. */
-	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
+	xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
+			del.br_blockcount);
 
 	/* Map the new blocks into the data fork. */
 	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
@@ -1164,6 +1168,7 @@ xfs_reflink_remap_extent(
 	bool			quota_reserved = true;
 	bool			smap_real;
 	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
+	bool			isrt = XFS_IS_REALTIME_INODE(ip);
 	int			iext_delta = 0;
 	int			nimaps;
 	int			error;
@@ -1295,7 +1300,7 @@ xfs_reflink_remap_extent(
 		 * or not), unmap the extent and drop its refcount.
 		 */
 		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
-		xfs_refcount_decrease_extent(tp, &smap);
+		xfs_refcount_decrease_extent(tp, isrt, &smap);
 		qdelta -= smap.br_blockcount;
 	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
 		int		done;
@@ -1318,7 +1323,7 @@ xfs_reflink_remap_extent(
 	 * its refcount and map it into the file.
 	 */
 	if (dmap_written) {
-		xfs_refcount_increase_extent(tp, dmap);
+		xfs_refcount_increase_extent(tp, isrt, dmap);
 		xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
 		qdelta += dmap->br_blockcount;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/44] xfs: support recovering refcount intent items targetting realtime extents
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:46   ` [PATCH 08/44] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
@ 2023-12-31 21:46   ` Darrick J. Wong
  2023-12-31 21:47   ` [PATCH 10/44] xfs: add realtime refcount btree block detection to log recovery Darrick J. Wong
                     ` (34 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:46 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we have reflink on the realtime device, refcount intent items
have to support remapping extents on the realtime volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_refcount_item.c |   18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 4d5941335bc75..11df3aaa7b3f7 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -433,6 +433,7 @@ xfs_refcount_update_abort_intent(
 static inline bool
 xfs_cui_validate_phys(
 	struct xfs_mount		*mp,
+	bool				isrt,
 	struct xfs_phys_extent		*pmap)
 {
 	if (!xfs_has_reflink(mp))
@@ -451,6 +452,9 @@ xfs_cui_validate_phys(
 		return false;
 	}
 
+	if (isrt)
+		return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len);
+
 	return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
 }
 
@@ -458,6 +462,7 @@ static inline void
 xfs_cui_recover_work(
 	struct xfs_mount		*mp,
 	struct xfs_defer_pending	*dfp,
+	bool				isrt,
 	struct xfs_phys_extent		*pmap)
 {
 	struct xfs_refcount_intent	*ri;
@@ -467,7 +472,12 @@ xfs_cui_recover_work(
 	ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
 	ri->ri_startblock = pmap->pe_startblock;
 	ri->ri_blockcount = pmap->pe_len;
-	ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock);
+	ri->ri_realtime = isrt;
+	if (isrt) {
+		ri->ri_rtg = xfs_rtgroup_intent_get(mp, pmap->pe_startblock);
+	} else {
+		ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock);
+	}
 
 	xfs_defer_add_item(dfp, &ri->ri_list);
 }
@@ -486,6 +496,7 @@ xfs_refcount_recover_work(
 	struct xfs_cui_log_item		*cuip = CUI_ITEM(lip);
 	struct xfs_trans		*tp;
 	struct xfs_mount		*mp = lip->li_log->l_mp;
+	bool				isrt = xfs_cui_item_isrt(lip);
 	int				i;
 	int				error = 0;
 
@@ -495,7 +506,7 @@ xfs_refcount_recover_work(
 	 * just toss the CUI.
 	 */
 	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
-		if (!xfs_cui_validate_phys(mp,
+		if (!xfs_cui_validate_phys(mp, isrt,
 					&cuip->cui_format.cui_extents[i])) {
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					&cuip->cui_format,
@@ -503,7 +514,8 @@ xfs_refcount_recover_work(
 			return -EFSCORRUPTED;
 		}
 
-		xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
+		xfs_cui_recover_work(mp, dfp, isrt,
+				&cuip->cui_format.cui_extents[i]);
 	}
 
 	/*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/44] xfs: add realtime refcount btree block detection to log recovery
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 21:46   ` [PATCH 09/44] xfs: support recovering refcount intent items targetting realtime extents Darrick J. Wong
@ 2023-12-31 21:47   ` Darrick J. Wong
  2023-12-31 21:47   ` [PATCH 11/44] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
                     ` (33 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:47 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Identify rt refcount btree blocks in the log correctly so that we can
validate them during log recovery.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_buf_item_recover.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index c7d86636bd312..ebe7f2c3cf635 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -268,6 +268,9 @@ xlog_recover_validate_buf_type(
 		case XFS_REFC_CRC_MAGIC:
 			bp->b_ops = &xfs_refcountbt_buf_ops;
 			break;
+		case XFS_RTREFC_CRC_MAGIC:
+			bp->b_ops = &xfs_rtrefcountbt_buf_ops;
+			break;
 		default:
 			warnmsg = "Bad btree block magic!";
 			break;
@@ -772,6 +775,7 @@ xlog_recover_get_buf_lsn(
 		break;
 	}
 	case XFS_RTRMAP_CRC_MAGIC:
+	case XFS_RTREFC_CRC_MAGIC:
 	case XFS_BMAP_CRC_MAGIC:
 	case XFS_BMAP_MAGIC: {
 		struct xfs_btree_block *btb = blk;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/44] xfs: add realtime refcount btree inode to metadata directory
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 21:47   ` [PATCH 10/44] xfs: add realtime refcount btree block detection to log recovery Darrick J. Wong
@ 2023-12-31 21:47   ` Darrick J. Wong
  2023-12-31 21:47   ` [PATCH 12/44] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
                     ` (32 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:47 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a metadir path to select the realtime refcount btree inode and load
it at mount time.  The rtrefcountbt inode will have a unique extent format
code, which means that we also have to update the inode validation and
flush routines to look for it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c             |    8 +++-
 fs/xfs/libxfs/xfs_format.h           |    4 ++
 fs/xfs/libxfs/xfs_inode_buf.c        |    8 ++++
 fs/xfs/libxfs/xfs_inode_fork.c       |    9 +++++
 fs/xfs/libxfs/xfs_rtgroup.h          |    3 ++
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |   33 ++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |    4 ++
 fs/xfs/xfs_inode.c                   |   13 +++++++
 fs/xfs/xfs_inode_item.c              |    2 +
 fs/xfs/xfs_inode_item_recover.c      |    1 +
 fs/xfs/xfs_rtalloc.c                 |   63 ++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h                   |    1 +
 12 files changed, 146 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9a285f38da4cd..27f992dc6d2d6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5111,9 +5111,13 @@ xfs_bmap_del_extent_real(
 		 * the same order of operations as the data device, which is:
 		 * Remove the file mapping, remove the reverse mapping, and
 		 * then free the blocks.  This means that we must delay the
-		 * freeing until after we've scheduled the rmap update.
+		 * freeing until after we've scheduled the rmap update.  If
+		 * realtime reflink is enabled, use deferred refcount intent
+		 * items to decide what to do with the extent, just like we do
+		 * for the data device.
 		 */
-		if (want_free && !xfs_has_rtrmapbt(mp)) {
+		if (want_free && !xfs_has_rtrmapbt(mp) &&
+				 !xfs_has_rtreflink(mp)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
 			if (error)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c938b814c430d..93a9b8e3b5694 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1028,6 +1028,7 @@ enum xfs_dinode_fmt {
 	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
 	XFS_DINODE_FMT_UUID,		/* added long ago, but never used */
 	XFS_DINODE_FMT_RMAP,		/* reverse mapping btree */
+	XFS_DINODE_FMT_REFCOUNT,	/* reference count btree */
 };
 
 #define XFS_INODE_FORMAT_STR \
@@ -1036,7 +1037,8 @@ enum xfs_dinode_fmt {
 	{ XFS_DINODE_FMT_EXTENTS,	"extent" }, \
 	{ XFS_DINODE_FMT_BTREE,		"btree" }, \
 	{ XFS_DINODE_FMT_UUID,		"uuid" }, \
-	{ XFS_DINODE_FMT_RMAP,		"rmap" }
+	{ XFS_DINODE_FMT_RMAP,		"rmap" }, \
+	{ XFS_DINODE_FMT_REFCOUNT,	"refcount" }
 
 /*
  * Max values for extnum and aextnum.
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index dae2efec1d5d0..6e08ff8d8e239 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -417,6 +417,12 @@ xfs_dinode_verify_fork(
 		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
 			return __this_address;
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		if (!xfs_has_rtreflink(mp))
+			return __this_address;
+		if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+			return __this_address;
+		break;
 	default:
 		return __this_address;
 	}
@@ -437,6 +443,7 @@ xfs_dinode_verify_forkoff(
 			return __this_address;
 		break;
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		if (!(xfs_has_metadir(mp) && xfs_has_parent(mp)))
 			return __this_address;
 		fallthrough;
@@ -708,6 +715,7 @@ xfs_dinode_verify(
 	if (flags2 & XFS_DIFLAG2_METADIR) {
 		switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
 		case XFS_DINODE_FMT_RMAP:
+		case XFS_DINODE_FMT_REFCOUNT:
 			break;
 		default:
 			if (nextents + naextents == 0 && nblocks != 0)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index e7ab04aea2db6..ae6e7deb04106 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -271,6 +271,11 @@ xfs_iformat_data_fork(
 				return -EFSCORRUPTED;
 			}
 			return xfs_iformat_rtrmap(ip, dip);
+		case XFS_DINODE_FMT_REFCOUNT:
+			if (!xfs_has_rtreflink(ip->i_mount))
+				return -EFSCORRUPTED;
+			ASSERT(0); /* to be implemented later */
+			return -EFSCORRUPTED;
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -666,6 +671,10 @@ xfs_iflush_fork(
 			xfs_iflush_rtrmap(ip, dip);
 		break;
 
+	case XFS_DINODE_FMT_REFCOUNT:
+		ASSERT(0); /* to be implemented later */
+		break;
+
 	default:
 		ASSERT(0);
 		break;
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 3522527e553b8..bd88a4d728135 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -25,6 +25,9 @@ struct xfs_rtgroup {
 	/* reverse mapping btree inode */
 	struct xfs_inode	*rtg_rmapip;
 
+	/* refcount btree inode */
+	struct xfs_inode	*rtg_refcountip;
+
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index 0892c4ddc7adf..ead6baf6de7e4 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -26,6 +26,7 @@
 #include "xfs_extent_busy.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_imeta.h"
 
 static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
 
@@ -355,6 +356,7 @@ xfs_rtrefcountbt_commit_staged_btree(
 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_REFCOUNT);
 
 	/*
 	 * Free any resources hanging off the real fork, then shallow-copy the
@@ -458,3 +460,34 @@ xfs_rtrefcountbt_compute_maxlevels(
 	/* Add one level to handle the inode root level. */
 	mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
 }
+
+#define XFS_RTREFC_NAMELEN		21
+
+/* Create the metadata directory path for an rtrefcount btree inode. */
+int
+xfs_rtrefcountbt_create_path(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*path;
+	unsigned char		*fname;
+	int			error;
+
+	error = xfs_imeta_create_file_path(mp, 2, &path);
+	if (error)
+		return error;
+
+	fname = kmalloc(XFS_RTREFC_NAMELEN, GFP_KERNEL);
+	if (!fname) {
+		xfs_imeta_free_path(path);
+		return -ENOMEM;
+	}
+
+	snprintf(fname, XFS_RTREFC_NAMELEN, "%u.refcount", rgno);
+	path->im_path[0] = "realtime";
+	path->im_path[1] = fname;
+	path->im_dynamicmask = 0x2;
+	*pathp = path;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
index 6d23ab3a9ad41..ff49e95d1a490 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_cur;
 struct xfs_mount;
 struct xbtree_ifakeroot;
 struct xfs_rtgroup;
+struct xfs_imeta_path;
 
 /* refcounts only exist on crc enabled filesystems */
 #define XFS_RTREFCOUNT_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
@@ -68,4 +69,7 @@ unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
 int __init xfs_rtrefcountbt_init_cur_cache(void);
 void xfs_rtrefcountbt_destroy_cur_cache(void);
 
+int xfs_rtrefcountbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_imeta_path **pathp);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b7cda81161fb5..334f87f7a8f3f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2509,6 +2509,14 @@ xfs_iflush(
 				__func__, ip->i_ino, ip);
 			goto flush_out;
 		}
+	} else if (ip->i_df.if_format == XFS_DINODE_FMT_REFCOUNT) {
+		if (!S_ISREG(VFS_I(ip)->i_mode) ||
+		    !(ip->i_diflags2 & XFS_DIFLAG2_METADIR)) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad rt refcountbt inode %Lu, ptr "PTR_FMT,
+				__func__, ip->i_ino, ip);
+			goto flush_out;
+		}
 	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
@@ -2555,6 +2563,11 @@ xfs_iflush(
 				"%s: rt rmapbt in inode %Lu attr fork, ptr "PTR_FMT,
 				__func__, ip->i_ino, ip);
 			goto flush_out;
+		} else if (ip->i_af.if_format == XFS_DINODE_FMT_REFCOUNT) {
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: rt refcountbt in inode %Lu attr fork, ptr "PTR_FMT,
+				__func__, ip->i_ino, ip);
+			goto flush_out;
 		}
 	}
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2903c101505f5..fdc0b14bb9fbb 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -211,6 +211,7 @@ xfs_inode_item_data_fork_size(
 		break;
 	case XFS_DINODE_FMT_BTREE:
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
 		    ip->i_df.if_broot_bytes > 0) {
 			*nbytes += ip->i_df.if_broot_bytes;
@@ -332,6 +333,7 @@ xfs_inode_item_format_data_fork(
 		break;
 	case XFS_DINODE_FMT_BTREE:
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		iip->ili_fields &=
 			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
 
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 4cf967df28ef1..2b5f7a143c479 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -420,6 +420,7 @@ xlog_recover_inode_commit_pass2(
 	if (unlikely(S_ISREG(ldip->di_mode))) {
 		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ldip->di_format != XFS_DINODE_FMT_RMAP) &&
+		    (ldip->di_format != XFS_DINODE_FMT_REFCOUNT) &&
 		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR(
 				"Bad log dinode data fork format for regular file",
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 37156cf8acd25..4102a46ed274d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -31,6 +31,7 @@
 #include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Realtime metadata files are not quite regular files because userspace can't
@@ -42,6 +43,7 @@
 static struct lock_class_key xfs_rbmip_key;
 static struct lock_class_key xfs_rsumip_key;
 static struct lock_class_key xfs_rrmapip_key;
+static struct lock_class_key xfs_rrefcountip_key;
 
 /*
  * Read and return the summary information for a given extent size,
@@ -1835,6 +1837,53 @@ xfs_rtmount_iread_extents(
 	return error;
 }
 
+/* Load realtime refcount btree inode. */
+STATIC int
+xfs_rtmount_refcountbt(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	error = xfs_rtrefcountbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = xfs_imeta_lookup(tp, path, &ino);
+	if (error)
+		goto out_path;
+
+	if (ino == NULLFSINO) {
+		error = -EFSCORRUPTED;
+		goto out_path;
+	}
+
+	error = xfs_rt_iget(tp, ino, &xfs_rrefcountip_key, &ip);
+	if (error)
+		goto out_path;
+
+	if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_REFCOUNT)) {
+		error = -EFSCORRUPTED;
+		goto out_rele;
+	}
+
+	rtg->rtg_refcountip = ip;
+	ip = NULL;
+out_rele:
+	if (ip)
+		xfs_imeta_irele(ip);
+out_path:
+	xfs_imeta_free_path(path);
+	return error;
+}
+
 /*
  * Get the bitmap and summary inodes and the summary cache into the mount
  * structure at mount time.
@@ -1886,6 +1935,12 @@ xfs_rtmount_inodes(
 			xfs_rtgroup_rele(rtg);
 			goto out_rele_rtgroup;
 		}
+
+		error = xfs_rtmount_refcountbt(rtg, tp);
+		if (error) {
+			xfs_rtgroup_rele(rtg);
+			goto out_rele_rtgroup;
+		}
 	}
 
 	xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
@@ -1894,6 +1949,10 @@ xfs_rtmount_inodes(
 
 out_rele_rtgroup:
 	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_refcountip)
+			xfs_imeta_irele(rtg->rtg_refcountip);
+		rtg->rtg_refcountip = NULL;
+
 		if (rtg->rtg_rmapip)
 			xfs_imeta_irele(rtg->rtg_rmapip);
 		rtg->rtg_rmapip = NULL;
@@ -1917,6 +1976,10 @@ xfs_rtunmount_inodes(
 	kmem_free(mp->m_rsum_cache);
 
 	for_each_rtgroup(mp, rgno, rtg) {
+		if (rtg->rtg_refcountip)
+			xfs_imeta_irele(rtg->rtg_refcountip);
+		rtg->rtg_refcountip = NULL;
+
 		if (rtg->rtg_rmapip)
 			xfs_imeta_irele(rtg->rtg_rmapip);
 		rtg->rtg_rmapip = NULL;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 86e0aa946aa00..f94f144f9a39d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2225,6 +2225,7 @@ TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_RMAP);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_REFCOUNT);
 
 DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 	TP_PROTO(struct xfs_inode *ip, int which),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/44] xfs: add metadata reservations for realtime refcount btree
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 21:47   ` [PATCH 11/44] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
@ 2023-12-31 21:47   ` Darrick J. Wong
  2023-12-31 21:47   ` [PATCH 13/44] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
                     ` (31 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:47 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reserve some free blocks so that we will always have enough free blocks
in the data volume to handle expansion of the realtime refcount btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |   39 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |    4 +++
 fs/xfs/xfs_rtalloc.c                 |    9 +++++++-
 3 files changed, 51 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index ead6baf6de7e4..e1e8a3ea32091 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -491,3 +491,42 @@ xfs_rtrefcountbt_create_path(
 	*pathp = path;
 	return 0;
 }
+
+/* Calculate the rtrefcount btree size for some records. */
+unsigned long long
+xfs_rtrefcountbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_rtrefc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+static unsigned long long
+xfs_rtrefcountbt_max_size(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtblocks)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rtrefc_mxr[0] == 0)
+		return 0;
+
+	return xfs_rtrefcountbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ * We need enough space to hold one record for every rt extent in the rtgroup.
+ */
+xfs_filblks_t
+xfs_rtrefcountbt_calc_reserves(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_has_rtreflink(mp))
+		return 0;
+
+	return xfs_rtrefcountbt_max_size(mp,
+			xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
index ff49e95d1a490..045f7b1f72833 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -72,4 +72,8 @@ void xfs_rtrefcountbt_destroy_cur_cache(void);
 int xfs_rtrefcountbt_create_path(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 		struct xfs_imeta_path **pathp);
 
+xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
+unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 4102a46ed274d..14e17c2b39ef0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1711,8 +1711,10 @@ xfs_rt_resv_free(
 	struct xfs_rtgroup	*rtg;
 	xfs_rgnumber_t		rgno;
 
-	for_each_rtgroup(mp, rgno, rtg)
+	for_each_rtgroup(mp, rgno, rtg) {
+		xfs_imeta_resv_free_inode(rtg->rtg_refcountip);
 		xfs_imeta_resv_free_inode(rtg->rtg_rmapip);
+	}
 }
 
 /* Reserve space for rt metadata inodes' space expansion. */
@@ -1732,6 +1734,11 @@ xfs_rt_resv_init(
 		err2 = xfs_imeta_resv_init_inode(rtg->rtg_rmapip, ask);
 		if (err2 && !error)
 			error = err2;
+
+		ask = xfs_rtrefcountbt_calc_reserves(mp);
+		err2 = xfs_imeta_resv_init_inode(rtg->rtg_refcountip, ask);
+		if (err2 && !error)
+			error = err2;
 	}
 
 	return error;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/44] xfs: wire up a new inode fork type for the realtime refcount
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 21:47   ` [PATCH 12/44] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
@ 2023-12-31 21:47   ` Darrick J. Wong
  2023-12-31 21:48   ` [PATCH 14/44] xfs: wire up realtime refcount btree cursors Darrick J. Wong
                     ` (30 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:47 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the pieces we need to embed the root of the realtime refcount
btree in an inode's data fork, complete with new fork type and
on-disk interpretation functions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h           |    8 +
 fs/xfs/libxfs/xfs_inode_fork.c       |    8 +
 fs/xfs/libxfs/xfs_ondisk.h           |    1 
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |  236 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |  112 ++++++++++++++++
 fs/xfs/xfs_inode_item_recover.c      |    4 +
 6 files changed, 366 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 93a9b8e3b5694..ca964befb51cf 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1824,6 +1824,14 @@ typedef __be32 xfs_refcount_ptr_t;
  */
 #define	XFS_RTREFC_CRC_MAGIC	0x52434e54	/* 'RCNT' */
 
+/*
+ * rt refcount root header, on-disk form only.
+ */
+struct xfs_rtrefcount_root {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+};
+
 /* inode-rooted btree pointer type */
 typedef __be64 xfs_rtrefcount_ptr_t;
 
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index ae6e7deb04106..df42ffa15d96e 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -28,6 +28,7 @@
 #include "xfs_health.h"
 #include "xfs_symlink_remote.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 struct kmem_cache *xfs_ifork_cache;
 
@@ -274,8 +275,7 @@ xfs_iformat_data_fork(
 		case XFS_DINODE_FMT_REFCOUNT:
 			if (!xfs_has_rtreflink(ip->i_mount))
 				return -EFSCORRUPTED;
-			ASSERT(0); /* to be implemented later */
-			return -EFSCORRUPTED;
+			return xfs_iformat_rtrefcount(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
@@ -672,7 +672,9 @@ xfs_iflush_fork(
 		break;
 
 	case XFS_DINODE_FMT_REFCOUNT:
-		ASSERT(0); /* to be implemented later */
+		ASSERT(whichfork == XFS_DATA_FORK);
+		if (iip->ili_fields & brootflag[whichfork])
+			xfs_iflush_rtrefcount(ip, dip);
 		break;
 
 	default:
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 242b683125662..3a5581ecb36d4 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -80,6 +80,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t,			8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t,		8);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root,	4);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index e1e8a3ea32091..ae8dea035d29f 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -85,6 +85,41 @@ xfs_rtrefcountbt_get_maxrecs(
 	return cur->bc_mp->m_rtrefc_mxr[level != 0];
 }
 
+/*
+ * Calculate number of records in a realtime refcount btree inode root.
+ */
+unsigned int
+xfs_rtrefcountbt_droot_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= sizeof(struct xfs_rtrefcount_root);
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_refcount_rec);
+	return blocklen / (2 * sizeof(struct xfs_refcount_key) +
+			sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork so that
+ * we can resize the in-memory buffer to match it.  After a resize to the
+ * maximum size this function returns the same value as
+ * xfs_rtrefcountbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrefcountbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrefc_mxr[level != 0];
+	return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
 STATIC void
 xfs_rtrefcountbt_init_key_from_rec(
 	union xfs_btree_key		*key,
@@ -255,6 +290,68 @@ xfs_rtrefcountbt_keys_contiguous(
 				 be32_to_cpu(key2->refc.rc_startblock));
 }
 
+/* Move the rt refcount btree root from one incore buffer to another. */
+static void
+xfs_rtrefcountbt_broot_move(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_btree_block	*dst_broot,
+	size_t			dst_bytes,
+	struct xfs_btree_block	*src_broot,
+	size_t			src_bytes,
+	unsigned int		level,
+	unsigned int		numrecs)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	void			*dptr;
+	void			*sptr;
+
+	ASSERT(xfs_rtrefcount_droot_space(src_broot) <=
+			xfs_inode_fork_size(ip, whichfork));
+
+	/*
+	 * We always have to move the pointers because they are not butted
+	 * against the btree block header.
+	 */
+	if (numrecs && level > 0) {
+		sptr = xfs_rtrefcount_broot_ptr_addr(mp, src_broot, 1,
+				src_bytes);
+		dptr = xfs_rtrefcount_broot_ptr_addr(mp, dst_broot, 1,
+				dst_bytes);
+		memmove(dptr, sptr, numrecs * sizeof(xfs_fsblock_t));
+	}
+
+	if (src_broot == dst_broot)
+		return;
+
+	/*
+	 * If the root is being totally relocated, we have to migrate the block
+	 * header and the keys/records that come after it.
+	 */
+	memcpy(dst_broot, src_broot, XFS_RTREFCOUNT_BLOCK_LEN);
+
+	if (!numrecs)
+		return;
+
+	if (level == 0) {
+		sptr = xfs_rtrefcount_rec_addr(src_broot, 1);
+		dptr = xfs_rtrefcount_rec_addr(dst_broot, 1);
+		memcpy(dptr, sptr,
+				numrecs * sizeof(struct xfs_refcount_rec));
+	} else {
+		sptr = xfs_rtrefcount_key_addr(src_broot, 1);
+		dptr = xfs_rtrefcount_key_addr(dst_broot, 1);
+		memcpy(dptr, sptr,
+				numrecs * sizeof(struct xfs_refcount_key));
+	}
+}
+
+static const struct xfs_ifork_broot_ops xfs_rtrefcountbt_iroot_ops = {
+	.maxrecs		= xfs_rtrefcountbt_maxrecs,
+	.size			= xfs_rtrefcount_broot_space_calc,
+	.move			= xfs_rtrefcountbt_broot_move,
+};
+
 const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
@@ -267,6 +364,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.free_block		= xfs_btree_free_imeta_block,
 	.get_minrecs		= xfs_rtrefcountbt_get_minrecs,
 	.get_maxrecs		= xfs_rtrefcountbt_get_maxrecs,
+	.get_dmaxrecs		= xfs_rtrefcountbt_get_dmaxrecs,
 	.init_key_from_rec	= xfs_rtrefcountbt_init_key_from_rec,
 	.init_high_key_from_rec	= xfs_rtrefcountbt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_rtrefcountbt_init_rec_from_cur,
@@ -277,6 +375,7 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
 	.keys_inorder		= xfs_rtrefcountbt_keys_inorder,
 	.recs_inorder		= xfs_rtrefcountbt_recs_inorder,
 	.keys_contiguous	= xfs_rtrefcountbt_keys_contiguous,
+	.iroot_ops		= &xfs_rtrefcountbt_iroot_ops,
 };
 
 /* Initialize a new rt refcount btree cursor. */
@@ -530,3 +629,140 @@ xfs_rtrefcountbt_calc_reserves(
 	return xfs_rtrefcountbt_max_size(mp,
 			xfs_rtb_to_rtx(mp, mp->m_sb.sb_rgblocks));
 }
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+STATIC void
+xfs_rtrefcountbt_from_disk(
+	struct xfs_inode		*ip,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen,
+	struct xfs_btree_block		*rblock)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_refcount_key	*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key	*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec	*frp;
+	struct xfs_refcount_rec	*trp;
+	unsigned int			numrecs;
+	unsigned int			maxrecs;
+	unsigned int			rblocklen;
+
+	rblocklen = xfs_rtrefcount_broot_space(mp, dblock);
+
+	xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0,
+			ip->i_ino);
+
+	rblock->bb_level = dblock->bb_level;
+	rblock->bb_numrecs = dblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		tkp = xfs_rtrefcount_key_addr(rblock, 1);
+		fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		trp = xfs_rtrefcount_rec_addr(rblock, 1);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Load a realtime reference count btree root in from disk. */
+int
+xfs_iformat_rtrefcount(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	unsigned int		numrecs;
+	unsigned int		level;
+	int			dsize;
+
+	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+	numrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > mp->m_rtrefc_maxlevels ||
+	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize)
+		return -EFSCORRUPTED;
+
+	xfs_iroot_alloc(ip, XFS_DATA_FORK,
+			xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
+	xfs_rtrefcountbt_from_disk(ip, dfp, dsize, ifp->if_broot);
+	return 0;
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_rtrefcountbt_to_disk(
+	struct xfs_mount		*mp,
+	struct xfs_btree_block		*rblock,
+	int				rblocklen,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen)
+{
+	struct xfs_refcount_key	*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key	*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec	*frp;
+	struct xfs_refcount_rec	*trp;
+	unsigned int			maxrecs;
+	unsigned int			numrecs;
+
+	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC));
+	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_key_addr(rblock, 1);
+		tkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		frp = xfs_rtrefcount_rec_addr(rblock, 1);
+		trp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/* Flush a realtime reference count btree root out to disk. */
+void
+xfs_iflush_rtrefcount(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	ASSERT(ifp->if_broot != NULL);
+	ASSERT(ifp->if_broot_bytes > 0);
+	ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <=
+			xfs_inode_fork_size(ip, XFS_DATA_FORK));
+	xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot,
+			ifp->if_broot_bytes, dfp,
+			XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
index 045f7b1f72833..bd070e54781a1 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -27,6 +27,7 @@ void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
 unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
 		unsigned int blocklen, bool leaf);
 void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf);
 
 /*
  * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
@@ -76,4 +77,115 @@ xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
 unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */
+
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_droot_rec_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	return (struct xfs_refcount_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_droot_key_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	return (struct xfs_refcount_key *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_droot_ptr_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index,
+	unsigned int			maxrecs)
+{
+	return (xfs_rtrefcount_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * sizeof(struct xfs_refcount_key) +
+		 (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		index,
+	unsigned int		block_size)
+{
+	return xfs_rtrefcount_ptr_addr(bb, index,
+			xfs_rtrefcountbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = XFS_RTREFCOUNT_BLOCK_LEN;
+
+	if (level > 0)
+		return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+				     sizeof(xfs_rtrefcount_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb)
+{
+	return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrefcount_droot_space_calc(
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			sz = sizeof(struct xfs_rtrefcount_root);
+
+	if (level > 0)
+		return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+				     sizeof(xfs_rtrefcount_ptr_t));
+	return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrefcount_droot_space(struct xfs_btree_block *bb)
+{
+	return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level),
+			be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
+		struct xfs_btree_block *rblock, int rblocklen,
+		struct xfs_rtrefcount_root *dblock, int dblocklen);
+void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 2b5f7a143c479..317a27e6a5a4b 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -23,6 +23,7 @@
 #include "xfs_icache.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 STATIC void
 xlog_recover_inode_ra_pass2(
@@ -284,6 +285,9 @@ xlog_recover_inode_dbroot(
 	case XFS_DINODE_FMT_RMAP:
 		xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize);
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize);
+		break;
 	default:
 		ASSERT(0);
 		return -EFSCORRUPTED;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/44] xfs: wire up realtime refcount btree cursors
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 21:47   ` [PATCH 13/44] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
@ 2023-12-31 21:48   ` Darrick J. Wong
  2023-12-31 21:48   ` [PATCH 15/44] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
                     ` (29 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:48 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up realtime refcount btree cursors wherever they're needed
throughout the code base.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |  101 +++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/libxfs/xfs_rtgroup.c  |   10 ++++
 fs/xfs/libxfs/xfs_rtgroup.h  |    5 ++
 fs/xfs/xfs_fsmap.c           |   22 ++++++---
 fs/xfs/xfs_reflink.c         |   99 +++++++++++++++++++++++++++++++++--------
 5 files changed, 206 insertions(+), 31 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 2ae126d3bd7ff..60e838adde0d8 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -27,6 +27,7 @@
 #include "xfs_refcount_item.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtalloc.h"
+#include "xfs_rtrefcount_btree.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -1486,6 +1487,33 @@ xfs_refcount_finish_one(
 	return error;
 }
 
+/*
+ * Set up a continuation a deferred rtrefcount operation by updating the
+ * intent.  Checks to make sure we're not going to run off the end of the
+ * rtgroup.
+ */
+static inline int
+xfs_rtrefcount_continue_op(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_intent	*ri,
+	xfs_agblock_t			new_agbno)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rtgroup		*rtg = ri->ri_rtg;
+
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno,
+					ri->ri_blockcount))) {
+		xfs_btree_mark_sick(cur);
+		return -EFSCORRUPTED;
+	}
+
+	ri->ri_startblock = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno, new_agbno);
+
+	ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount));
+	ASSERT(rtg->rtg_rgno == xfs_rtb_to_rgno(mp, ri->ri_startblock));
+	return 0;
+}
+
 /*
  * Process one of the deferred realtime refcount operations.  We pass back the
  * btree cursor to maintain our lock on the btree between calls.
@@ -1496,8 +1524,77 @@ xfs_rtrefcount_finish_one(
 	struct xfs_refcount_intent	*ri,
 	struct xfs_btree_cur		**pcur)
 {
-	ASSERT(0);
-	return -EFSCORRUPTED;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur = *pcur;
+	int				error = 0;
+	xfs_rgnumber_t			rgno;
+	xfs_rgblock_t			bno;
+	unsigned long			nr_ops = 0;
+	int				shape_changes = 0;
+
+	bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock, &rgno);
+
+	trace_xfs_refcount_deferred(mp, ri);
+
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+		return -EIO;
+
+	/*
+	 * If we haven't gotten a cursor or the cursor AG doesn't match
+	 * the startblock, get one now.
+	 */
+	if (rcur != NULL && rcur->bc_ino.rtg != ri->ri_rtg) {
+		nr_ops = xrefc_btree_state(rcur)->nr_ops;
+		shape_changes = xrefc_btree_state(rcur)->shape_changes;
+		xfs_btree_del_cursor(rcur, 0);
+		rcur = NULL;
+		*pcur = NULL;
+	}
+	if (rcur == NULL) {
+		xfs_rtgroup_lock(tp, ri->ri_rtg, XFS_RTGLOCK_REFCOUNT);
+		*pcur = rcur = xfs_rtrefcountbt_init_cursor(mp, tp, ri->ri_rtg,
+						ri->ri_rtg->rtg_refcountip);
+
+		xrefc_btree_state(rcur)->nr_ops = nr_ops;
+		xrefc_btree_state(rcur)->shape_changes = shape_changes;
+	}
+
+	switch (ri->ri_type) {
+	case XFS_REFCOUNT_INCREASE:
+		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+				XFS_REFCOUNT_ADJUST_INCREASE);
+		if (error)
+			return error;
+		if (ri->ri_blockcount > 0)
+			error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+		break;
+	case XFS_REFCOUNT_DECREASE:
+		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+				XFS_REFCOUNT_ADJUST_DECREASE);
+		if (error)
+			return error;
+		if (ri->ri_blockcount > 0)
+			error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+		break;
+	case XFS_REFCOUNT_ALLOC_COW:
+		error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+		if (error)
+			return error;
+		ri->ri_blockcount = 0;
+		break;
+	case XFS_REFCOUNT_FREE_COW:
+		error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+		if (error)
+			return error;
+		ri->ri_blockcount = 0;
+		break;
+	default:
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+	if (!error && ri->ri_blockcount > 0)
+		trace_xfs_refcount_finish_one_leftover(mp, ri);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 8bc97c9aa4c9c..173c3887788f7 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -561,6 +561,13 @@ xfs_rtgroup_lock(
 		if (tp)
 			xfs_trans_ijoin(tp, rtg->rtg_rmapip, XFS_ILOCK_EXCL);
 	}
+
+	if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg->rtg_refcountip) {
+		xfs_ilock(rtg->rtg_refcountip, XFS_ILOCK_EXCL);
+		if (tp)
+			xfs_trans_ijoin(tp, rtg->rtg_refcountip,
+					XFS_ILOCK_EXCL);
+	}
 }
 
 /* Unlock metadata inodes associated with this rt group. */
@@ -573,6 +580,9 @@ xfs_rtgroup_unlock(
 	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
 	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
 
+	if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg->rtg_refcountip)
+		xfs_iunlock(rtg->rtg_refcountip, XFS_ILOCK_EXCL);
+
 	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg->rtg_rmapip)
 		xfs_iunlock(rtg->rtg_rmapip, XFS_ILOCK_EXCL);
 
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index bd88a4d728135..659d0c15d2ade 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -248,10 +248,13 @@ int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
 #define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
 /* Lock the rt rmap inode in exclusive mode */
 #define XFS_RTGLOCK_RMAP		(1U << 2)
+/* Lock the rt refcount inode in exclusive mode */
+#define XFS_RTGLOCK_REFCOUNT		(1U << 3)
 
 #define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
 				 XFS_RTGLOCK_BITMAP_SHARED | \
-				 XFS_RTGLOCK_RMAP)
+				 XFS_RTGLOCK_RMAP | \
+				 XFS_RTGLOCK_REFCOUNT)
 
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index b0eabc76eb28a..cc8175af986aa 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -27,6 +27,7 @@
 #include "xfs_ag.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 /* Convert an xfs_fsmap to an fsmap. */
 static void
@@ -216,14 +217,16 @@ xfs_getfsmap_is_shared(
 	*stat = false;
 	if (!xfs_has_reflink(mp))
 		return 0;
-	/* rt files will have no perag structure */
-	if (!info->pag)
-		return 0;
+
+	if (info->rtg)
+		cur = xfs_rtrefcountbt_init_cursor(mp, tp, info->rtg,
+				info->rtg->rtg_refcountip);
+	else
+		cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+				info->pag);
 
 	/* Are there any shared blocks here? */
 	flen = 0;
-	cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag);
-
 	error = xfs_refcount_find_shared(cur, rec->rm_startblock,
 			rec->rm_blockcount, &fbno, &flen, false);
 
@@ -828,7 +831,8 @@ xfs_getfsmap_rtdev_rmapbt_query(
 				info);
 
 	/* Query the rtrmapbt */
-	xfs_rtgroup_lock(NULL, info->rtg, XFS_RTGLOCK_RMAP);
+	xfs_rtgroup_lock(NULL, info->rtg, XFS_RTGLOCK_RMAP |
+					  XFS_RTGLOCK_REFCOUNT);
 	*curpp = xfs_rtrmapbt_init_cursor(mp, tp, info->rtg,
 			info->rtg->rtg_rmapip);
 	return xfs_rmap_query_range(*curpp, &info->low, &info->high,
@@ -917,7 +921,8 @@ xfs_getfsmap_rtdev_rmapbt(
 
 		if (bt_cur) {
 			xfs_rtgroup_unlock(bt_cur->bc_ino.rtg,
-					XFS_RTGLOCK_RMAP);
+					XFS_RTGLOCK_RMAP |
+					XFS_RTGLOCK_REFCOUNT);
 			xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
 			bt_cur = NULL;
 		}
@@ -954,7 +959,8 @@ xfs_getfsmap_rtdev_rmapbt(
 	}
 
 	if (bt_cur) {
-		xfs_rtgroup_unlock(bt_cur->bc_ino.rtg, XFS_RTGLOCK_RMAP);
+		xfs_rtgroup_unlock(bt_cur->bc_ino.rtg, XFS_RTGLOCK_RMAP |
+						       XFS_RTGLOCK_REFCOUNT);
 		xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
 							 XFS_BTREE_NOERROR);
 	}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 591782ca7d284..a10d43a1a7da4 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -30,6 +30,9 @@
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 #include "xfs_health.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -155,6 +158,38 @@ xfs_reflink_find_shared(
 	return error;
 }
 
+/*
+ * Given an RT extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen.  If
+ * find_end_of_shared is true, return the longest contiguous extent of
+ * shared blocks.  If there are no shared extents, fbno and flen will
+ * be set to NULLRGBLOCK and 0, respectively.
+ */
+static int
+xfs_reflink_find_rtshared(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_trans	*tp,
+	xfs_agblock_t		rtbno,
+	xfs_extlen_t		rtlen,
+	xfs_agblock_t		*fbno,
+	xfs_extlen_t		*flen,
+	bool			find_end_of_shared)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
+
+	xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
+	cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg, rtg->rtg_refcountip);
+	error = xfs_refcount_find_shared(cur, rtbno, rtlen, fbno, flen,
+			find_end_of_shared);
+	xfs_btree_del_cursor(cur, error);
+	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
+	return error;
+}
+
 /*
  * Trim the mapping to the next block where there's a change in the
  * shared/unshared status.  More specifically, this means that we
@@ -172,9 +207,7 @@ xfs_reflink_trim_around_shared(
 	bool			*shared)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	xfs_agblock_t		agbno;
-	xfs_extlen_t		aglen;
+	xfs_agblock_t		orig_bno;
 	xfs_agblock_t		fbno;
 	xfs_extlen_t		flen;
 	int			error = 0;
@@ -187,13 +220,25 @@ xfs_reflink_trim_around_shared(
 
 	trace_xfs_reflink_trim_around_shared(ip, irec);
 
-	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
-	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
-	aglen = irec->br_blockcount;
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		struct xfs_rtgroup	*rtg;
+		xfs_rgnumber_t		rgno;
 
-	error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
-			true);
-	xfs_perag_put(pag);
+		orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
+		rtg = xfs_rtgroup_get(mp, rgno);
+		error = xfs_reflink_find_rtshared(rtg, NULL, orig_bno,
+				irec->br_blockcount, &fbno, &flen, true);
+		xfs_rtgroup_put(rtg);
+	} else {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
+					irec->br_startblock));
+		orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
+		error = xfs_reflink_find_shared(pag, NULL, orig_bno,
+				irec->br_blockcount, &fbno, &flen, true);
+		xfs_perag_put(pag);
+	}
 	if (error)
 		return error;
 
@@ -203,7 +248,7 @@ xfs_reflink_trim_around_shared(
 		return 0;
 	}
 
-	if (fbno == agbno) {
+	if (fbno == orig_bno) {
 		/*
 		 * The start of this extent is shared.  Truncate the
 		 * mapping at the end of the shared region so that a
@@ -221,7 +266,7 @@ xfs_reflink_trim_around_shared(
 	 * extent so that a subsequent iteration starts at the
 	 * start of the shared region.
 	 */
-	irec->br_blockcount = fbno - agbno;
+	irec->br_blockcount = fbno - orig_bno;
 	return 0;
 }
 
@@ -1582,9 +1627,6 @@ xfs_reflink_inode_has_shared_extents(
 	*has_shared = false;
 	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
 	while (found) {
-		struct xfs_perag	*pag;
-		xfs_agblock_t		agbno;
-		xfs_extlen_t		aglen;
 		xfs_agblock_t		rbno;
 		xfs_extlen_t		rlen;
 
@@ -1592,12 +1634,29 @@ xfs_reflink_inode_has_shared_extents(
 		    got.br_state != XFS_EXT_NORM)
 			goto next;
 
-		pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
-		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
-		aglen = got.br_blockcount;
-		error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
-				&rbno, &rlen, false);
-		xfs_perag_put(pag);
+		if (XFS_IS_REALTIME_INODE(ip)) {
+			struct xfs_rtgroup	*rtg;
+			xfs_rgnumber_t		rgno;
+			xfs_rgblock_t		rgbno;
+
+			rgbno = xfs_rtb_to_rgbno(mp, got.br_startblock, &rgno);
+			rtg = xfs_rtgroup_get(mp, rgno);
+			error = xfs_reflink_find_rtshared(rtg, tp, rgbno,
+					got.br_blockcount, &rbno, &rlen,
+					false);
+			xfs_rtgroup_put(rtg);
+		} else {
+			struct xfs_perag	*pag;
+			xfs_agblock_t		agbno;
+
+			pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
+						got.br_startblock));
+			agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
+			error = xfs_reflink_find_shared(pag, tp, agbno,
+					got.br_blockcount, &rbno, &rlen,
+					false);
+			xfs_perag_put(pag);
+		}
 		if (error)
 			return error;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/44] xfs: create routine to allocate and initialize a realtime refcount btree inode
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 21:48   ` [PATCH 14/44] xfs: wire up realtime refcount btree cursors Darrick J. Wong
@ 2023-12-31 21:48   ` Darrick J. Wong
  2023-12-31 21:48   ` [PATCH 16/44] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
                     ` (28 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:48 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a library routine to allocate and initialize an empty realtime
refcountbt inode.  We'll use this for growfs, mkfs, and repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |   34 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtrefcount_btree.h |    5 +++++
 2 files changed, 39 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index ae8dea035d29f..fb0e4abcd6f6a 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -766,3 +766,37 @@ xfs_iflush_rtrefcount(
 			ifp->if_broot_bytes, dfp,
 			XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
 }
+
+/*
+ * Create a realtime refcount btree inode.
+ *
+ * Regardless of the return value, the caller must clean up @upd.  If a new
+ * inode is returned through *ipp, the caller must finish setting up the incore
+ * inode and release it.
+ */
+int
+xfs_rtrefcountbt_create(
+	struct xfs_imeta_update	*upd,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = upd->mp;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	error = xfs_imeta_create(upd, S_IFREG, ipp);
+	if (error)
+		return error;
+
+	ifp = xfs_ifork_ptr(upd->ip, XFS_DATA_FORK);
+	ifp->if_format = XFS_DINODE_FMT_REFCOUNT;
+	ASSERT(ifp->if_broot_bytes == 0);
+	ASSERT(ifp->if_bytes == 0);
+
+	/* Initialize the empty incore btree root. */
+	xfs_iroot_alloc(upd->ip, XFS_DATA_FORK,
+			xfs_rtrefcount_broot_space_calc(mp, 0, 0));
+	xfs_btree_init_block(mp, ifp->if_broot, &xfs_rtrefcountbt_ops,
+			0, 0, upd->ip->i_ino);
+	xfs_trans_log_inode(upd->tp, upd->ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
index bd070e54781a1..749c6fd02f837 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -188,4 +188,9 @@ void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
 		struct xfs_rtrefcount_root *dblock, int dblocklen);
 void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
 
+struct xfs_imeta_update;
+
+int xfs_rtrefcountbt_create(struct xfs_imeta_update *upd,
+		struct xfs_inode **ipp);
+
 #endif	/* __XFS_RTREFCOUNT_BTREE_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/44] xfs: update rmap to allow cow staging extents in the rt rmap
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 21:48   ` [PATCH 15/44] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
@ 2023-12-31 21:48   ` Darrick J. Wong
  2023-12-31 21:49   ` [PATCH 17/44] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
                     ` (27 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:48 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Don't error out on CoW staging extent records when realtime reflink is
enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rmap.c |   14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 43108a195004c..501427d2f38cb 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -276,6 +276,7 @@ xfs_rtrmap_check_irec(
 	bool				is_unwritten;
 	bool				is_bmbt;
 	bool				is_attr;
+	bool				is_cow;
 
 	if (irec->rm_blockcount == 0)
 		return __this_address;
@@ -287,6 +288,12 @@ xfs_rtrmap_check_irec(
 			return __this_address;
 		if (irec->rm_offset != 0)
 			return __this_address;
+	} else if (irec->rm_owner == XFS_RMAP_OWN_COW) {
+		if (!xfs_has_rtreflink(mp))
+			return __this_address;
+		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+					    irec->rm_blockcount))
+			return __this_address;
 	} else {
 		if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
 					    irec->rm_blockcount))
@@ -303,8 +310,10 @@ xfs_rtrmap_check_irec(
 	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
 	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
 	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+	is_cow = xfs_has_rtreflink(mp) &&
+		 irec->rm_owner == XFS_RMAP_OWN_COW;
 
-	if (!is_inode && irec->rm_owner != XFS_RMAP_OWN_FS)
+	if (!is_inode && !is_cow && irec->rm_owner != XFS_RMAP_OWN_FS)
 		return __this_address;
 
 	if (!is_inode && irec->rm_offset != 0)
@@ -316,6 +325,9 @@ xfs_rtrmap_check_irec(
 	if (is_unwritten && !is_inode)
 		return __this_address;
 
+	if (is_unwritten && is_cow)
+		return __this_address;
+
 	/* Check for a valid fork offset, if applicable. */
 	if (is_inode &&
 	    !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/44] xfs: compute rtrmap btree max levels when reflink enabled
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 21:48   ` [PATCH 16/44] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
@ 2023-12-31 21:49   ` Darrick J. Wong
  2023-12-31 21:49   ` [PATCH 18/44] xfs: refactor reflink quota updates Darrick J. Wong
                     ` (26 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:49 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Compute the maximum possible height of the realtime rmap btree when
reflink is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_rtrmap_btree.c |   28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 3084153af3a43..87ea5e3ca8937 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -738,6 +738,7 @@ xfs_rtrmapbt_maxrecs(
 unsigned int
 xfs_rtrmapbt_maxlevels_ondisk(void)
 {
+	unsigned long long	max_dblocks;
 	unsigned int		minrecs[2];
 	unsigned int		blocklen;
 
@@ -746,8 +747,20 @@ xfs_rtrmapbt_maxlevels_ondisk(void)
 	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
 	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
 
-	/* We need at most one record for every block in an rt group. */
-	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+	/*
+	 * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
+	 *
+	 * On a reflink filesystem, each block in an rtgroup can have up to
+	 * 2^32 (per the refcount record format) owners, which means that
+	 * theoretically we could face up to 2^64 rmap records.  However, we're
+	 * likely to run out of blocks in the data device long before that
+	 * happens, which means that we must compute the max height based on
+	 * what the btree will look like if it consumes almost all the blocks
+	 * in the data device due to maximal sharing factor.
+	 */
+	max_dblocks = -1U; /* max ag count */
+	max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
+	return xfs_btree_space_to_height(minrecs, max_dblocks);
 }
 
 int __init
@@ -786,9 +799,20 @@ xfs_rtrmapbt_compute_maxlevels(
 	 * maximum height is constrained by the size of the data device and
 	 * the height required to store one rmap record for each block in an
 	 * rt group.
+	 *
+	 * On a reflink filesystem, each rt block can have up to 2^32 (per the
+	 * refcount record format) owners, which means that theoretically we
+	 * could face up to 2^64 rmap records.  This makes the computation of
+	 * maxlevels based on record count meaningless, so we only consider the
+	 * size of the data device.
 	 */
 	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
 				mp->m_sb.sb_dblocks);
+	if (xfs_has_rtreflink(mp)) {
+		mp->m_rtrmap_maxlevels = d_maxlevels + 1;
+		return;
+	}
+
 	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
 				mp->m_sb.sb_rgblocks);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/44] xfs: refactor reflink quota updates
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 21:49   ` [PATCH 17/44] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
@ 2023-12-31 21:49   ` Darrick J. Wong
  2023-12-31 21:49   ` [PATCH 19/44] xfs: enable CoW for realtime data Darrick J. Wong
                     ` (25 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:49 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Hoist all quota updates for reflink into a helper function, since things
are about to become more complicated.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index a10d43a1a7da4..8e352b23dacf2 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -753,6 +753,35 @@ xfs_reflink_cancel_cow_range(
 	return error;
 }
 
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Update quota accounting for a remapping operation.  When we're remapping
+ * something from the CoW fork to the data fork, we must update the quota
+ * accounting for delayed allocations.  For remapping from the data fork to the
+ * data fork, use regular block accounting.
+ */
+static inline void
+xfs_reflink_update_quota(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	bool			is_cow,
+	int64_t			blocks)
+{
+	unsigned int		qflag;
+
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT :
+				 XFS_TRANS_DQ_RTBCOUNT;
+	} else {
+		qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT :
+				 XFS_TRANS_DQ_BCOUNT;
+	}
+	xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks);
+}
+#else
+# define xfs_reflink_update_quota(tp, ip, is_cow, blocks)	((void)0)
+#endif
+
 /*
  * Remap part of the CoW fork into the data fork.
  *
@@ -856,8 +885,7 @@ xfs_reflink_end_cow_extent(
 		 */
 		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
 		xfs_refcount_decrease_extent(tp, isrt, &data);
-		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-				-data.br_blockcount);
+		xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
 	} else if (data.br_startblock == DELAYSTARTBLOCK) {
 		int		done;
 
@@ -882,8 +910,7 @@ xfs_reflink_end_cow_extent(
 	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
 
 	/* Charge this new data fork mapping to the on-disk quota. */
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
-			(long)del.br_blockcount);
+	xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
 
 	/* Remove the mapping from the CoW fork. */
 	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
@@ -1373,7 +1400,7 @@ xfs_reflink_remap_extent(
 		qdelta += dmap->br_blockcount;
 	}
 
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+	xfs_reflink_update_quota(tp, ip, false, qdelta);
 
 	/* Update dest isize if needed. */
 	newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/44] xfs: enable CoW for realtime data
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 21:49   ` [PATCH 18/44] xfs: refactor reflink quota updates Darrick J. Wong
@ 2023-12-31 21:49   ` Darrick J. Wong
  2023-12-31 21:49   ` [PATCH 20/44] xfs: enable sharing of realtime file blocks Darrick J. Wong
                     ` (24 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:49 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update our write paths to support copy on write on the rt volume.  This
works in more or less the same way as it does on the data device, with
the major exception that we never do delalloc on the rt volume.

Because we consider unwritten CoW fork staging extents to be incore
quota reservation, we update xfs_quota_reserve_blkres to support this
case.  Though xfs doesn't allow rt and quota together, the change is
trivial and we shouldn't leave a logic bomb here.

While we're at it, add a missing xfs_mod_delalloc call when we remove
delalloc block reservation from the inode.  This is largely irrelvant
since realtime files do not use delalloc, but we want to avoid leaving
logic bombs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_bmap_util.c   |   61 ++++++++++++++++++++++++++++++++++++++--------
 fs/xfs/xfs_quota.h       |    6 +----
 fs/xfs/xfs_reflink.c     |   36 +++++++++++++++++++++------
 fs/xfs/xfs_trans_dquot.c |   11 ++++++++
 4 files changed, 90 insertions(+), 24 deletions(-)


diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ef8658a9724dd..a7a99177bbf8b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -71,6 +71,55 @@ xfs_zero_extent(
 }
 
 #ifdef CONFIG_XFS_RT
+
+/* Update all inode and quota accounting for the allocation we just did. */
+static void
+xfs_bmap_rtalloc_accounting(
+	struct xfs_bmalloca	*ap)
+{
+	if (ap->flags & XFS_BMAPI_COWFORK) {
+		/*
+		 * COW fork blocks are in-core only and thus are treated as
+		 * in-core quota reservation (like delalloc blocks) even when
+		 * converted to real blocks. The quota reservation is not
+		 * accounted to disk until blocks are remapped to the data
+		 * fork. So if these blocks were previously delalloc, we
+		 * already have quota reservation and there's nothing to do
+		 * yet.
+		 */
+		if (ap->wasdel) {
+			xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+			return;
+		}
+
+		/*
+		 * Otherwise, we've allocated blocks in a hole. The transaction
+		 * has acquired in-core quota reservation for this extent.
+		 * Rather than account these as real blocks, however, we reduce
+		 * the transaction quota reservation based on the allocation.
+		 * This essentially transfers the transaction quota reservation
+		 * to that of a delalloc extent.
+		 */
+		ap->ip->i_delayed_blks += ap->length;
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+				XFS_TRANS_DQ_RES_RTBLKS, -(long)ap->length);
+		return;
+	}
+
+	/* data fork only */
+	ap->ip->i_nblocks += ap->length;
+	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+	if (ap->wasdel) {
+		ap->ip->i_delayed_blks -= ap->length;
+		xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+	}
+
+	/* Adjust the disk quota also. This was reserved earlier. */
+	xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+				     XFS_TRANS_DQ_RTBCOUNT, ap->length);
+}
+
 int
 xfs_bmap_rtalloc(
 	struct xfs_bmalloca	*ap)
@@ -166,17 +215,7 @@ xfs_bmap_rtalloc(
 	if (rtx != NULLRTEXTNO) {
 		ap->blkno = xfs_rtx_to_rtb(mp, rtx);
 		ap->length = xfs_rtxlen_to_extlen(mp, ralen);
-		ap->ip->i_nblocks += ap->length;
-		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-		if (ap->wasdel)
-			ap->ip->i_delayed_blks -= ap->length;
-		/*
-		 * Adjust the disk quota also. This was reserved
-		 * earlier.
-		 */
-		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
-			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
-					XFS_TRANS_DQ_RTBCOUNT, ap->length);
+		xfs_bmap_rtalloc_accounting(ap);
 		return 0;
 	}
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 55320c9ff1367..165013f03db9e 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -129,11 +129,7 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
 extern void xfs_qm_unmount(struct xfs_mount *);
 extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
-static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
-	return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
-}
+int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks);
 bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
 
 # ifdef CONFIG_XFS_LIVE_HOOKS
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8e352b23dacf2..ed9f4ca34fcea 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -434,20 +434,26 @@ xfs_reflink_fill_cow_hole(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	xfs_filblks_t		resaligned;
-	xfs_extlen_t		resblks;
+	unsigned int		dblocks = 0, rblocks = 0;
 	int			nimaps;
 	int			error;
 	bool			found;
 
 	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
 		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
-	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+		rblocks = resaligned;
+	} else {
+		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+		rblocks = 0;
+	}
 
 	xfs_iunlock(ip, *lockmode);
 	*lockmode = 0;
 
-	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
-			false, &tp);
+	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+			rblocks, false, &tp);
 	if (error)
 		return error;
 
@@ -1236,7 +1242,7 @@ xfs_reflink_remap_extent(
 	struct xfs_trans	*tp;
 	xfs_off_t		newlen;
 	int64_t			qdelta = 0;
-	unsigned int		resblks;
+	unsigned int		dblocks, rblocks, resblks;
 	bool			quota_reserved = true;
 	bool			smap_real;
 	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
@@ -1267,8 +1273,15 @@ xfs_reflink_remap_extent(
 	 * we're remapping.
 	 */
 	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		dblocks = resblks;
+		rblocks = dmap->br_blockcount;
+	} else {
+		dblocks = resblks + dmap->br_blockcount;
+		rblocks = 0;
+	}
 	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
-			resblks + dmap->br_blockcount, 0, false, &tp);
+			dblocks, rblocks, false, &tp);
 	if (error == -EDQUOT || error == -ENOSPC) {
 		quota_reserved = false;
 		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
@@ -1348,8 +1361,15 @@ xfs_reflink_remap_extent(
 	 * done.
 	 */
 	if (!quota_reserved && !smap_real && dmap_written) {
-		error = xfs_trans_reserve_quota_nblks(tp, ip,
-				dmap->br_blockcount, 0, false);
+		if (XFS_IS_REALTIME_INODE(ip)) {
+			dblocks = 0;
+			rblocks = dmap->br_blockcount;
+		} else {
+			dblocks = dmap->br_blockcount;
+			rblocks = 0;
+		}
+		error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+				false);
 		if (error)
 			goto out_cancel;
 	}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 6983e35b7c2b7..5a0bdc8e06fca 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -1020,3 +1020,14 @@ xfs_trans_free_dqinfo(
 	kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
 	tp->t_dqinfo = NULL;
 }
+
+int
+xfs_quota_reserve_blkres(
+	struct xfs_inode	*ip,
+	int64_t			blocks)
+{
+	if (XFS_IS_REALTIME_INODE(ip))
+		return xfs_trans_reserve_quota_nblks(NULL, ip, 0, blocks,
+				false);
+	return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/44] xfs: enable sharing of realtime file blocks
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 21:49   ` [PATCH 19/44] xfs: enable CoW for realtime data Darrick J. Wong
@ 2023-12-31 21:49   ` Darrick J. Wong
  2023-12-31 21:50   ` [PATCH 21/44] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
                     ` (23 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:49 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the remapping routines to be able to handle realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ed9f4ca34fcea..d05ab2c91bd98 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -33,6 +33,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
+#include "xfs_imeta.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -1211,14 +1212,29 @@ xfs_reflink_update_dest(
 static int
 xfs_reflink_ag_has_free_space(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
+	struct xfs_inode	*ip,
+	xfs_fsblock_t		fsb)
 {
 	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
 	int			error = 0;
 
 	if (!xfs_has_rmapbt(mp))
 		return 0;
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		struct xfs_rtgroup	*rtg;
+		xfs_rgnumber_t		rgno;
 
+		rgno = xfs_rtb_to_rgno(mp, fsb);
+		rtg = xfs_rtgroup_get(mp, rgno);
+		if (xfs_imeta_resv_critical(rtg->rtg_rmapip) ||
+		    xfs_imeta_resv_critical(rtg->rtg_refcountip))
+			error = -ENOSPC;
+		xfs_rtgroup_put(rtg);
+		return error;
+	}
+
+	agno = XFS_FSB_TO_AGNO(mp, fsb);
 	pag = xfs_perag_get(mp, agno);
 	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
 	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
@@ -1332,8 +1348,8 @@ xfs_reflink_remap_extent(
 
 	/* No reflinking if the AG of the dest mapping is low on space. */
 	if (dmap_written) {
-		error = xfs_reflink_ag_has_free_space(mp,
-				XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
+		error = xfs_reflink_ag_has_free_space(mp, ip,
+				dmap->br_startblock);
 		if (error)
 			goto out_cancel;
 	}
@@ -1593,8 +1609,8 @@ xfs_reflink_remap_prep(
 
 	/* Check file eligibility and prepare for block sharing. */
 	ret = -EINVAL;
-	/* Don't reflink realtime inodes */
-	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+	/* Can't reflink between data and rt volumes */
+	if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
 		goto out_unlock;
 
 	/* Don't share DAX file data with non-DAX file. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/44] xfs: allow inodes to have the realtime and reflink flags
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 21:49   ` [PATCH 20/44] xfs: enable sharing of realtime file blocks Darrick J. Wong
@ 2023-12-31 21:50   ` Darrick J. Wong
  2023-12-31 21:50   ` [PATCH 22/44] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
                     ` (22 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:50 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we can share blocks between realtime files, allow this
combination.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |    3 ++-
 fs/xfs/scrub/inode.c          |    5 +++--
 fs/xfs/scrub/inode_repair.c   |    6 ------
 fs/xfs/xfs_ioctl.c            |    4 ----
 4 files changed, 5 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 6e08ff8d8e239..ba37b864f6a8b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -691,7 +691,8 @@ xfs_dinode_verify(
 		return __this_address;
 
 	/* don't let reflink and realtime mix */
-	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) &&
+	    !xfs_has_rtreflink(mp))
 		return __this_address;
 
 	/* COW extent size hint validation */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 5fc10e02b9c41..705865ec6c1c0 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -359,8 +359,9 @@ xchk_inode_flags2(
 	if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
 		goto bad;
 
-	/* realtime and reflink make no sense, currently */
-	if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+	/* realtime and reflink don't always go together */
+	if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) &&
+	    !xfs_has_rtreflink(mp))
 		goto bad;
 
 	/* no bigtime iflag without the bigtime feature */
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index f4f6ed6ef5120..8d67ae257e597 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -509,8 +509,6 @@ xrep_dinode_flags(
 		flags2 |= XFS_DIFLAG2_REFLINK;
 	else
 		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
-	if (flags & XFS_DIFLAG_REALTIME)
-		flags2 &= ~XFS_DIFLAG2_REFLINK;
 	if (!xfs_has_bigtime(mp))
 		flags2 &= ~XFS_DIFLAG2_BIGTIME;
 	if (!xfs_has_large_extent_counts(mp))
@@ -1716,10 +1714,6 @@ xrep_inode_flags(
 	/* DAX only applies to files and dirs. */
 	if (!(S_ISREG(mode) || S_ISDIR(mode)))
 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
-
-	/* No reflink files on the realtime device. */
-	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
-		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
 }
 
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 32a52799ae826..4559d122101cd 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1119,10 +1119,6 @@ xfs_ioctl_setattr_xflags(
 		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
 		    xfs_extlen_to_rtxmod(mp, ip->i_extsize))
 			return -EINVAL;
-
-		/* Clear reflink if we are actually able to set the rt flag. */
-		if (xfs_is_reflink_inode(ip))
-			ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
 	}
 
 	/* diflags2 only valid for v3 inodes. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/44] xfs: refcover CoW leftovers in the realtime volume
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 21:50   ` [PATCH 21/44] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
@ 2023-12-31 21:50   ` Darrick J. Wong
  2023-12-31 21:50   ` [PATCH 23/44] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
                     ` (21 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:50 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Scan the realtime refcount tree at mount time to get rid of leftover
CoW staging extents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_refcount.c |   64 +++++++++++++++++++++++++++++++++---------
 fs/xfs/libxfs/xfs_refcount.h |    4 ++-
 fs/xfs/xfs_reflink.c         |   14 ++++++++-
 3 files changed, 65 insertions(+), 17 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 60e838adde0d8..b4ec6077270a7 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -2077,14 +2077,15 @@ xfs_refcount_recover_extent(
 }
 
 /* Find and remove leftover CoW reservations. */
-int
-xfs_refcount_recover_cow_leftovers(
+static int
+xfs_refcount_recover_group_cow_leftovers(
 	struct xfs_mount		*mp,
-	struct xfs_perag		*pag)
+	struct xfs_perag		*pag,
+	struct xfs_rtgroup		*rtg)
 {
 	struct xfs_trans		*tp;
 	struct xfs_btree_cur		*cur;
-	struct xfs_buf			*agbp;
+	struct xfs_buf			*agbp = NULL;
 	struct xfs_refcount_recovery	*rr, *n;
 	struct list_head		debris;
 	union xfs_btree_irec		low = {
@@ -2099,7 +2100,12 @@ xfs_refcount_recover_cow_leftovers(
 
 	/* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
 	BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
-	if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+	if (pag && mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+		return -EOPNOTSUPP;
+
+	/* rtreflink filesystems can't have rtgroups larger than 2^31-1 blocks */
+	BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG);
+	if (rtg && mp->m_sb.sb_rgblocks >= XFS_MAX_RGBLOCKS)
 		return -EOPNOTSUPP;
 
 	INIT_LIST_HEAD(&debris);
@@ -2118,16 +2124,25 @@ xfs_refcount_recover_cow_leftovers(
 	if (error)
 		return error;
 
-	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
-	if (error)
-		goto out_trans;
-	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+	if (rtg) {
+		xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
+		cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg,
+				rtg->rtg_refcountip);
+	} else {
+		error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+		if (error)
+			goto out_trans;
+		cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+	}
 
 	/* Find all the leftover CoW staging extents. */
 	error = xfs_btree_query_range(cur, &low, &high,
 			xfs_refcount_recover_extent, &debris);
 	xfs_btree_del_cursor(cur, error);
-	xfs_trans_brelse(tp, agbp);
+	if (agbp)
+		xfs_trans_brelse(tp, agbp);
+	else
+		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
 	xfs_trans_cancel(tp);
 	if (error)
 		goto out_free;
@@ -2140,15 +2155,20 @@ xfs_refcount_recover_cow_leftovers(
 			goto out_free;
 
 		/* Free the orphan record */
-		fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
-				rr->rr_rrec.rc_startblock);
-		xfs_refcount_free_cow_extent(tp, false, fsb,
+		if (rtg)
+			fsb = xfs_rgbno_to_rtb(mp, rtg->rtg_rgno,
+					rr->rr_rrec.rc_startblock);
+		else
+			fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+					rr->rr_rrec.rc_startblock);
+		xfs_refcount_free_cow_extent(tp, rtg != NULL, fsb,
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
 		error = xfs_free_extent_later(tp, fsb,
 				rr->rr_rrec.rc_blockcount, NULL,
-				XFS_AG_RESV_NONE, 0);
+				XFS_AG_RESV_NONE,
+				rtg != NULL ? XFS_FREE_EXTENT_REALTIME : 0);
 		if (error)
 			goto out_trans;
 
@@ -2172,6 +2192,22 @@ xfs_refcount_recover_cow_leftovers(
 	return error;
 }
 
+int
+xfs_refcount_recover_cow_leftovers(
+	struct xfs_mount		*mp,
+	struct xfs_perag		*pag)
+{
+	return xfs_refcount_recover_group_cow_leftovers(mp, pag, NULL);
+}
+
+int
+xfs_refcount_recover_rtcow_leftovers(
+	struct xfs_mount		*mp,
+	struct xfs_rtgroup		*rtg)
+{
+	return xfs_refcount_recover_group_cow_leftovers(mp, NULL, rtg);
+}
+
 /*
  * Scan part of the keyspace of the refcount records and tell us if the area
  * has no records, is fully mapped by records, or is partially filled.
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 56e5834feb624..18e0479d3d1d0 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -97,8 +97,10 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
 		xfs_fsblock_t fsb, xfs_extlen_t len);
 void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
 		xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		struct xfs_perag *pag);
+int xfs_refcount_recover_rtcow_leftovers(struct xfs_mount *mp,
+		struct xfs_rtgroup *rtg);
 
 /*
  * While we're adjusting the refcounts records of an extent, we have
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d05ab2c91bd98..b0f3170c11c8b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1006,7 +1006,9 @@ xfs_reflink_recover_cow(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
+	struct xfs_rtgroup	*rtg;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	int			error = 0;
 
 	if (!xfs_has_reflink(mp))
@@ -1016,11 +1018,19 @@ xfs_reflink_recover_cow(
 		error = xfs_refcount_recover_cow_leftovers(mp, pag);
 		if (error) {
 			xfs_perag_rele(pag);
-			break;
+			return error;
 		}
 	}
 
-	return error;
+	for_each_rtgroup(mp, rgno, rtg) {
+		error = xfs_refcount_recover_rtcow_leftovers(mp, rtg);
+		if (error) {
+			xfs_rtgroup_rele(rtg);
+			return error;
+		}
+	}
+
+	return 0;
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/44] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 21:50   ` [PATCH 22/44] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
@ 2023-12-31 21:50   ` Darrick J. Wong
  2023-12-31 21:50   ` [PATCH 24/44] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
                     ` (20 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:50 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, we (ab)use xfs_get_extsz_hint so that it always returns a
nonzero value for realtime files.  This apparently was done to disable
delayed allocation for realtime files.

However, once we enable realtime reflink, we can also turn on the
alwayscow flag to force CoW writes to realtime files.  In this case, the
logic will incorrectly send the write through the delalloc write path.

Fix this by adjusting the logic slightly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 27f992dc6d2d6..316b574b34b8a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6384,9 +6384,8 @@ xfs_get_extsz_hint(
 	 * No point in aligning allocations if we need to COW to actually
 	 * write to them.
 	 */
-	if (xfs_is_always_cow_inode(ip))
-		return 0;
-	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+	if (!xfs_is_always_cow_inode(ip) &&
+	    (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
 		return ip->i_extsize;
 	if (XFS_IS_REALTIME_INODE(ip))
 		return ip->i_mount->m_sb.sb_rextsize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/44] xfs: apply rt extent alignment constraints to CoW extsize hint
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 21:50   ` [PATCH 23/44] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
@ 2023-12-31 21:50   ` Darrick J. Wong
  2023-12-31 21:51   ` [PATCH 25/44] xfs: enable extent size hints for CoW operations Darrick J. Wong
                     ` (19 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:50 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The copy-on-write extent size hint is subject to the same alignment
constraints as the regular extent size hint.  Since we're in the process
of adding reflink (and therefore CoW) to the realtime device, we must
apply the same scattered rextsize alignment validation strategies to
both hints to deal with the possibility of rextsize changing.

Therefore, fix the inode validator to perform rextsize alignment checks
on regular realtime files, and to remove misaligned directory hints.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |   25 ++++++++++++++++++++-----
 fs/xfs/xfs_inode_item.c       |   14 ++++++++++++++
 fs/xfs/xfs_ioctl.c            |   17 +++++++++++++++--
 3 files changed, 49 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index ba37b864f6a8b..81a12ca8ec434 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -861,11 +861,29 @@ xfs_inode_validate_cowextsize(
 	bool				rt_flag;
 	bool				hint_flag;
 	uint32_t			cowextsize_bytes;
+	uint32_t			blocksize_bytes;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
 	cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
 
+	/*
+	 * Similar to extent size hints, a directory can be configured to
+	 * propagate realtime status and a CoW extent size hint to newly
+	 * created files even if there is no realtime device, and the hints on
+	 * disk can become misaligned if the sysadmin changes the rt extent
+	 * size while adding the realtime device.
+	 *
+	 * Therefore, we can only enforce the rextsize alignment check against
+	 * regular realtime files, and rely on callers to decide when alignment
+	 * checks are appropriate, and fix things up as needed.
+	 */
+
+	if (rt_flag)
+		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+	else
+		blocksize_bytes = mp->m_sb.sb_blocksize;
+
 	if (hint_flag && !xfs_has_reflink(mp))
 		return __this_address;
 
@@ -879,16 +897,13 @@ xfs_inode_validate_cowextsize(
 	if (mode && !hint_flag && cowextsize != 0)
 		return __this_address;
 
-	if (hint_flag && rt_flag)
-		return __this_address;
-
-	if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+	if (cowextsize_bytes % blocksize_bytes)
 		return __this_address;
 
 	if (cowextsize > XFS_MAX_BMBT_EXTLEN)
 		return __this_address;
 
-	if (cowextsize > mp->m_sb.sb_agblocks / 2)
+	if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2)
 		return __this_address;
 
 	return NULL;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fdc0b14bb9fbb..16d7d934da6e8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -127,6 +127,20 @@ xfs_inode_item_precommit(
 	if (flags & XFS_ILOG_IVERSION)
 		flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
 
+	/*
+	 * Inode verifiers do not check that the CoW extent size hint is an
+	 * integer multiple of the rt extent size on a directory with both
+	 * rtinherit and cowextsize flags set.  If we're logging a directory
+	 * that is misconfigured in this way, clear the hint.
+	 */
+	if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = 0;
+		flags |= XFS_ILOG_CORE;
+	}
+
 	if (!iip->ili_item.li_buf) {
 		struct xfs_buf	*bp;
 		int		error;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4559d122101cd..f85d5f142d180 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1058,8 +1058,21 @@ xfs_fill_fsxattr(
 		}
 	}
 
-	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
-		fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+		/*
+		 * Don't let a misaligned CoW extent size hint on a directory
+		 * escape to userspace if it won't pass the setattr checks
+		 * later.
+		 */
+		if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+		    ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) {
+			fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+			fa->fsx_cowextsize = 0;
+		} else {
+			fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+		}
+	}
+
 	fa->fsx_projid = ip->i_projid;
 	if (ifp && !xfs_need_iread_extents(ifp))
 		fa->fsx_nextents = xfs_iext_count(ifp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/44] xfs: enable extent size hints for CoW operations
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 21:50   ` [PATCH 24/44] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
@ 2023-12-31 21:51   ` Darrick J. Wong
  2023-12-31 21:51   ` [PATCH 26/44] xfs: check that the rtrefcount maxlevels doesn't increase when growing fs Darrick J. Wong
                     ` (18 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:51 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Wire up the copy-on-write extent size hint for realtime files, and
connect it to the rt allocator so that we avoid fragmentation on rt
filesystems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |    8 +++++++-
 fs/xfs/xfs_bmap_util.c   |    5 ++++-
 2 files changed, 11 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 316b574b34b8a..41354bdbbc90f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6407,7 +6407,13 @@ xfs_get_cowextsz_hint(
 	a = 0;
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
-	b = xfs_get_extsz_hint(ip);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		b = 0;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+			b = ip->i_extsize;
+	} else {
+		b = xfs_get_extsz_hint(ip);
+	}
 
 	a = max(a, b);
 	if (a == 0)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a7a99177bbf8b..60992f8e86adf 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -138,7 +138,10 @@ xfs_bmap_rtalloc(
 	bool			ignore_locality = false;
 	int			error;
 
-	align = xfs_get_extsz_hint(ap->ip);
+	if (ap->flags & XFS_BMAPI_COWFORK)
+		align = xfs_get_cowextsz_hint(ap->ip);
+	else
+		align = xfs_get_extsz_hint(ap->ip);
 retry:
 	prod = xfs_extlen_to_rtxlen(mp, align);
 	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/44] xfs: check that the rtrefcount maxlevels doesn't increase when growing fs
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 21:51   ` [PATCH 25/44] xfs: enable extent size hints for CoW operations Darrick J. Wong
@ 2023-12-31 21:51   ` Darrick J. Wong
  2023-12-31 21:51   ` [PATCH 27/44] xfs: add realtime refcount btree when adding rt volume Darrick J. Wong
                     ` (17 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:51 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The size of filesystem transaction reservations depends on the maximum
height (maxlevels) of the realtime btrees.  Since we don't want a grow
operation to increase the reservation size enough that we'll fail the
minimum log size checks on the next mount, constrain growfs operations
if they would cause an increase in the rt refcount btree maxlevels.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_fsops.c   |    2 ++
 fs/xfs/xfs_rtalloc.c |    2 ++
 2 files changed, 4 insertions(+)


diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index e78ee67b9dd12..9584c08480f75 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,6 +24,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtalloc.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 
 /*
  * Write new AG headers to disk. Non-transactional, but need to be
@@ -238,6 +239,7 @@ xfs_growfs_data_private(
 
 		/* Compute new maxlevels for rt btrees. */
 		xfs_rtrmapbt_compute_maxlevels(mp);
+		xfs_rtrefcountbt_compute_maxlevels(mp);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 14e17c2b39ef0..54859b32d37fc 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1128,6 +1128,7 @@ xfs_growfs_check_rtgeom(
 		fake_mp->m_features |= XFS_FEAT_REALTIME;
 
 	xfs_rtrmapbt_compute_maxlevels(fake_mp);
+	xfs_rtrefcountbt_compute_maxlevels(fake_mp);
 
 	xfs_trans_resv_calc(fake_mp, M_RES(fake_mp));
 	min_logfsbs = xfs_log_calc_minimum_size(fake_mp);
@@ -1451,6 +1452,7 @@ xfs_growfs_rt(
 		 */
 		mp->m_features |= XFS_FEAT_REALTIME;
 		xfs_rtrmapbt_compute_maxlevels(mp);
+		xfs_rtrefcountbt_compute_maxlevels(mp);
 	}
 	if (error)
 		goto out_free;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/44] xfs: add realtime refcount btree when adding rt volume
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 21:51   ` [PATCH 26/44] xfs: check that the rtrefcount maxlevels doesn't increase when growing fs Darrick J. Wong
@ 2023-12-31 21:51   ` Darrick J. Wong
  2023-12-31 21:51   ` [PATCH 28/44] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
                     ` (16 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:51 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If we're adding enough space to the realtime section to require the
creation of new realtime groups, create the rt refcount btree inode
before we start adding the space.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rtalloc.c |   65 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 54859b32d37fc..1dd76cb757534 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1090,6 +1090,59 @@ xfs_growfsrt_create_rtrmap(
 	return error;
 }
 
+/* Add a metadata inode for a realtime refcount btree. */
+static int
+xfs_growfsrt_create_rtrefcount(
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_mount	*mp = rtg->rtg_mount;
+	struct xfs_imeta_path	*path;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	if (!xfs_has_rtreflink(mp) || rtg->rtg_refcountip)
+		return 0;
+
+	error = xfs_rtrefcountbt_create_path(mp, rtg->rtg_rgno, &path);
+	if (error)
+		return error;
+
+	error = xfs_imeta_ensure_dirpath(mp, path);
+	if (error)
+		goto out_path;
+
+	error = xfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		goto out_path;
+
+	error = xfs_rtrefcountbt_create(&upd, &ip);
+	if (error)
+		goto out_cancel;
+
+	lockdep_set_class(&ip->i_lock.mr_lock, &xfs_rrefcountip_key);
+
+	error = xfs_imeta_commit_update(&upd);
+	if (error)
+		goto out_path;
+
+	xfs_imeta_free_path(path);
+	xfs_finish_inode_setup(ip);
+	rtg->rtg_refcountip = ip;
+	return 0;
+
+out_cancel:
+	xfs_imeta_cancel_update(&upd, error);
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (ip) {
+		xfs_finish_inode_setup(ip);
+		xfs_irele(ip);
+	}
+out_path:
+	xfs_imeta_free_path(path);
+	return error;
+}
+
 /*
  * Check that changes to the realtime geometry won't affect the minimum
  * log size, which would cause the fs to become unusable.
@@ -1196,9 +1249,11 @@ xfs_growfs_rt(
 		return -EINVAL;
 
 	/* Unsupported realtime features. */
-	if (!xfs_has_rtgroups(mp) && xfs_has_rmapbt(mp))
+	if (!xfs_has_rtgroups(mp) && (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)))
 		return -EOPNOTSUPP;
-	if (xfs_has_reflink(mp) || xfs_has_quota(mp))
+	if (xfs_has_quota(mp))
+		return -EOPNOTSUPP;
+	if (xfs_has_reflink(mp) && in->extsize != 1)
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
@@ -1349,6 +1404,12 @@ xfs_growfs_rt(
 					xfs_rtgroup_rele(rtg);
 					break;
 				}
+
+				error = xfs_growfsrt_create_rtrefcount(rtg);
+				if (error) {
+					xfs_rtgroup_rele(rtg);
+					break;
+				}
 			}
 		}
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/44] xfs: report realtime refcount btree corruption errors to the health system
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 21:51   ` [PATCH 27/44] xfs: add realtime refcount btree when adding rt volume Darrick J. Wong
@ 2023-12-31 21:51   ` Darrick J. Wong
  2023-12-31 21:52   ` [PATCH 29/44] xfs: scrub the realtime refcount btree Darrick J. Wong
                     ` (15 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:51 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Whenever we encounter corrupt realtime refcount btree blocks, we should
report that to the health monitoring system for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs_staging.h       |    1 +
 fs/xfs/libxfs/xfs_health.h           |    4 +++-
 fs/xfs/libxfs/xfs_inode_fork.c       |    4 +++-
 fs/xfs/libxfs/xfs_rtrefcount_btree.c |    5 ++++-
 fs/xfs/xfs_health.c                  |    4 ++++
 fs/xfs/xfs_rtalloc.c                 |    2 ++
 6 files changed, 17 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs_staging.h b/fs/xfs/libxfs/xfs_fs_staging.h
index 9d5d6af62b616..9f0c03103f05b 100644
--- a/fs/xfs/libxfs/xfs_fs_staging.h
+++ b/fs/xfs/libxfs/xfs_fs_staging.h
@@ -217,6 +217,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
 #define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1 << 2)  /* reverse mappings */
+#define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1 << 3)  /* reference counts */
 
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
 
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index aeeb62769773f..4fe4daca4c4f4 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -69,6 +69,7 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
 #define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
 #define XFS_SICK_RT_RMAPBT	(1 << 3)  /* reverse mappings */
+#define XFS_SICK_RT_REFCNTBT	(1 << 4)  /* reference counts */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -115,7 +116,8 @@ struct xfs_rtgroup;
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY | \
 				 XFS_SICK_RT_SUPER | \
-				 XFS_SICK_RT_RMAPBT)
+				 XFS_SICK_RT_RMAPBT | \
+				 XFS_SICK_RT_REFCNTBT)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index df42ffa15d96e..9ff913d0fa140 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -273,8 +273,10 @@ xfs_iformat_data_fork(
 			}
 			return xfs_iformat_rtrmap(ip, dip);
 		case XFS_DINODE_FMT_REFCOUNT:
-			if (!xfs_has_rtreflink(ip->i_mount))
+			if (!xfs_has_rtreflink(ip->i_mount)) {
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 				return -EFSCORRUPTED;
+			}
 			return xfs_iformat_rtrefcount(ip, dip);
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index fb0e4abcd6f6a..47ce0acd92a19 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -27,6 +27,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_imeta.h"
+#include "xfs_health.h"
 
 static struct kmem_cache	*xfs_rtrefcountbt_cur_cache;
 
@@ -694,8 +695,10 @@ xfs_iformat_rtrefcount(
 	level = be16_to_cpu(dfp->bb_level);
 
 	if (level > mp->m_rtrefc_maxlevels ||
-	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize)
+	    xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
+	}
 
 	xfs_iroot_alloc(ip, XFS_DATA_FORK,
 			xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index ed0767a6fa15a..6f40e2b728e27 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -533,6 +533,7 @@ static const struct ioctl_sick_map rtgroup_map[] = {
 	{ XFS_SICK_RT_SUPER,	XFS_RTGROUP_GEOM_SICK_SUPER },
 	{ XFS_SICK_RT_BITMAP,	XFS_RTGROUP_GEOM_SICK_BITMAP },
 	{ XFS_SICK_RT_RMAPBT,	XFS_RTGROUP_GEOM_SICK_RMAPBT },
+	{ XFS_SICK_RT_REFCNTBT,	XFS_RTGROUP_GEOM_SICK_REFCNTBT },
 	{ 0, 0 },
 };
 
@@ -640,6 +641,9 @@ xfs_btree_mark_sick(
 	case XFS_BTNUM_RTRMAP:
 		xfs_rtgroup_mark_sick(cur->bc_ino.rtg, XFS_SICK_RT_RMAPBT);
 		return;
+	case XFS_BTNUM_RTREFC:
+		xfs_rtgroup_mark_sick(cur->bc_ino.rtg, XFS_SICK_RT_REFCNTBT);
+		return;
 	case XFS_BTNUM_BNO:
 		mask = XFS_SICK_AG_BNOBT;
 		break;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 1dd76cb757534..11b6645a5a534 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1931,6 +1931,7 @@ xfs_rtmount_refcountbt(
 		goto out_path;
 
 	if (ino == NULLFSINO) {
+		xfs_rtgroup_mark_sick(rtg, XFS_SICK_RT_REFCNTBT);
 		error = -EFSCORRUPTED;
 		goto out_path;
 	}
@@ -1940,6 +1941,7 @@ xfs_rtmount_refcountbt(
 		goto out_path;
 
 	if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_REFCOUNT)) {
+		xfs_rtgroup_mark_sick(rtg, XFS_SICK_RT_REFCNTBT);
 		error = -EFSCORRUPTED;
 		goto out_rele;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/44] xfs: scrub the realtime refcount btree
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-31 21:51   ` [PATCH 28/44] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
@ 2023-12-31 21:52   ` Darrick J. Wong
  2023-12-31 21:52   ` [PATCH 30/44] xfs: cross-reference checks with the rt " Darrick J. Wong
                     ` (14 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:52 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add code to scrub realtime refcount btrees.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile            |    1 
 fs/xfs/libxfs/xfs_fs.h     |    3 
 fs/xfs/scrub/bmap.c        |    1 
 fs/xfs/scrub/bmap_repair.c |    1 
 fs/xfs/scrub/common.c      |   40 +++-
 fs/xfs/scrub/common.h      |    5 
 fs/xfs/scrub/health.c      |    1 
 fs/xfs/scrub/inode.c       |    1 
 fs/xfs/scrub/rtrefcount.c  |  497 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c       |    7 +
 fs/xfs/scrub/scrub.h       |    3 
 fs/xfs/scrub/stats.c       |    1 
 fs/xfs/scrub/trace.h       |    4 
 13 files changed, 551 insertions(+), 14 deletions(-)
 create mode 100644 fs/xfs/scrub/rtrefcount.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 783fc053d7be9..4d4f340d904fc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -194,6 +194,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 				   rgsuper.o \
 				   rtbitmap.o \
+				   rtrefcount.o \
 				   rtrmap.o \
 				   rtsummary.o \
 				   )
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 0bbdbfb0a8ae7..7847da61db232 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -738,9 +738,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 #define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
 #define XFS_SCRUB_TYPE_RTRMAPBT	32	/* rtgroup reverse mapping btree */
+#define XFS_SCRUB_TYPE_RTREFCBT	33	/* realtime reference count btree */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	33
+#define XFS_SCRUB_TYPE_NR	34
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 696ac7208c4d5..5531ba2295b12 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1038,6 +1038,7 @@ xchk_bmap(
 	case XFS_DINODE_FMT_DEV:
 	case XFS_DINODE_FMT_LOCAL:
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		/* No mappings to check. */
 		if (whichfork == XFS_COW_FORK)
 			xchk_fblock_set_corrupt(sc, whichfork, 0);
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 25c52caae58a4..26dc04923944b 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -852,6 +852,7 @@ xrep_bmap_check_inputs(
 	case XFS_DINODE_FMT_LOCAL:
 	case XFS_DINODE_FMT_UUID:
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		return -ECANCELED;
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 558e267924399..a9801a5bb0383 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -37,6 +37,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_bmap_util.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -885,6 +886,10 @@ xchk_rtgroup_init(
 		sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp,
 				sr->rtg, sr->rtg->rtg_rmapip);
 
+	if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT))
+		sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->mp, sc->tp,
+				sr->rtg, sr->rtg->rtg_refcountip);
+
 	return 0;
 }
 
@@ -899,7 +904,10 @@ xchk_rtgroup_btcur_free(
 {
 	if (sr->rmap_cur)
 		xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR);
+	if (sr->refc_cur)
+		xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR);
 
+	sr->refc_cur = NULL;
 	sr->rmap_cur = NULL;
 }
 
@@ -1762,16 +1770,26 @@ xchk_inode_count_blocks(
 		}
 		cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp, sc->sr.rtg,
 				sc->ip);
-		error = xfs_btree_count_blocks(cur, &btblocks);
-		xfs_btree_del_cursor(cur, error);
-		if (error)
-			return error;
-
-		*nextents = 0;
-		*count = btblocks - 1;
-		return 0;
-	default:
-		return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork,
-				nextents, count);
+		goto meta_btree;
+	case XFS_DINODE_FMT_REFCOUNT:
+		if (!sc->sr.rtg) {
+			ASSERT(0);
+			return -EFSCORRUPTED;
+		}
+		cur = xfs_rtrefcountbt_init_cursor(sc->mp, sc->tp, sc->sr.rtg,
+				sc->ip);
+		goto meta_btree;
 	}
+
+	return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents,
+			count);
+meta_btree:
+	error = xfs_btree_count_blocks(cur, &btblocks);
+	xfs_btree_del_cursor(cur, error);
+	if (error)
+		return error;
+
+	*nextents = 0;
+	*count = btblocks - 1;
+	return 0;
 }
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 5dc481f69d160..ed22e1403d0f0 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -85,12 +85,14 @@ int xchk_setup_rtsummary(struct xfs_scrub *sc);
 int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
 int xchk_setup_rgbitmap(struct xfs_scrub *sc);
 int xchk_setup_rtrmapbt(struct xfs_scrub *sc);
+int xchk_setup_rtrefcountbt(struct xfs_scrub *sc);
 #else
 # define xchk_setup_rtbitmap		xchk_setup_nothing
 # define xchk_setup_rtsummary		xchk_setup_nothing
 # define xchk_setup_rgsuperblock	xchk_setup_nothing
 # define xchk_setup_rgbitmap		xchk_setup_nothing
 # define xchk_setup_rtrmapbt		xchk_setup_nothing
+# define xchk_setup_rtrefcountbt	xchk_setup_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -152,7 +154,8 @@ void xchk_rt_unlock_rtbitmap(struct xfs_scrub *sc);
 
 /* All the locks we need to check an rtgroup. */
 #define XCHK_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP_SHARED | \
-				 XFS_RTGLOCK_RMAP)
+				 XFS_RTGLOCK_RMAP | \
+				 XFS_RTGLOCK_REFCOUNT)
 
 int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
 		struct xchk_rt *sr, unsigned int rtglock_flags);
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 0fec833057697..10af83eb2dbd7 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -116,6 +116,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_METAPATH]	= { XHG_FS,  XFS_SICK_FS_METAPATH },
 	[XFS_SCRUB_TYPE_RGSUPER]	= { XHG_RTGROUP, XFS_SICK_RT_SUPER },
 	[XFS_SCRUB_TYPE_RTRMAPBT]	= { XHG_RTGROUP, XFS_SICK_RT_RMAPBT },
+	[XFS_SCRUB_TYPE_RTREFCBT]	= { XHG_RTGROUP, XFS_SICK_RT_REFCNTBT },
 };
 
 /* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 705865ec6c1c0..f746032a9db88 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -498,6 +498,7 @@ xchk_dinode(
 			xchk_ino_set_corrupt(sc, ino);
 		break;
 	case XFS_DINODE_FMT_RMAP:
+	case XFS_DINODE_FMT_REFCOUNT:
 		if (!S_ISREG(mode))
 			xchk_ino_set_corrupt(sc, ino);
 		break;
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
new file mode 100644
index 0000000000000..cc44438eece76
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_imeta.h"
+#include "xfs_rtrefcount_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+
+/* Set us up with the realtime refcount metadata locked. */
+int
+xchk_setup_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg;
+	int			error;
+
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	rtg = xfs_rtgroup_get(mp, sc->sm->sm_agno);
+	if (!rtg)
+		return -ENOENT;
+
+	error = xchk_setup_rt(sc);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_install_live_inode(sc, rtg->rtg_refcountip);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_ino_dqattach(sc);
+	if (error)
+		goto out_rtg;
+
+	error = xchk_rtgroup_init(sc, rtg->rtg_rgno, &sc->sr, XCHK_RTGLOCK_ALL);
+out_rtg:
+	xfs_rtgroup_put(rtg);
+	return error;
+}
+
+/* Realtime Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely.  The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ *    a. If a given rmap completely overlaps, mark it as seen.
+ *    b. Otherwise, record the fragment (in agbno order) for later
+ *       processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks.  Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr.  Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ *    should start at or before the start of the extent.
+ *    Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ *    a. Find the shortest fragments in the set and remove them.
+ *    b. Note the block number of the end of these fragments.
+ *    c. Pull the same number of fragments from the list.  All of these
+ *       fragments should start at the block number recorded in the
+ *       previous step.
+ *    d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ *    and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true.  If not, the refcount is incorrect.
+ */
+struct xchk_rtrefcnt_frag {
+	struct list_head	list;
+	struct xfs_rmap_irec	rm;
+};
+
+struct xchk_rtrefcnt_check {
+	struct xfs_scrub	*sc;
+	struct list_head	fragments;
+
+	/* refcount extent we're examining */
+	xfs_rgblock_t		bno;
+	xfs_extlen_t		len;
+	xfs_nlink_t		refcount;
+
+	/* number of owners seen */
+	xfs_nlink_t		seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the rtrefcountbt says we should have.
+ */
+STATIC int
+xchk_rtrefcountbt_rmap_check(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xchk_rtrefcnt_check	*refchk = priv;
+	struct xchk_rtrefcnt_frag	*frag;
+	xfs_rgblock_t			rm_last;
+	xfs_rgblock_t			rc_last;
+	int				error = 0;
+
+	if (xchk_should_terminate(refchk->sc, &error))
+		return error;
+
+	rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+	rc_last = refchk->bno + refchk->len - 1;
+
+	/* Confirm that a single-owner refc extent is a CoW stage. */
+	if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+		xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+		return 0;
+	}
+
+	if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+		/*
+		 * The rmap overlaps the refcount record, so we can confirm
+		 * one refcount owner seen.
+		 */
+		refchk->seen++;
+	} else {
+		/*
+		 * This rmap covers only part of the refcount record, so
+		 * save the fragment for later processing.  If the rmapbt
+		 * is healthy each rmap_irec we see will be in agbno order
+		 * so we don't need insertion sort here.
+		 */
+		frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag),
+				XCHK_GFP_FLAGS);
+		if (!frag)
+			return -ENOMEM;
+		memcpy(&frag->rm, rec, sizeof(frag->rm));
+		list_add_tail(&frag->list, &refchk->fragments);
+	}
+
+	return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount.  If this ever deviates from
+ * what we expect (which is the rtrefcountbt's refcount minus the
+ * number of extents that totally covered the rtrefcountbt extent),
+ * we have a rtrefcountbt error.
+ */
+STATIC void
+xchk_rtrefcountbt_process_rmap_fragments(
+	struct xchk_rtrefcnt_check	*refchk)
+{
+	struct list_head		worklist;
+	struct xchk_rtrefcnt_frag	*frag;
+	struct xchk_rtrefcnt_frag	*n;
+	xfs_rgblock_t			bno;
+	xfs_rgblock_t			rbno;
+	xfs_rgblock_t			next_rbno;
+	xfs_nlink_t			nr;
+	xfs_nlink_t			target_nr;
+
+	target_nr = refchk->refcount - refchk->seen;
+	if (target_nr == 0)
+		return;
+
+	/*
+	 * There are (refchk->rc.rc_refcount - refchk->nr refcount)
+	 * references we haven't found yet.  Pull that many off the
+	 * fragment list and figure out where the smallest rmap ends
+	 * (and therefore the next rmap should start).  All the rmaps
+	 * we pull off should start at or before the beginning of the
+	 * refcount record's range.
+	 */
+	INIT_LIST_HEAD(&worklist);
+	rbno = NULLRGBLOCK;
+
+	/* Make sure the fragments actually /are/ in bno order. */
+	bno = 0;
+	list_for_each_entry(frag, &refchk->fragments, list) {
+		if (frag->rm.rm_startblock < bno)
+			goto done;
+		bno = frag->rm.rm_startblock;
+	}
+
+	/*
+	 * Find all the rmaps that start at or before the refc extent,
+	 * and put them on the worklist.
+	 */
+	nr = 0;
+	list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+		if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+			break;
+		bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+		if (bno < rbno)
+			rbno = bno;
+		list_move_tail(&frag->list, &worklist);
+		nr++;
+	}
+
+	/*
+	 * We should have found exactly $target_nr rmap fragments starting
+	 * at or before the refcount extent.
+	 */
+	if (nr != target_nr)
+		goto done;
+
+	while (!list_empty(&refchk->fragments)) {
+		/* Discard any fragments ending at rbno from the worklist. */
+		nr = 0;
+		next_rbno = NULLRGBLOCK;
+		list_for_each_entry_safe(frag, n, &worklist, list) {
+			bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+			if (bno != rbno) {
+				if (bno < next_rbno)
+					next_rbno = bno;
+				continue;
+			}
+			list_del(&frag->list);
+			kfree(frag);
+			nr++;
+		}
+
+		/* Try to add nr rmaps starting at rbno to the worklist. */
+		list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+			bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+			if (frag->rm.rm_startblock != rbno)
+				goto done;
+			list_move_tail(&frag->list, &worklist);
+			if (next_rbno > bno)
+				next_rbno = bno;
+			nr--;
+			if (nr == 0)
+				break;
+		}
+
+		/*
+		 * If we get here and nr > 0, this means that we added fewer
+		 * items to the worklist than we discarded because the fragment
+		 * list ran out of items.  Therefore, we cannot maintain the
+		 * required refcount.  Something is wrong, so we're done.
+		 */
+		if (nr)
+			goto done;
+
+		rbno = next_rbno;
+	}
+
+	/*
+	 * Make sure the last extent we processed ends at or beyond
+	 * the end of the refcount extent.
+	 */
+	if (rbno < refchk->bno + refchk->len)
+		goto done;
+
+	/* Actually record us having seen the remaining refcount. */
+	refchk->seen = refchk->refcount;
+done:
+	/* Delete fragments and work list. */
+	list_for_each_entry_safe(frag, n, &worklist, list) {
+		list_del(&frag->list);
+		kfree(frag);
+	}
+	list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+		list_del(&frag->list);
+		kfree(frag);
+	}
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xchk_rtrefcountbt_xref_rmap(
+	struct xfs_scrub		*sc,
+	const struct xfs_refcount_irec	*irec)
+{
+	struct xchk_rtrefcnt_check	refchk = {
+		.sc			= sc,
+		.bno			= irec->rc_startblock,
+		.len			= irec->rc_blockcount,
+		.refcount		= irec->rc_refcount,
+		.seen			= 0,
+	};
+	struct xfs_rmap_irec		low;
+	struct xfs_rmap_irec		high;
+	struct xchk_rtrefcnt_frag	*frag;
+	struct xchk_rtrefcnt_frag	*n;
+	int				error;
+
+	if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	/* Cross-reference with the rmapbt to confirm the refcount. */
+	memset(&low, 0, sizeof(low));
+	low.rm_startblock = irec->rc_startblock;
+	memset(&high, 0xFF, sizeof(high));
+	high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
+
+	INIT_LIST_HEAD(&refchk.fragments);
+	error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+			xchk_rtrefcountbt_rmap_check, &refchk);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+		goto out_free;
+
+	xchk_rtrefcountbt_process_rmap_fragments(&refchk);
+	if (irec->rc_refcount != refchk.seen)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+
+out_free:
+	list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+		list_del(&frag->list);
+		kfree(frag);
+	}
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_rtrefcountbt_xref(
+	struct xfs_scrub		*sc,
+	const struct xfs_refcount_irec	*irec)
+{
+	xfs_rtblock_t			rtbno;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno,
+			irec->rc_startblock);
+	xchk_xref_is_used_rt_space(sc, rtbno, irec->rc_blockcount);
+	xchk_rtrefcountbt_xref_rmap(sc, irec);
+}
+
+struct xchk_rtrefcbt_records {
+	/* Previous refcount record. */
+	struct xfs_refcount_irec	prev_rec;
+
+	/* Number of CoW blocks we expect. */
+	xfs_extlen_t			cow_blocks;
+};
+
+static inline bool
+xchk_rtrefcount_mergeable(
+	struct xchk_rtrefcbt_records	*rrc,
+	const struct xfs_refcount_irec	*r2)
+{
+	const struct xfs_refcount_irec	*r1 = &rrc->prev_rec;
+
+	/* Ignore if prev_rec is not yet initialized. */
+	if (r1->rc_blockcount > 0)
+		return false;
+
+	if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+		return false;
+	if (r1->rc_refcount != r2->rc_refcount)
+		return false;
+	if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+			XFS_REFC_LEN_MAX)
+		return false;
+
+	return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrefcountbt_check_mergeable(
+	struct xchk_btree		*bs,
+	struct xchk_rtrefcbt_records	*rrc,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	if (xchk_rtrefcount_mergeable(rrc, irec))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
+/* Scrub a rtrefcountbt record. */
+STATIC int
+xchk_rtrefcountbt_rec(
+	struct xchk_btree		*bs,
+	const union xfs_btree_rec	*rec)
+{
+	struct xfs_mount		*mp = bs->cur->bc_mp;
+	struct xchk_rtrefcbt_records	*rrc = bs->private;
+	struct xfs_refcount_irec	irec;
+	u32				mod;
+
+	xfs_refcount_btrec_to_irec(rec, &irec);
+	if (xfs_rtrefcount_check_irec(bs->cur->bc_ino.rtg, &irec) != NULL) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	/* We can only share full rt extents. */
+	mod = xfs_rtb_to_rtxoff(mp, irec.rc_startblock);
+	if (mod)
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	mod = xfs_rtb_to_rtxoff(mp, irec.rc_blockcount);
+	if (mod)
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+		rrc->cow_blocks += irec.rc_blockcount;
+
+	xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
+	xchk_rtrefcountbt_xref(bs->sc, &irec);
+
+	return 0;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xchk_refcount_xref_rmap(
+	struct xfs_scrub	*sc,
+	const struct xfs_owner_info *btree_oinfo,
+	xfs_extlen_t		cow_blocks)
+{
+	xfs_extlen_t		refcbt_blocks = 0;
+	xfs_filblks_t		blocks;
+	int			error;
+
+	if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	/* Check that we saw as many refcbt blocks as the rmap knows about. */
+	error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks);
+	if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error))
+		return;
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo,
+			&blocks);
+	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+		return;
+	if (blocks != refcbt_blocks)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+	/* Check that we saw as many cow blocks as the rmap knows about. */
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur,
+			&XFS_RMAP_OINFO_COW, &blocks);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+		return;
+	if (blocks != cow_blocks)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xchk_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_owner_info	btree_oinfo;
+	struct xchk_rtrefcbt_records rrc = {
+		.cow_blocks	= 0,
+	};
+	int			error;
+
+	error = xchk_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	xfs_rmap_ino_bmbt_owner(&btree_oinfo, sc->sr.rtg->rtg_refcountip->i_ino,
+			XFS_DATA_FORK);
+	error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec,
+			&btree_oinfo, &rrc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 8193ad6702b4d..2611e0223489c 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -481,6 +481,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.has	= xfs_has_rtrmapbt,
 		.repair	= xrep_rtrmapbt,
 	},
+	[XFS_SCRUB_TYPE_RTREFCBT] = {	/* realtime refcountbt */
+		.type	= ST_RTGROUP,
+		.setup	= xchk_setup_rtrefcountbt,
+		.scrub	= xchk_rtrefcountbt,
+		.has	= xfs_has_rtreflink,
+		.repair	= xrep_notsupported,
+	},
 };
 
 static int
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 38731b99625d6..7f94f31a5236e 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -125,6 +125,7 @@ struct xchk_rt {
 
 	/* rtgroup btrees */
 	struct xfs_btree_cur	*rmap_cur;
+	struct xfs_btree_cur	*refc_cur;
 };
 
 struct xfs_scrub {
@@ -282,12 +283,14 @@ int xchk_rtsummary(struct xfs_scrub *sc);
 int xchk_rgsuperblock(struct xfs_scrub *sc);
 int xchk_rgbitmap(struct xfs_scrub *sc);
 int xchk_rtrmapbt(struct xfs_scrub *sc);
+int xchk_rtrefcountbt(struct xfs_scrub *sc);
 #else
 # define xchk_rtbitmap		xchk_nothing
 # define xchk_rtsummary		xchk_nothing
 # define xchk_rgsuperblock	xchk_nothing
 # define xchk_rgbitmap		xchk_nothing
 # define xchk_rtrmapbt		xchk_nothing
+# define xchk_rtrefcountbt	xchk_nothing
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_quota(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 0da7ecabfe9d9..0e0be23adfcb4 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -84,6 +84,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_RGSUPER]	= "rgsuper",
 	[XFS_SCRUB_TYPE_RGBITMAP]	= "rgbitmap",
 	[XFS_SCRUB_TYPE_RTRMAPBT]	= "rtrmapbt",
+	[XFS_SCRUB_TYPE_RTREFCBT]	= "rtrefcountbt",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 72b5277f4ba6d..2d373c9a53ad6 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -88,6 +88,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGBITMAP);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -123,7 +124,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
 	{ XFS_SCRUB_TYPE_METAPATH,	"metapath" }, \
 	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }, \
 	{ XFS_SCRUB_TYPE_RGBITMAP,	"rgbitmap" }, \
-	{ XFS_SCRUB_TYPE_RTRMAPBT,	"rtrmapbt" }
+	{ XFS_SCRUB_TYPE_RTRMAPBT,	"rtrmapbt" }, \
+	{ XFS_SCRUB_TYPE_RTREFCBT,	"rtrefcountbt" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/44] xfs: cross-reference checks with the rt refcount btree
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-31 21:52   ` [PATCH 29/44] xfs: scrub the realtime refcount btree Darrick J. Wong
@ 2023-12-31 21:52   ` Darrick J. Wong
  2023-12-31 21:52   ` [PATCH 31/44] xfs: allow overlapping rtrmapbt records for shared data extents Darrick J. Wong
                     ` (13 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:52 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the realtime refcount btree to implement cross-reference checks in
other data structures.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap.c       |   30 +++++++++++++---
 fs/xfs/scrub/rtbitmap.c   |    2 +
 fs/xfs/scrub/rtrefcount.c |   86 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtrmap.c     |   37 +++++++++++++++++++
 fs/xfs/scrub/scrub.h      |    9 +++++
 5 files changed, 158 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 5531ba2295b12..41b83515b4491 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -348,12 +348,30 @@ xchk_bmap_rt_iextent_xref(
 
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
-	xchk_bmap_xref_rmap(info, irec, rgbno);
-
-	xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork,
-			irec->br_startoff);
-	xchk_xref_is_only_rt_owned_by(info->sc, rgbno, irec->br_blockcount,
-			&oinfo);
+	switch (info->whichfork) {
+	case XFS_DATA_FORK:
+		xchk_bmap_xref_rmap(info, irec, rgbno);
+		if (!xfs_is_reflink_inode(info->sc->ip)) {
+			xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+					info->whichfork, irec->br_startoff);
+			xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+					irec->br_blockcount, &oinfo);
+			xchk_xref_is_not_rt_shared(info->sc, rgbno,
+					irec->br_blockcount);
+		}
+		xchk_xref_is_not_rt_cow_staging(info->sc, rgbno,
+				irec->br_blockcount);
+		break;
+	case XFS_COW_FORK:
+		xchk_bmap_xref_rmap_cow(info, irec, rgbno);
+		xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+				irec->br_blockcount, &XFS_RMAP_OINFO_COW);
+		xchk_xref_is_rt_cow_staging(info->sc, rgbno,
+				irec->br_blockcount);
+		xchk_xref_is_not_rt_shared(info->sc, rgbno,
+				irec->br_blockcount);
+		break;
+	}
 
 out_free:
 	xchk_rtgroup_btcur_free(&info->sc->sr);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 47463ef336eed..234c9a1f224d5 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -158,6 +158,8 @@ xchk_rgbitmap_xref(
 
 	rgbno = xfs_rtb_to_rgbno(sc->mp, startblock, &rgno);
 	xchk_xref_has_no_rt_owner(sc, rgbno, blockcount);
+	xchk_xref_is_not_rt_shared(sc, rgbno, blockcount);
+	xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount);
 
 	if (rgb->next_free_rtblock < startblock) {
 		xfs_rgblock_t	next_rgbno;
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
index cc44438eece76..360b9c6a4c04d 100644
--- a/fs/xfs/scrub/rtrefcount.c
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -495,3 +495,89 @@ xchk_rtrefcountbt(
 
 	return 0;
 }
+
+/* xref check that a cow staging extent is marked in the rtrefcountbt. */
+void
+xchk_xref_is_rt_cow_staging(
+	struct xfs_scrub		*sc,
+	xfs_rgblock_t			bno,
+	xfs_extlen_t			len)
+{
+	struct xfs_refcount_irec	rc;
+	int				has_refcount;
+	int				error;
+
+	if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	/* Find the CoW staging extent. */
+	error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+			bno, &has_refcount);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (!has_refcount) {
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+		return;
+	}
+
+	error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (!has_refcount) {
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+		return;
+	}
+
+	/* CoW lookup returned a shared extent record? */
+	if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+
+	/* Must be at least as long as what was passed in */
+	if (rc.rc_blockcount < len)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared.  Only file data blocks
+ * can have multiple owners.
+ */
+void
+xchk_xref_is_not_rt_shared(
+	struct xfs_scrub	*sc,
+	xfs_rgblock_t		bno,
+	xfs_extlen_t		len)
+{
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_refcount_has_records(sc->sr.refc_cur,
+			XFS_REFC_DOMAIN_SHARED, bno, len, &outcome);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_rt_cow_staging(
+	struct xfs_scrub	*sc,
+	xfs_rgblock_t		bno,
+	xfs_extlen_t		len)
+{
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+			bno, len, &outcome);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index ce21fa95a5da7..60751c21fe52e 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -22,6 +22,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
 #include "xfs_imeta.h"
+#include "xfs_refcount.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -158,6 +159,37 @@ xchk_rtrmapbt_check_mergeable(
 	memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
 }
 
+/* Cross-reference a rmap against the refcount btree. */
+STATIC void
+xchk_rtrmapbt_xref_rtrefc(
+	struct xfs_scrub	*sc,
+	struct xfs_rmap_irec	*irec)
+{
+	xfs_rgblock_t		fbno;
+	xfs_extlen_t		flen;
+	bool			is_inode;
+	bool			is_bmbt;
+	bool			is_attr;
+	bool			is_unwritten;
+	int			error;
+
+	if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+	/* If this is shared, must be a data fork extent. */
+	error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock,
+			irec->rm_blockcount, &fbno, &flen, false);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten))
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
 /* Cross-reference with other metadata. */
 STATIC void
 xchk_rtrmapbt_xref(
@@ -173,6 +205,11 @@ xchk_rtrmapbt_xref(
 			irec->rm_startblock);
 
 	xchk_xref_is_used_rt_space(sc, rtbno, irec->rm_blockcount);
+	if (irec->rm_owner == XFS_RMAP_OWN_COW)
+		xchk_xref_is_cow_staging(sc, irec->rm_startblock,
+				irec->rm_blockcount);
+	else
+		xchk_rtrmapbt_xref_rtrefc(sc, irec);
 }
 
 /* Scrub a realtime rmapbt record. */
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 7f94f31a5236e..93a4aba096d57 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -330,11 +330,20 @@ void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
 		xfs_extlen_t len);
 void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
 		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len);
+void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len);
+void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+		xfs_extlen_t len);
 #else
 # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
 # define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0)
 # define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0)
 # define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0)
+# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0)
 #endif
 
 #endif	/* __XFS_SCRUB_SCRUB_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/44] xfs: allow overlapping rtrmapbt records for shared data extents
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-31 21:52   ` [PATCH 30/44] xfs: cross-reference checks with the rt " Darrick J. Wong
@ 2023-12-31 21:52   ` Darrick J. Wong
  2023-12-31 21:52   ` [PATCH 32/44] xfs: check reference counts of gaps between rt refcount records Darrick J. Wong
                     ` (12 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:52 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow overlapping realtime reverse mapping records if they both describe
shared data extents and the fs supports reflink on the realtime volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/rtrmap.c |   17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index 60751c21fe52e..773be13c3be74 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -87,6 +87,18 @@ struct xchk_rtrmap {
 	struct xfs_rmap_irec	prev_rec;
 };
 
+static inline bool
+xchk_rtrmapbt_is_shareable(
+	struct xfs_scrub		*sc,
+	const struct xfs_rmap_irec	*irec)
+{
+	if (!xfs_has_rtreflink(sc->mp))
+		return false;
+	if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+		return false;
+	return true;
+}
+
 /* Flag failures for records that overlap but cannot. */
 STATIC void
 xchk_rtrmapbt_check_overlapping(
@@ -108,7 +120,10 @@ xchk_rtrmapbt_check_overlapping(
 	if (pnext <= irec->rm_startblock)
 		goto set_prev;
 
-	xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	/* Overlap is only allowed if both records are data fork mappings. */
+	if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+	    !xchk_rtrmapbt_is_shareable(bs->sc, irec))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
 	/* Save whichever rmap record extends furthest. */
 	inext = irec->rm_startblock + irec->rm_blockcount;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/44] xfs: check reference counts of gaps between rt refcount records
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-31 21:52   ` [PATCH 31/44] xfs: allow overlapping rtrmapbt records for shared data extents Darrick J. Wong
@ 2023-12-31 21:52   ` Darrick J. Wong
  2023-12-31 21:53   ` [PATCH 33/44] xfs: allow dquot rt block count to exceed rt blocks on reflink fs Darrick J. Wong
                     ` (11 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:52 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If there's a gap between records in the rt refcount btree, we ought to
cross-reference the gap with the rtrmap records to make sure that there
aren't any overlapping records for a region that doesn't have any shared
ownership.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/rtrefcount.c |   81 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
index 360b9c6a4c04d..058a0ea18e09b 100644
--- a/fs/xfs/scrub/rtrefcount.c
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -17,6 +17,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_imeta.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -359,8 +360,14 @@ struct xchk_rtrefcbt_records {
 	/* Previous refcount record. */
 	struct xfs_refcount_irec	prev_rec;
 
+	/* The next rtgroup block where we aren't expecting shared extents. */
+	xfs_rgblock_t			next_unshared_rgbno;
+
 	/* Number of CoW blocks we expect. */
 	xfs_extlen_t			cow_blocks;
+
+	/* Was the last record a shared or CoW staging extent? */
+	enum xfs_refc_domain		prev_domain;
 };
 
 static inline bool
@@ -401,6 +408,53 @@ xchk_rtrefcountbt_check_mergeable(
 	memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
 }
 
+STATIC int
+xchk_rtrefcountbt_rmap_check_gap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	xfs_rgblock_t			*next_bno = priv;
+
+	if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno)
+		return -ECANCELED;
+
+	*next_bno = rec->rm_startblock + rec->rm_blockcount;
+	return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_rtrefcountbt_xref_gaps(
+	struct xfs_scrub	*sc,
+	struct xchk_rtrefcbt_records *rrc,
+	xfs_rtblock_t		bno)
+{
+	struct xfs_rmap_irec	low;
+	struct xfs_rmap_irec	high;
+	xfs_rgblock_t		next_bno = NULLRGBLOCK;
+	int			error;
+
+	if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur ||
+            xchk_skip_xref(sc->sm))
+		return;
+
+	memset(&low, 0, sizeof(low));
+	low.rm_startblock = rrc->next_unshared_rgbno;
+	memset(&high, 0xFF, sizeof(high));
+	high.rm_startblock = bno - 1;
+
+	error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+			xchk_rtrefcountbt_rmap_check_gap, &next_bno);
+	if (error == -ECANCELED)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+	else
+		xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur);
+}
+
 /* Scrub a rtrefcountbt record. */
 STATIC int
 xchk_rtrefcountbt_rec(
@@ -429,9 +483,26 @@ xchk_rtrefcountbt_rec(
 	if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
 		rrc->cow_blocks += irec.rc_blockcount;
 
+	/* Shared records always come before CoW records. */
+	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+	    rrc->prev_domain == XFS_REFC_DOMAIN_COW)
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	rrc->prev_domain = irec.rc_domain;
+
 	xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
 	xchk_rtrefcountbt_xref(bs->sc, &irec);
 
+	/*
+	 * If this is a record for a shared extent, check that all blocks
+	 * between the previous record and this one have at most one reverse
+	 * mapping.
+	 */
+	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+		xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+		rrc->next_unshared_rgbno = irec.rc_startblock +
+					   irec.rc_blockcount;
+	}
+
 	return 0;
 }
 
@@ -476,7 +547,9 @@ xchk_rtrefcountbt(
 {
 	struct xfs_owner_info	btree_oinfo;
 	struct xchk_rtrefcbt_records rrc = {
-		.cow_blocks	= 0,
+		.cow_blocks		= 0,
+		.next_unshared_rgbno	= 0,
+		.prev_domain		= XFS_REFC_DOMAIN_SHARED,
 	};
 	int			error;
 
@@ -491,6 +564,12 @@ xchk_rtrefcountbt(
 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 		return error;
 
+	/*
+	 * Check that all blocks between the last refcount > 1 record and the
+	 * end of the rt volume have at most one reverse mapping.
+	 */
+	xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks);
+
 	xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks);
 
 	return 0;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/44] xfs: allow dquot rt block count to exceed rt blocks on reflink fs
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-31 21:52   ` [PATCH 32/44] xfs: check reference counts of gaps between rt refcount records Darrick J. Wong
@ 2023-12-31 21:53   ` Darrick J. Wong
  2023-12-31 21:53   ` [PATCH 34/44] xfs: detect and repair misaligned rtinherit directory cowextsize hints Darrick J. Wong
                     ` (10 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:53 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the quota scrubber to allow dquots where the realtime block count
exceeds the block count of the rt volume if reflink is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/quota.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 183d531875eae..58d6d4ed2853b 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -212,12 +212,18 @@ xchk_quota_item(
 		if (mp->m_sb.sb_dblocks < dq->q_blk.count)
 			xchk_fblock_set_warning(sc, XFS_DATA_FORK,
 					offset);
+		if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+			xchk_fblock_set_warning(sc, XFS_DATA_FORK,
+					offset);
 	} else {
 		if (mp->m_sb.sb_dblocks < dq->q_blk.count)
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
 					offset);
+		if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+					offset);
 	}
-	if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
+	if (dq->q_ino.count > fs_icount)
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 
 	/*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/44] xfs: detect and repair misaligned rtinherit directory cowextsize hints
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-31 21:53   ` [PATCH 33/44] xfs: allow dquot rt block count to exceed rt blocks on reflink fs Darrick J. Wong
@ 2023-12-31 21:53   ` Darrick J. Wong
  2023-12-31 21:53   ` [PATCH 35/44] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
                     ` (9 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:53 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If we encounter a directory that has been configured to pass on a CoW
extent size hint to a new realtime file and the hint isn't an integer
multiple of the rt extent size, we should flag the hint for
administrative review and/or turn it off because that is a
misconfiguration.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/inode.c        |   26 +++++++++++++++++---------
 fs/xfs/scrub/inode_repair.c |   15 +++++++++++++++
 2 files changed, 32 insertions(+), 9 deletions(-)


diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index f746032a9db88..e52e12e9a1b4b 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -259,12 +259,7 @@ xchk_inode_extsize(
 		xchk_ino_set_warning(sc, ino);
 }
 
-/*
- * Validate di_cowextsize hint.
- *
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
- */
+/* Validate di_cowextsize hint. */
 STATIC void
 xchk_inode_cowextsize(
 	struct xfs_scrub	*sc,
@@ -275,12 +270,25 @@ xchk_inode_cowextsize(
 	uint64_t		flags2)
 {
 	xfs_failaddr_t		fa;
+	uint32_t		value = be32_to_cpu(dip->di_cowextsize);
 
-	fa = xfs_inode_validate_cowextsize(sc->mp,
-			be32_to_cpu(dip->di_cowextsize), mode, flags,
-			flags2);
+	fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
 	if (fa)
 		xchk_ino_set_corrupt(sc, ino);
+
+	/*
+	 * XFS allows a sysadmin to change the rt extent size when adding a rt
+	 * section to a filesystem after formatting.  If there are any
+	 * directories with cowextsize and rtinherit set, the hint could become
+	 * misaligned with the new rextsize.  The verifier doesn't check this,
+	 * because we allow rtinherit directories even without an rt device.
+	 * Flag this as an administrative warning since we will clean this up
+	 * eventually.
+	 */
+	if ((flags & XFS_DIFLAG_RTINHERIT) &&
+	    (flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    value % sc->mp->m_sb.sb_rextsize > 0)
+		xchk_ino_set_warning(sc, ino);
 }
 
 /* Make sure the di_flags make sense for the inode. */
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 8d67ae257e597..b6b37652c4a35 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1836,6 +1836,20 @@ xrep_inode_pptr(
 			sizeof(struct xfs_attr_sf_hdr), true);
 }
 
+/* Fix COW extent size hint problems. */
+STATIC void
+xrep_inode_cowextsize(
+	struct xfs_scrub	*sc)
+{
+	/* Fix misaligned CoW extent size hints on a directory. */
+	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) {
+		sc->ip->i_cowextsize = 0;
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+	}
+}
+
 /* Fix any irregularities in an inode that the verifiers don't catch. */
 STATIC int
 xrep_inode_problems(
@@ -1859,6 +1873,7 @@ xrep_inode_problems(
 	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
 		xrep_inode_dir_size(sc);
 	xrep_inode_extsize(sc);
+	xrep_inode_cowextsize(sc);
 
 	trace_xrep_inode_fixed(sc);
 	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/44] xfs: scrub the metadir path of rt refcount btree files
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-31 21:53   ` [PATCH 34/44] xfs: detect and repair misaligned rtinherit directory cowextsize hints Darrick J. Wong
@ 2023-12-31 21:53   ` Darrick J. Wong
  2023-12-31 21:53   ` [PATCH 36/44] xfs: don't flag quota rt block usage on rtreflink filesystems Darrick J. Wong
                     ` (8 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:53 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new XFS_SCRUB_METAPATH subtype so that we can scrub the metadata
directory tree path to the refcount btree file for each rt group.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h  |    3 ++-
 fs/xfs/scrub/metapath.c |   14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 7847da61db232..4159e96d01ae6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -806,9 +806,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_METAPATH_GRPQUOTA	3
 #define XFS_SCRUB_METAPATH_PRJQUOTA	4
 #define XFS_SCRUB_METAPATH_RTRMAPBT	5
+#define XFS_SCRUB_METAPATH_RTREFCBT	6
 
 /* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR		6
+#define XFS_SCRUB_METAPATH_NR		7
 
 /*
  * ioctl limits
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 6afd117c890e9..447bdbea210d2 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -23,6 +23,7 @@
 #include "xfs_attr.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -106,6 +107,7 @@ xchk_setup_metapath(
 
 	switch (sc->sm->sm_ino) {
 	case XFS_SCRUB_METAPATH_RTRMAPBT:
+	case XFS_SCRUB_METAPATH_RTREFCBT:
 		/* empty */
 		break;
 	default:
@@ -157,6 +159,18 @@ xchk_setup_metapath(
 			xfs_rtgroup_put(rtg);
 		}
 		break;
+	case XFS_SCRUB_METAPATH_RTREFCBT:
+		error = xfs_rtrefcountbt_create_path(mp, sc->sm->sm_agno,
+				&path);
+		if (error)
+			return error;
+		mpath->path = path;
+		rtg = xfs_rtgroup_get(mp, sc->sm->sm_agno);
+		if (rtg) {
+			ip = rtg->rtg_refcountip;
+			xfs_rtgroup_put(rtg);
+		}
+		break;
 	default:
 		return -EINVAL;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/44] xfs: don't flag quota rt block usage on rtreflink filesystems
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-31 21:53   ` [PATCH 35/44] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
@ 2023-12-31 21:53   ` Darrick J. Wong
  2023-12-31 21:54   ` [PATCH 37/44] xfs: check new rtbitmap records against rt refcount btree Darrick J. Wong
                     ` (7 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:53 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Quota space usage is allowed to exceed the size of the physical storage
when reflink is enabled.  Now that we have reflink for the realtime
volume, apply this same logic to the rtb repair logic.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/quota_repair.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 0bab4c30cb85a..4e7198073a64a 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -236,7 +236,7 @@ xrep_quota_item(
 		rqi->need_quotacheck = true;
 		dirty = true;
 	}
-	if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+	if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) {
 		dq->q_rtb.reserved -= dq->q_rtb.count;
 		dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
 		dq->q_rtb.count = mp->m_sb.sb_rblocks;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/44] xfs: check new rtbitmap records against rt refcount btree
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-31 21:53   ` [PATCH 36/44] xfs: don't flag quota rt block usage on rtreflink filesystems Darrick J. Wong
@ 2023-12-31 21:54   ` Darrick J. Wong
  2023-12-31 21:54   ` [PATCH 38/44] xfs: walk the rt reference count tree when rebuilding rmap Darrick J. Wong
                     ` (6 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:54 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're rebuilding the realtime bitmap, check the proposed free
extents against the rt refcount btree to make sure we don't commit any
grievous errors.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/repair.c          |    7 +++++++
 fs/xfs/scrub/rtbitmap_repair.c |   24 +++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c9c538083c722..fdd12933fd47f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -41,6 +41,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_rtalloc.h"
 #include "xfs_imeta.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -1008,6 +1009,12 @@ xrep_rtgroup_btcur_init(
 	    xfs_has_rtrmapbt(mp))
 		sr->rmap_cur = xfs_rtrmapbt_init_cursor(mp, sc->tp, sr->rtg,
 				sr->rtg->rtg_rmapip);
+
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT &&
+	    (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) &&
+	    xfs_has_rtreflink(mp))
+		sr->refc_cur = xfs_rtrefcountbt_init_cursor(mp, sc->tp,
+				sr->rtg, sr->rtg->rtg_refcountip);
 }
 
 /*
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index db87ce51c35fc..24d7d0ff49ea3 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -22,6 +22,7 @@
 #include "xfs_swapext.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_refcount.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -418,7 +419,8 @@ xrep_rgbitmap_mark_free(
 	xfs_rgblock_t		rgbno)
 {
 	struct xfs_mount	*mp = rgb->sc->mp;
-	struct xfs_rtgroup	*rtg = rgb->sc->sr.rtg;
+	struct xchk_rt		*sr =  &rgb->sc->sr;
+	struct xfs_rtgroup	*rtg = sr->rtg;
 	xfs_rtblock_t		rtbno;
 	xfs_rtxnum_t		startrtx;
 	xfs_rtxnum_t		nextrtx;
@@ -427,6 +429,7 @@ xrep_rgbitmap_mark_free(
 	unsigned int		bufwsize;
 	xfs_extlen_t		mod;
 	xfs_rtword_t		mask;
+	enum xbtree_recpacking	outcome;
 	int			error;
 
 	if (!xfs_verify_rgbext(rtg, rgb->next_rgbno, rgbno - rgb->next_rgbno))
@@ -446,6 +449,25 @@ xrep_rgbitmap_mark_free(
 	if (mod != mp->m_sb.sb_rextsize - 1)
 		return -EFSCORRUPTED;
 
+	/* Must not be shared or CoW staging. */
+	if (sr->refc_cur) {
+		error = xfs_refcount_has_records(sr->refc_cur,
+				XFS_REFC_DOMAIN_SHARED, rgb->next_rgbno,
+				rgbno - rgb->next_rgbno, &outcome);
+		if (error)
+			return error;
+		if (outcome != XBTREE_RECPACKING_EMPTY)
+			return -EFSCORRUPTED;
+
+		error = xfs_refcount_has_records(sr->refc_cur,
+				XFS_REFC_DOMAIN_COW, rgb->next_rgbno,
+				rgbno - rgb->next_rgbno, &outcome);
+		if (error)
+			return error;
+		if (outcome != XBTREE_RECPACKING_EMPTY)
+			return -EFSCORRUPTED;
+	}
+
 	trace_xrep_rgbitmap_record_free(mp, startrtx, nextrtx - 1);
 
 	/* Set bits as needed to round startrtx up to the nearest word. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/44] xfs: walk the rt reference count tree when rebuilding rmap
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-31 21:54   ` [PATCH 37/44] xfs: check new rtbitmap records against rt refcount btree Darrick J. Wong
@ 2023-12-31 21:54   ` Darrick J. Wong
  2023-12-31 21:54   ` [PATCH 39/44] xfs: capture realtime CoW staging extents when rebuilding rt rmapbt Darrick J. Wong
                     ` (5 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:54 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're rebuilding the data device rmap, if we encounter a "refcount"
format fork, we have to walk the (realtime) refcount btree inode to
build the appropriate mappings.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/rmap_repair.c |   36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)


diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index 7733334a1faa9..0f3783aaaa997 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -32,6 +32,7 @@
 #include "xfs_ag.h"
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtgroup.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -547,6 +548,39 @@ xrep_rmap_scan_rtrmapbt(
 	return -EFSCORRUPTED;
 }
 
+static int
+xrep_rmap_scan_rtrefcountbt(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_inode	*ip)
+{
+	struct xfs_scrub	*sc = rf->rr->sc;
+	struct xfs_btree_cur	*cur;
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	if (rf->whichfork != XFS_DATA_FORK)
+		return -EFSCORRUPTED;
+
+	for_each_rtgroup(sc->mp, rgno, rtg) {
+		if (ip == rtg->rtg_refcountip) {
+			cur = xfs_rtrefcountbt_init_cursor(sc->mp, sc->tp, rtg,
+					ip);
+			error = xrep_rmap_scan_iroot_btree(rf, cur);
+			xfs_btree_del_cursor(cur, error);
+			xfs_rtgroup_rele(rtg);
+			return error;
+		}
+	}
+
+	/*
+	 * We shouldn't find a refcount format inode that isn't associated with
+	 * an rtgroup!
+	 */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /* Find all the extents from a given AG in an inode fork. */
 STATIC int
 xrep_rmap_scan_ifork(
@@ -578,6 +612,8 @@ xrep_rmap_scan_ifork(
 			return error;
 	} else if (ifp->if_format == XFS_DINODE_FMT_RMAP) {
 		return xrep_rmap_scan_rtrmapbt(&rf, ip);
+	} else if (ifp->if_format == XFS_DINODE_FMT_REFCOUNT) {
+		return xrep_rmap_scan_rtrefcountbt(&rf, ip);
 	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
 		return 0;
 	}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/44] xfs: capture realtime CoW staging extents when rebuilding rt rmapbt
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-31 21:54   ` [PATCH 38/44] xfs: walk the rt reference count tree when rebuilding rmap Darrick J. Wong
@ 2023-12-31 21:54   ` Darrick J. Wong
  2023-12-31 21:54   ` [PATCH 40/44] xfs: online repair of the realtime refcount btree Darrick J. Wong
                     ` (4 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:54 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Walk the realtime refcount btree to find the CoW staging extents when
we're rebuilding the realtime rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/repair.h        |    1 
 fs/xfs/scrub/rgb_bitmap.h    |   37 +++++++++++++++
 fs/xfs/scrub/rtrmap_repair.c |  103 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 fs/xfs/scrub/rgb_bitmap.h


diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index a382ba0478aa0..61c6bc31a266b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -49,6 +49,7 @@ xrep_trans_commit(
 
 struct xbitmap;
 struct xagb_bitmap;
+struct xrgb_bitmap;
 struct xfsb_bitmap;
 
 int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h
new file mode 100644
index 0000000000000..47a5caf3a230d
--- /dev/null
+++ b/fs/xfs/scrub/rgb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RGB_BITMAP_H__
+#define __XFS_SCRUB_RGB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rgblock_t */
+
+struct xrgb_bitmap {
+	struct xbitmap32	rgbitmap;
+};
+
+static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap)
+{
+	xbitmap32_init(&bitmap->rgbitmap);
+}
+
+static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap)
+{
+	xbitmap32_destroy(&bitmap->rgbitmap);
+}
+
+static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap,
+		xfs_rgblock_t start, xfs_extlen_t len)
+{
+	return xbitmap32_set(&bitmap->rgbitmap, start, len);
+}
+
+static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap,
+		xbitmap32_walk_fn fn, void *priv)
+{
+	return xbitmap32_walk(&bitmap->rgbitmap, fn, priv);
+}
+
+#endif	/* __XFS_SCRUB_RGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index 42df1cf45ae0b..a40afa571b981 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -29,6 +29,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_ag.h"
 #include "xfs_rtgroup.h"
+#include "xfs_refcount.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -37,6 +38,7 @@
 #include "scrub/repair.h"
 #include "scrub/bitmap.h"
 #include "scrub/fsb_bitmap.h"
+#include "scrub/rgb_bitmap.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
 #include "scrub/iscan.h"
@@ -436,6 +438,100 @@ xrep_rtrmap_scan_ag(
 	return error;
 }
 
+struct xrep_rtrmap_stash_run {
+	struct xrep_rtrmap	*rr;
+	uint64_t		owner;
+};
+
+static int
+xrep_rtrmap_stash_run(
+	uint32_t			start,
+	uint32_t			len,
+	void				*priv)
+{
+	struct xrep_rtrmap_stash_run	*rsr = priv;
+	struct xrep_rtrmap		*rr = rsr->rr;
+	xfs_rgblock_t			rgbno = start;
+
+	return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rtrmap_stash_bitmap(
+	struct xrep_rtrmap		*rr,
+	struct xrgb_bitmap		*bitmap,
+	const struct xfs_owner_info	*oinfo)
+{
+	struct xrep_rtrmap_stash_run	rsr = {
+		.rr			= rr,
+		.owner			= oinfo->oi_owner,
+	};
+
+	return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrmap_walk_cowblocks(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec,
+	void				*priv)
+{
+	struct xrgb_bitmap		*bitmap = priv;
+
+	if (!xfs_refcount_check_domain(irec) ||
+	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
+		return -EFSCORRUPTED;
+
+	return xrgb_bitmap_set(bitmap, irec->rc_startblock,
+			irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rtrmap_find_refcount_rmaps(
+	struct xrep_rtrmap	*rr)
+{
+	struct xrgb_bitmap	cow_blocks;		/* COWBIT */
+	struct xfs_refcount_irec low = {
+		.rc_startblock	= 0,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_refcount_irec high = {
+		.rc_startblock	= -1U,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	if (!xfs_has_rtreflink(sc->mp))
+		return 0;
+
+	xrgb_bitmap_init(&cow_blocks);
+
+	/* Collect rmaps for CoW staging extents. */
+	error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high,
+			xrep_rtrmap_walk_cowblocks, &cow_blocks);
+	if (error)
+		goto out_bitmap;
+
+	/* Generate rmaps for everything. */
+	error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+	if (error)
+		goto out_bitmap;
+
+out_bitmap:
+	xrgb_bitmap_destroy(&cow_blocks);
+	return error;
+}
+
 /* Count and check all collected records. */
 STATIC int
 xrep_rtrmap_check_record(
@@ -483,6 +579,13 @@ xrep_rtrmap_find_rmaps(
 	if (error)
 		return error;
 
+	/* Find CoW staging extents. */
+	xrep_rtgroup_btcur_init(sc, &sc->sr);
+	error = xrep_rtrmap_find_refcount_rmaps(rr);
+	xchk_rtgroup_btcur_free(&sc->sr);
+	if (error)
+		return error;
+
 	/*
 	 * Set up for a potentially lengthy filesystem scan by reducing our
 	 * transaction resource usage for the duration.  Specifically:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 40/44] xfs: online repair of the realtime refcount btree
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (38 preceding siblings ...)
  2023-12-31 21:54   ` [PATCH 39/44] xfs: capture realtime CoW staging extents when rebuilding rt rmapbt Darrick J. Wong
@ 2023-12-31 21:54   ` Darrick J. Wong
  2023-12-31 21:55   ` [PATCH 41/44] xfs: repair inodes that have a refcount btree in the data fork Darrick J. Wong
                     ` (3 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:54 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Port the data device's refcount btree repair code to the realtime
refcount btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile                  |    1 
 fs/xfs/scrub/bmap_repair.c       |    2 
 fs/xfs/scrub/repair.c            |   20 +
 fs/xfs/scrub/repair.h            |    7 
 fs/xfs/scrub/rtrefcount.c        |    9 
 fs/xfs/scrub/rtrefcount_repair.c |  781 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/rtrmap_repair.c     |    2 
 fs/xfs/scrub/scrub.c             |    2 
 fs/xfs/scrub/trace.h             |   31 ++
 9 files changed, 847 insertions(+), 8 deletions(-)
 create mode 100644 fs/xfs/scrub/rtrefcount_repair.c


diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4d4f340d904fc..a779030fd91db 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -237,6 +237,7 @@ xfs-y				+= $(addprefix scrub/, \
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 				   rgsuper_repair.o \
 				   rtbitmap_repair.o \
+				   rtrefcount_repair.o \
 				   rtrmap_repair.o \
 				   rtsummary_repair.o \
 				   )
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 26dc04923944b..98618821ad975 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -397,7 +397,7 @@ xrep_bmap_check_rtfork_rmap(
 	/* Make sure this isn't free space. */
 	rtbno = xfs_rgbno_to_rtb(sc->mp, cur->bc_ino.rtg->rtg_rgno,
 			rec->rm_startblock);
-	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount);
+	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount, false);
 }
 
 /* Record realtime extents that belong to this inode's fork. */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index fdd12933fd47f..72aa1e522910b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1039,21 +1039,31 @@ xrep_rtgroup_init(
 	return 0;
 }
 
-/* Ensure that all rt blocks in the given range are not marked free. */
+/*
+ * Ensure that all rt blocks in the given range are not marked free.  If
+ * @must_align is true, then both ends must be aligned to a rt extent.
+ */
 int
 xrep_require_rtext_inuse(
 	struct xfs_scrub	*sc,
 	xfs_rtblock_t		rtbno,
-	xfs_filblks_t		len)
+	xfs_filblks_t		len,
+	bool			must_align)
 {
 	struct xfs_mount	*mp = sc->mp;
 	xfs_rtxnum_t		startrtx;
 	xfs_rtxnum_t		endrtx;
+	xfs_extlen_t		mod;
 	bool			is_free = false;
 	int			error;
 
-	startrtx = xfs_rtb_to_rtx(mp, rtbno);
-	endrtx = xfs_rtb_to_rtx(mp, rtbno + len - 1);
+	startrtx = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+	if (must_align && mod != 0)
+		return -EFSCORRUPTED;
+
+	endrtx = xfs_rtb_to_rtxrem(mp, rtbno + len - 1, &mod);
+	if (must_align && mod != mp->m_sb.sb_rextsize - 1)
+		return -EFSCORRUPTED;
 
 	error = xfs_rtalloc_extent_is_free(mp, sc->tp, startrtx,
 			endrtx - startrtx + 1, &is_free);
@@ -1341,6 +1351,8 @@ xrep_is_rtmeta_ino(
 	/* Newer rt metadata files are not guaranteed to exist */
 	if (rtg->rtg_rmapip && ino == rtg->rtg_rmapip->i_ino)
 		return true;
+	if (rtg->rtg_refcountip && ino == rtg->rtg_refcountip->i_ino)
+		return true;
 
 	return false;
 }
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 61c6bc31a266b..969317d429db6 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -98,6 +98,7 @@ int xrep_setup_nlinks(struct xfs_scrub *sc);
 int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
 int xrep_setup_dirtree(struct xfs_scrub *sc);
 int xrep_setup_rtrmapbt(struct xfs_scrub *sc);
+int xrep_setup_rtrefcountbt(struct xfs_scrub *sc);
 
 /* Repair setup functions */
 int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -113,7 +114,7 @@ int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
 		struct xchk_rt *sr, unsigned int rtglock_flags);
 void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr);
 int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
-		xfs_filblks_t len);
+		xfs_filblks_t len, bool must_align);
 xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc);
 #else
 # define xrep_rtgroup_init(sc, rtg, sr, lockflags)	(-ENOSYS)
@@ -160,12 +161,14 @@ int xrep_rtsummary(struct xfs_scrub *sc);
 int xrep_rgsuperblock(struct xfs_scrub *sc);
 int xrep_rgbitmap(struct xfs_scrub *sc);
 int xrep_rtrmapbt(struct xfs_scrub *sc);
+int xrep_rtrefcountbt(struct xfs_scrub *sc);
 #else
 # define xrep_rtbitmap			xrep_notsupported
 # define xrep_rtsummary			xrep_notsupported
 # define xrep_rgsuperblock		xrep_notsupported
 # define xrep_rgbitmap			xrep_notsupported
 # define xrep_rtrmapbt			xrep_notsupported
+# define xrep_rtrefcountbt		xrep_notsupported
 #endif /* CONFIG_XFS_RT */
 
 #ifdef CONFIG_XFS_QUOTA
@@ -239,6 +242,7 @@ xrep_setup_nothing(
 #define xrep_setup_dirtree		xrep_setup_nothing
 #define xrep_setup_metapath		xrep_setup_nothing
 #define xrep_setup_rtrmapbt		xrep_setup_nothing
+#define xrep_setup_rtrefcountbt		xrep_setup_nothing
 
 #define xrep_setup_inode(sc, imap)	((void)0)
 
@@ -278,6 +282,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
 #define xrep_rgsuperblock		xrep_notsupported
 #define xrep_rgbitmap			xrep_notsupported
 #define xrep_rtrmapbt			xrep_notsupported
+#define xrep_rtrefcountbt		xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
index 058a0ea18e09b..8924ad8b680f0 100644
--- a/fs/xfs/scrub/rtrefcount.c
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -7,8 +7,10 @@
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount.h"
@@ -21,6 +23,7 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
+#include "scrub/repair.h"
 
 /* Set us up with the realtime refcount metadata locked. */
 int
@@ -34,6 +37,12 @@ xchk_setup_rtrefcountbt(
 	if (xchk_need_intent_drain(sc))
 		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 
+	if (xchk_could_repair(sc)) {
+		error = xrep_setup_rtrefcountbt(sc);
+		if (error)
+			return error;
+	}
+
 	rtg = xfs_rtgroup_get(mp, sc->sm->sm_agno);
 	if (!rtg)
 		return -ENOENT;
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
new file mode 100644
index 0000000000000..3034ca88be8b4
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+#include "scrub/rcbag.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ *                 -    ---
+ *       --      ----- ----   ---        ------
+ * --   ----     ----------- ----     ---------
+ * -------------------------------- -----------
+ * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
+ * 2 1  23 21    3 43 234  2123  1 01 2  3     0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time.  Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * rt refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ *  - Add to the bag all rmaps in the array where startblock == sp.
+ *  - Set np to the physical block where the bag size will change.  This
+ *    is the minimum of (the pblk of the next unprocessed rmap) and
+ *    (startblock + len of each rmap in the bag).
+ *  - Record the bag size as old_bag_size.
+ *
+ *  - While the bag isn't empty,
+ *     - Remove from the bag all rmaps where startblock + len == np.
+ *     - Add to the bag all rmaps in the array where startblock == np.
+ *     - If the bag size isn't old_bag_size, store the refcount entry
+ *       (sp, np - sp, bag_size) in the refcnt btree.
+ *     - If the bag is empty, break out of the inner loop.
+ *     - Set old_bag_size to the bag size
+ *     - Set sp = np.
+ *     - Set np to the physical block where the bag size will change.
+ *       This is the minimum of (the pblk of the next unprocessed rmap)
+ *       and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the rt refcount btree root and
+ * insert all the records.
+ */
+
+struct xrep_rtrefc {
+	/* refcount extents */
+	struct xfarray		*refcount_records;
+
+	/* new refcountbt information */
+	struct xrep_newbt	new_btree;
+
+	/* old refcountbt blocks */
+	struct xfsb_bitmap	old_rtrefcountbt_blocks;
+
+	struct xfs_scrub	*sc;
+
+	/* get_records()'s position in the rt refcount record array. */
+	xfarray_idx_t		array_cur;
+
+	/* # of refcountbt blocks */
+	xfs_filblks_t		btblocks;
+};
+
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	char			*descr;
+	int			error;
+
+	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+	error = xrep_setup_buftarg(sc, descr);
+	kfree(descr);
+	return error;
+}
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_rtrefc_check_ext(
+	struct xfs_scrub		*sc,
+	const struct xfs_refcount_irec	*rec)
+{
+	xfs_rtblock_t			rtbno;
+
+	if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL)
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space or misaligned. */
+	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno,
+			rec->rc_startblock);
+	return xrep_require_rtext_inuse(sc, rtbno, rec->rc_blockcount, true);
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_rtrefc_stash(
+	struct xrep_rtrefc		*rr,
+	enum xfs_refc_domain		domain,
+	xfs_rgblock_t			bno,
+	xfs_extlen_t			len,
+	uint64_t			refcount)
+{
+	struct xfs_refcount_irec	irec = {
+		.rc_startblock		= bno,
+		.rc_blockcount		= len,
+		.rc_refcount		= refcount,
+		.rc_domain		= domain,
+	};
+	int				error = 0;
+
+	if (xchk_should_terminate(rr->sc, &error))
+		return error;
+
+	irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
+
+	error = xrep_rtrefc_check_ext(rr->sc, &irec);
+	if (error)
+		return error;
+
+	trace_xrep_rtrefc_found(rr->sc->sr.rtg, &irec);
+
+	return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrefc_stash_cow(
+	struct xrep_rtrefc		*rr,
+	xfs_rgblock_t			bno,
+	xfs_extlen_t			len)
+{
+	return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_rtrefc_rmap_shareable(
+	const struct xfs_rmap_irec	*rmap)
+{
+	/* rt metadata are never sharable */
+	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+		return false;
+
+	/* Unwritten file blocks are not shareable. */
+	if (rmap->rm_flags & XFS_RMAP_UNWRITTEN)
+		return false;
+
+	return true;
+}
+
+/* Grab the next (abbreviated) rmap record from the rmapbt. */
+STATIC int
+xrep_rtrefc_walk_rmaps(
+	struct xrep_rtrefc	*rr,
+	struct xfs_rmap_irec	*rmap,
+	bool			*have_rec)
+{
+	struct xfs_btree_cur	*cur = rr->sc->sr.rmap_cur;
+	struct xfs_mount	*mp = cur->bc_mp;
+	int			have_gt;
+	int			error = 0;
+
+	*have_rec = false;
+
+	/*
+	 * Loop through the remaining rmaps.  Remember CoW staging
+	 * extents and the refcountbt blocks from the old tree for later
+	 * disposal.  We can only share written data fork extents, so
+	 * keep looping until we find an rmap for one.
+	 */
+	do {
+		if (xchk_should_terminate(rr->sc, &error))
+			return error;
+
+		error = xfs_btree_increment(cur, 0, &have_gt);
+		if (error)
+			return error;
+		if (!have_gt)
+			return 0;
+
+		error = xfs_rmap_get_rec(cur, rmap, &have_gt);
+		if (error)
+			return error;
+		if (XFS_IS_CORRUPT(mp, !have_gt)) {
+			xfs_btree_mark_sick(cur);
+			return -EFSCORRUPTED;
+		}
+
+		if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+			error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock,
+					rmap->rm_blockcount);
+			if (error)
+				return error;
+		} else if (xfs_internal_inum(mp, rmap->rm_owner) ||
+			   (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+					      XFS_RMAP_BMBT_BLOCK))) {
+			xfs_btree_mark_sick(cur);
+			return -EFSCORRUPTED;
+		}
+	} while (!xrep_rtrefc_rmap_shareable(rmap));
+
+	*have_rec = true;
+	return 0;
+}
+
+static inline uint32_t
+xrep_rtrefc_encode_startblock(
+	const struct xfs_refcount_irec	*irec)
+{
+	uint32_t			start;
+
+	start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+	if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+		start |= XFS_REFC_COWFLAG;
+
+	return start;
+}
+
+/*
+ * Compare two refcount records.  We want to sort in order of increasing block
+ * number.
+ */
+static int
+xrep_rtrefc_extent_cmp(
+	const void			*a,
+	const void			*b)
+{
+	const struct xfs_refcount_irec	*ap = a;
+	const struct xfs_refcount_irec	*bp = b;
+	uint32_t			sa, sb;
+
+	sa = xrep_rtrefc_encode_startblock(ap);
+	sb = xrep_rtrefc_encode_startblock(bp);
+
+	if (sa > sb)
+		return 1;
+	if (sa < sb)
+		return -1;
+	return 0;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order.  Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_rtrefc_sort_records(
+	struct xrep_rtrefc		*rr)
+{
+	struct xfs_refcount_irec	irec;
+	xfarray_idx_t			cur;
+	enum xfs_refc_domain		dom = XFS_REFC_DOMAIN_SHARED;
+	xfs_rgblock_t			next_rgbno = 0;
+	int				error;
+
+	error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp,
+			XFARRAY_SORT_KILLABLE);
+	if (error)
+		return error;
+
+	foreach_xfarray_idx(rr->refcount_records, cur) {
+		if (xchk_should_terminate(rr->sc, &error))
+			return error;
+
+		error = xfarray_load(rr->refcount_records, cur, &irec);
+		if (error)
+			return error;
+
+		if (dom == XFS_REFC_DOMAIN_SHARED &&
+		    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+			dom = irec.rc_domain;
+			next_rgbno = 0;
+		}
+
+		if (dom != irec.rc_domain)
+			return -EFSCORRUPTED;
+		if (irec.rc_startblock < next_rgbno)
+			return -EFSCORRUPTED;
+
+		next_rgbno = irec.rc_startblock + irec.rc_blockcount;
+	}
+
+	return error;
+}
+
+/* Record extents that belong to the realtime refcount inode. */
+STATIC int
+xrep_rtrefc_walk_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rtrefc		*rr = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_fsblock_t			fsbno;
+	int				error = 0;
+
+	if (xchk_should_terminate(rr->sc, &error))
+		return error;
+
+	/* Skip extents which are not owned by this inode and fork. */
+	if (rec->rm_owner != rr->sc->ip->i_ino)
+		return 0;
+
+	error = xrep_check_ino_btree_mapping(rr->sc, rec);
+	if (error)
+		return error;
+
+	fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+			rec->rm_startblock);
+
+	return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, fsbno,
+			rec->rm_blockcount);
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag.  These represent the file(s) that share ownership of
+ * the current block.  Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_rtrefc_push_rmaps_at(
+	struct xrep_rtrefc	*rr,
+	struct rcbag		*rcstack,
+	xfs_rgblock_t		bno,
+	struct xfs_rmap_irec	*rmap,
+	bool			*have)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			have_gt;
+	int			error;
+
+	while (*have && rmap->rm_startblock == bno) {
+		error = rcbag_add(rcstack, rr->sc->tp, rmap);
+		if (error)
+			return error;
+
+		error = xrep_rtrefc_walk_rmaps(rr, rmap, have);
+		if (error)
+			return error;
+	}
+
+	error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt);
+	if (error)
+		return error;
+	if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+		xfs_btree_mark_sick(sc->sr.rmap_cur);
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
+/* Scan one AG for reverse mappings for the realtime refcount btree. */
+STATIC int
+xrep_rtrefc_scan_ag(
+	struct xrep_rtrefc	*rr,
+	struct xfs_perag	*pag)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	error = xrep_ag_init(sc, pag, &sc->sa);
+	if (error)
+		return error;
+
+	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr);
+	xchk_ag_free(sc, &sc->sa);
+	return error;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_rtrefc_find_refcounts(
+	struct xrep_rtrefc	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct rcbag		*rcstack;
+	struct xfs_perag	*pag;
+	uint64_t		old_stack_height;
+	xfs_rgblock_t		sbno;
+	xfs_rgblock_t		cbno;
+	xfs_rgblock_t		nbno;
+	xfs_agnumber_t		agno;
+	bool			have;
+	int			error;
+
+	/* Scan for old rtrefc btree blocks. */
+	for_each_perag(sc->mp, agno, pag) {
+		error = xrep_rtrefc_scan_ag(rr, pag);
+		if (error) {
+			xfs_perag_rele(pag);
+			return error;
+		}
+	}
+
+	xrep_rtgroup_btcur_init(sc, &sc->sr);
+
+	/*
+	 * Set up a bag to store all the rmap records that we're tracking to
+	 * generate a reference count record.  If this exceeds
+	 * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
+	 */
+	error = rcbag_init(sc->mp, sc->xfile_buftarg, &rcstack);
+	if (error)
+		goto out_cur;
+
+	/* Start the rtrmapbt cursor to the left of all records. */
+	error = xfs_btree_goto_left_edge(sc->sr.rmap_cur);
+	if (error)
+		goto out_bag;
+
+	/* Process reverse mappings into refcount data. */
+	while (xfs_btree_has_more_records(sc->sr.rmap_cur)) {
+		struct xfs_rmap_irec	rmap;
+
+		/* Push all rmaps with pblk == sbno onto the stack */
+		error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+		if (error)
+			goto out_bag;
+		if (!have)
+			break;
+		sbno = cbno = rmap.rm_startblock;
+		error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+				&have);
+		if (error)
+			goto out_bag;
+
+		/* Set nbno to the bno of the next refcount change */
+		error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
+		if (error)
+			goto out_bag;
+
+		ASSERT(nbno > sbno);
+		old_stack_height = rcbag_count(rcstack);
+
+		/* While stack isn't empty... */
+		while (rcbag_count(rcstack) > 0) {
+			/* Pop all rmaps that end at nbno */
+			error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
+			if (error)
+				goto out_bag;
+
+			/* Push array items that start at nbno */
+			error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+			if (error)
+				goto out_bag;
+			if (have) {
+				error = xrep_rtrefc_push_rmaps_at(rr, rcstack,
+						nbno, &rmap, &have);
+				if (error)
+					goto out_bag;
+			}
+
+			/* Emit refcount if necessary */
+			ASSERT(nbno > cbno);
+			if (rcbag_count(rcstack) != old_stack_height) {
+				if (old_stack_height > 1) {
+					error = xrep_rtrefc_stash(rr,
+							XFS_REFC_DOMAIN_SHARED,
+							cbno, nbno - cbno,
+							old_stack_height);
+					if (error)
+						goto out_bag;
+				}
+				cbno = nbno;
+			}
+
+			/* Stack empty, go find the next rmap */
+			if (rcbag_count(rcstack) == 0)
+				break;
+			old_stack_height = rcbag_count(rcstack);
+			sbno = nbno;
+
+			/* Set nbno to the bno of the next refcount change */
+			error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
+					&nbno);
+			if (error)
+				goto out_bag;
+
+			ASSERT(nbno > sbno);
+		}
+	}
+
+	ASSERT(rcbag_count(rcstack) == 0);
+out_bag:
+	rcbag_free(&rcstack);
+out_cur:
+	xchk_rtgroup_btcur_free(&sc->sr);
+	return error;
+}
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_rtrefc_get_records(
+	struct xfs_btree_cur		*cur,
+	unsigned int			idx,
+	struct xfs_btree_block		*block,
+	unsigned int			nr_wanted,
+	void				*priv)
+{
+	struct xrep_rtrefc		*rr = priv;
+	union xfs_btree_rec		*block_rec;
+	unsigned int			loaded;
+	int				error;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		error = xfarray_load(rr->refcount_records, rr->array_cur++,
+				&cur->bc_rec.rc);
+		if (error)
+			return error;
+
+		block_rec = xfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrefc_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rtrefc	*rr = priv;
+
+	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrefc_iroot_size(
+	struct xfs_btree_cur	*cur,
+	unsigned int		level,
+	unsigned int		nr_this_level,
+	void			*priv)
+{
+	return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level,
+			nr_this_level);
+}
+
+/*
+ * Use the collected refcount information to stage a new rt refcount btree.  If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_rtrefc_build_new_tree(
+	struct xrep_rtrefc	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
+	struct xfs_btree_cur	*refc_cur;
+	int			error;
+
+	error = xrep_rtrefc_sort_records(rr);
+	if (error)
+		return error;
+
+	/*
+	 * Prepare to construct the new btree by reserving disk space for the
+	 * new btree and setting up all the accounting information we'll need
+	 * to root the new btree while it's under construction and before we
+	 * attach it to the realtime refcount inode.
+	 */
+	error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+	if (error)
+		return error;
+
+	rr->new_btree.bload.get_records = xrep_rtrefc_get_records;
+	rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block;
+	rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size;
+
+	/* Compute how many blocks we'll need. */
+	refc_cur = xfs_rtrefcountbt_stage_cursor(mp, rtg, rtg->rtg_refcountip,
+			&rr->new_btree.ifake);
+	error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload,
+			xfarray_length(rr->refcount_records));
+	if (error)
+		goto err_cur;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		goto err_cur;
+
+	/*
+	 * Guess how many blocks we're going to need to rebuild an entire
+	 * rtrefcountbt from the number of extents we found, and pump up our
+	 * transaction to have sufficient block reservation.  We're allowed
+	 * to exceed quota to repair inconsistent metadata, though this is
+	 * unlikely.
+	 */
+	error = xfs_trans_reserve_more_inode(sc->tp, rtg->rtg_refcountip,
+			rr->new_btree.bload.nr_blocks, 0, true);
+	if (error)
+		goto err_cur;
+
+	/* Reserve the space we'll need for the new btree. */
+	error = xrep_newbt_alloc_blocks(&rr->new_btree,
+			rr->new_btree.bload.nr_blocks);
+	if (error)
+		goto err_cur;
+
+	/* Add all observed refcount records. */
+	rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_REFCOUNT;
+	rr->array_cur = XFARRAY_CURSOR_INIT;
+	error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Install the new rtrefc btree in the inode.  After this point the old
+	 * btree is no longer accessible, the new tree is live, and we can
+	 * delete the cursor.
+	 */
+	xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp);
+	xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+	xfs_btree_del_cursor(refc_cur, 0);
+
+	/* Dispose of any unused blocks and the accounting information. */
+	error = xrep_newbt_commit(&rr->new_btree);
+	if (error)
+		return error;
+
+	return xrep_roll_trans(sc);
+err_cur:
+	xfs_btree_del_cursor(refc_cur, error);
+	xrep_newbt_cancel(&rr->new_btree);
+	return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_rtrefc_remove_old_tree(
+	struct xrep_rtrefc	*rr)
+{
+	int			error;
+
+	/*
+	 * Free all the extents that were allocated to the former rtrefcountbt
+	 * and aren't cross-linked with something else.
+	 */
+	error = xrep_reap_metadir_fsblocks(rr->sc,
+			&rr->old_rtrefcountbt_blocks);
+	if (error)
+		return error;
+
+	/*
+	 * Ensure the proper reservation for the rtrefcount inode so that we
+	 * don't fail to expand the btree.
+	 */
+	return xrep_reset_imeta_reservation(rr->sc);
+}
+
+/* Rebuild the rt refcount btree. */
+int
+xrep_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rtrefc	*rr;
+	struct xfs_mount	*mp = sc->mp;
+	char			*descr;
+	int			error;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_has_rtrmapbt(mp))
+		return -EOPNOTSUPP;
+
+	/* Make sure any problems with the fork are fixed. */
+	error = xrep_metadata_inode_forks(sc);
+	if (error)
+		return error;
+
+	rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS);
+	if (!rr)
+		return -ENOMEM;
+	rr->sc = sc;
+
+	/* Set up enough storage to handle one refcount record per rt extent. */
+	descr = xchk_xfile_ag_descr(sc, "reference count records");
+	error = xfarray_create(descr, mp->m_sb.sb_rextents,
+			sizeof(struct xfs_refcount_irec),
+			&rr->refcount_records);
+	kfree(descr);
+	if (error)
+		goto out_rr;
+
+	/* Collect all reference counts. */
+	xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks);
+	error = xrep_rtrefc_find_refcounts(rr);
+	if (error)
+		goto out_bitmap;
+
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	/* Rebuild the refcount information. */
+	error = xrep_rtrefc_build_new_tree(rr);
+	if (error)
+		goto out_bitmap;
+
+	/* Kill the old tree. */
+	error = xrep_rtrefc_remove_old_tree(rr);
+	if (error)
+		goto out_bitmap;
+
+out_bitmap:
+	xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks);
+	xfarray_destroy(rr->refcount_records);
+out_rr:
+	kfree(rr);
+	return error;
+}
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index a40afa571b981..c29e398283018 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -137,7 +137,7 @@ xrep_rtrmap_check_mapping(
 	/* Make sure this isn't free space. */
 	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno,
 			rec->rm_startblock);
-	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount);
+	return xrep_require_rtext_inuse(sc, rtbno, rec->rm_blockcount, false);
 }
 
 /* Store a reverse-mapping record. */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 2611e0223489c..94a733975879a 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -486,7 +486,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_rtrefcountbt,
 		.scrub	= xchk_rtrefcountbt,
 		.has	= xfs_has_rtreflink,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_rtrefcountbt,
 	},
 };
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 2d373c9a53ad6..aa943433cbb02 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -3995,6 +3995,37 @@ TRACE_EVENT(xrep_rtrmap_live_update,
 		  __entry->offset,
 		  __entry->flags)
 );
+
+TRACE_EVENT(xrep_rtrefc_found,
+	TP_PROTO(struct xfs_rtgroup *rtg, const struct xfs_refcount_irec *rec),
+	TP_ARGS(rtg, rec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(enum xfs_refc_domain, domain)
+		__field(xfs_rgblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->domain = rec->rc_domain;
+		__entry->startblock = rec->rc_startblock;
+		__entry->blockcount = rec->rc_blockcount;
+		__entry->refcount = rec->rc_refcount;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x dom %s rgbno 0x%x fsbcount 0x%x refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rgno,
+		  __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount)
+);
 #endif /* CONFIG_XFS_RT */
 
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 41/44] xfs: repair inodes that have a refcount btree in the data fork
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (39 preceding siblings ...)
  2023-12-31 21:54   ` [PATCH 40/44] xfs: online repair of the realtime refcount btree Darrick J. Wong
@ 2023-12-31 21:55   ` Darrick J. Wong
  2023-12-31 21:55   ` [PATCH 42/44] xfs: check for shared rt extents when rebuilding rt file's " Darrick J. Wong
                     ` (2 subsequent siblings)
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:55 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb knowledge of refcount btrees into the inode core repair code.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/inode_repair.c |   47 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)


diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index b6b37652c4a35..a182a9551c08c 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -40,6 +40,7 @@
 #include "xfs_symlink_remote.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -924,6 +925,37 @@ xrep_dinode_bad_rmapbt_fork(
 	return false;
 }
 
+/* Return true if this refcount-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_refcountbt_fork(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip,
+	unsigned int		dfork_size,
+	int			whichfork)
+{
+	struct xfs_rtrefcount_root *dfp;
+	unsigned int		nrecs;
+	unsigned int		level;
+
+	if (whichfork != XFS_DATA_FORK)
+		return true;
+	if (dfork_size < sizeof(struct xfs_rtrefcount_root))
+		return true;
+
+	dfp = XFS_DFORK_PTR(dip, whichfork);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	if (level > sc->mp->m_rtrefc_maxlevels)
+		return true;
+	if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size)
+		return true;
+	if (level > 0 && nrecs == 0)
+		return true;
+
+	return false;
+}
+
 /*
  * Check the data fork for things that will fail the ifork verifiers or the
  * ifork formatters.
@@ -1009,6 +1041,11 @@ xrep_dinode_check_dfork(
 				XFS_DATA_FORK))
 			return true;
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		if (xrep_dinode_bad_refcountbt_fork(sc, dip, dfork_size,
+				XFS_DATA_FORK))
+			return true;
+		break;
 	default:
 		return true;
 	}
@@ -1134,6 +1171,11 @@ xrep_dinode_check_afork(
 				XFS_ATTR_FORK))
 			return true;
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		if (xrep_dinode_bad_refcountbt_fork(sc, dip, afork_size,
+				XFS_ATTR_FORK))
+			return true;
+		break;
 	default:
 		return true;
 	}
@@ -1182,6 +1224,7 @@ xrep_dinode_ensure_forkoff(
 {
 	struct xfs_bmdr_block	*bmdr;
 	struct xfs_rtrmap_root	*rmdr;
+	struct xfs_rtrefcount_root *rcdr;
 	struct xfs_scrub	*sc = ri->sc;
 	xfs_extnum_t		attr_extents, data_extents;
 	size_t			bmdr_minsz = xfs_bmdr_space_calc(1);
@@ -1292,6 +1335,10 @@ xrep_dinode_ensure_forkoff(
 		rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
 		dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
 		break;
+	case XFS_DINODE_FMT_REFCOUNT:
+		rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+		dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr);
+		break;
 	default:
 		dfork_min = 0;
 		break;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 42/44] xfs: check for shared rt extents when rebuilding rt file's data fork
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (40 preceding siblings ...)
  2023-12-31 21:55   ` [PATCH 41/44] xfs: repair inodes that have a refcount btree in the data fork Darrick J. Wong
@ 2023-12-31 21:55   ` Darrick J. Wong
  2023-12-31 21:55   ` [PATCH 43/44] xfs: fix CoW forks for realtime files Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 44/44] xfs: enable realtime reflink Darrick J. Wong
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:55 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're rebuilding the data fork of a realtime file, we need to
cross-reference each mapping with the rt refcount btree to ensure that
the reflink flag is set if there are any shared extents found.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/bmap_repair.c |   23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)


diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 98618821ad975..c4f9a8ba8cf73 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -101,14 +101,23 @@ xrep_bmap_discover_shared(
 	xfs_filblks_t		blockcount)
 {
 	struct xfs_scrub	*sc = rb->sc;
+	struct xfs_btree_cur	*cur;
 	xfs_agblock_t		agbno;
 	xfs_agblock_t		fbno;
 	xfs_extlen_t		flen;
 	int			error;
 
-	agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
-	error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
-			&fbno, &flen, false);
+	if (XFS_IS_REALTIME_INODE(sc->ip)) {
+		xfs_rgnumber_t		rgno;
+
+		agbno = xfs_rtb_to_rgbno(sc->mp, startblock, &rgno);
+		cur = sc->sr.refc_cur;
+	} else {
+		agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+		cur = sc->sa.refc_cur;
+	}
+	error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen,
+			false);
 	if (error)
 		return error;
 
@@ -456,7 +465,9 @@ xrep_bmap_scan_rtgroup(
 		return 0;
 
 	error = xrep_rtgroup_init(sc, rtg, &sc->sr,
-			XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED);
+			XFS_RTGLOCK_RMAP |
+			XFS_RTGLOCK_REFCOUNT |
+			XFS_RTGLOCK_BITMAP_SHARED);
 	if (error)
 		return error;
 
@@ -900,10 +911,6 @@ xrep_bmap_init_reflink_scan(
 	if (whichfork != XFS_DATA_FORK)
 		return RLS_IRRELEVANT;
 
-	/* cannot share realtime extents */
-	if (XFS_IS_REALTIME_INODE(sc->ip))
-		return RLS_IRRELEVANT;
-
 	return RLS_UNKNOWN;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 43/44] xfs: fix CoW forks for realtime files
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (41 preceding siblings ...)
  2023-12-31 21:55   ` [PATCH 42/44] xfs: check for shared rt extents when rebuilding rt file's " Darrick J. Wong
@ 2023-12-31 21:55   ` Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 44/44] xfs: enable realtime reflink Darrick J. Wong
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:55 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Port the copy on write fork repair to realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/cow_repair.c |  210 +++++++++++++++++++++++++++++++++++++++----
 fs/xfs/scrub/reap.c       |  222 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/reap.h       |    7 +
 fs/xfs/scrub/repair.h     |    1 
 fs/xfs/scrub/rtb_bitmap.h |   37 ++++++++
 fs/xfs/scrub/trace.h      |   72 +++++++++++++++
 6 files changed, 532 insertions(+), 17 deletions(-)
 create mode 100644 fs/xfs/scrub/rtb_bitmap.h


diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index e48d869986f34..4be05fd8d0490 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -26,6 +26,9 @@
 #include "xfs_errortag.h"
 #include "xfs_icache.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -34,6 +37,7 @@
 #include "scrub/bitmap.h"
 #include "scrub/off_bitmap.h"
 #include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
 #include "scrub/reap.h"
 
 /*
@@ -61,7 +65,10 @@ struct xrep_cow {
 	struct xoff_bitmap	bad_fileoffs;
 
 	/* Bitmap of fsblocks that were removed from the CoW fork. */
-	struct xfsb_bitmap	old_cowfork_fsblocks;
+	union {
+		struct xfsb_bitmap	old_cowfork_fsblocks;
+		struct xrtb_bitmap	old_cowfork_rtblocks;
+	};
 
 	/* CoW fork mappings used to scan for bad CoW staging extents. */
 	struct xfs_bmbt_irec	irec;
@@ -145,8 +152,12 @@ xrep_cow_mark_shared_staging(
 
 	xrep_cow_trim_refcount(xc, &rrec, rec);
 
-	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
-			rrec.rc_startblock);
+	if (XFS_IS_REALTIME_INODE(xc->sc->ip))
+		fsbno = xfs_rgbno_to_rtb(xc->sc->mp, cur->bc_ino.rtg->rtg_rgno,
+				rrec.rc_startblock);
+	else
+		fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+				rrec.rc_startblock);
 	return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
 }
 
@@ -166,6 +177,7 @@ xrep_cow_mark_missing_staging(
 {
 	struct xrep_cow			*xc = priv;
 	struct xfs_refcount_irec	rrec;
+	xfs_fsblock_t			fsbno;
 	int				error;
 
 	if (!xfs_refcount_check_domain(rec) ||
@@ -177,9 +189,13 @@ xrep_cow_mark_missing_staging(
 	if (xc->next_bno >= rrec.rc_startblock)
 		goto next;
 
-	error = xrep_cow_mark_file_range(xc,
-			XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
-				       xc->next_bno),
+	if (XFS_IS_REALTIME_INODE(xc->sc->ip))
+		fsbno = xfs_rgbno_to_rtb(xc->sc->mp, cur->bc_ino.rtg->rtg_rgno,
+				xc->next_bno);
+	else
+		fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+				xc->next_bno);
+	error = xrep_cow_mark_file_range(xc, fsbno,
 			rrec.rc_startblock - xc->next_bno);
 	if (error)
 		return error;
@@ -222,7 +238,12 @@ xrep_cow_mark_missing_staging_rmap(
 		rec_len -= adj;
 	}
 
-	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+	if (XFS_IS_REALTIME_INODE(xc->sc->ip))
+		fsbno = xfs_rgbno_to_rtb(xc->sc->mp, cur->bc_ino.rtg->rtg_rgno,
+				rec_bno);
+	else
+		fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+				rec_bno);
 	return xrep_cow_mark_file_range(xc, fsbno, rec_len);
 }
 
@@ -311,6 +332,97 @@ xrep_cow_find_bad(
 	return 0;
 }
 
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad_rt(
+	struct xrep_cow			*xc)
+{
+	struct xfs_refcount_irec	rc_low = { 0 };
+	struct xfs_refcount_irec	rc_high = { 0 };
+	struct xfs_rmap_irec		rm_low = { 0 };
+	struct xfs_rmap_irec		rm_high = { 0 };
+	struct xfs_scrub		*sc = xc->sc;
+	struct xfs_rtgroup		*rtg;
+	xfs_rgnumber_t			rgno;
+	int				error = 0;
+
+	xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock,
+						&rgno);
+
+	rtg = xfs_rtgroup_get(sc->mp, rgno);
+	if (!rtg)
+		return -EFSCORRUPTED;
+
+	if (xrep_is_rtmeta_ino(sc, rtg, sc->ip->i_ino))
+		goto out_rtg;
+
+	error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+			XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+	if (error)
+		goto out_rtg;
+
+	/* Mark any CoW fork extents that are shared. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+	error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_shared_staging, xc);
+	if (error)
+		goto out_sr;
+
+	/* Make sure there are CoW staging extents for the whole mapping. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+	xc->next_bno = xc->irec_startbno;
+	error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_missing_staging, xc);
+	if (error)
+		goto out_sr;
+
+	if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+		error = xrep_cow_mark_file_range(xc,
+				xfs_rgbno_to_rtb(sc->mp, rtg->rtg_rgno,
+						 xc->next_bno),
+				xc->irec_startbno + xc->irec.br_blockcount -
+				xc->next_bno);
+		if (error)
+			goto out_sr;
+	}
+
+	/* Mark any area has an rmap that isn't a COW staging extent. */
+	rm_low.rm_startblock = xc->irec_startbno;
+	memset(&rm_high, 0xFF, sizeof(rm_high));
+	rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high,
+			xrep_cow_mark_missing_staging_rmap, xc);
+	if (error)
+		goto out_sr;
+
+	/*
+	 * If userspace is forcing us to rebuild the CoW fork or someone
+	 * turned on the debugging knob, replace everything in the
+	 * CoW fork and then scan for staging extents in the refcountbt.
+	 */
+	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+				xc->irec.br_blockcount);
+		if (error)
+			goto out_rtg;
+	}
+
+out_sr:
+	xchk_rtgroup_btcur_free(&sc->sr);
+	xchk_rtgroup_free(sc, &sc->sr);
+out_rtg:
+	xfs_rtgroup_put(rtg);
+	return error;
+}
+
 /*
  * Allocate a replacement CoW staging extent of up to the given number of
  * blocks, and fill out the mapping.
@@ -351,6 +463,46 @@ xrep_cow_alloc(
 	return 0;
 }
 
+/*
+ * Allocate a replacement rt CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc_rt(
+	struct xfs_scrub	*sc,
+	xfs_extlen_t		maxlen,
+	struct xrep_cow_extent	*repl)
+{
+	xfs_rtxnum_t		rtx = NULLRTEXTNO;
+	xfs_rtxlen_t		maxrtx;
+	xfs_rtxlen_t		rtxlen = 0;
+	xfs_rtblock_t		rtbno;
+	xfs_extlen_t		len;
+	int			error;
+
+	maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen);
+	error = xfs_trans_reserve_more(sc->tp, 0, maxrtx);
+	if (error)
+		return error;
+
+	xfs_rtbitmap_lock(sc->tp, sc->mp);
+
+	error = xfs_rtallocate_extent(sc->tp, 0, 1, maxrtx, &rtxlen, 0, 1,
+			&rtx);
+	if (error)
+		return error;
+	if (rtx == NULLRTEXTNO)
+		return -ENOSPC;
+
+	rtbno = xfs_rtx_to_rtb(sc->mp, rtx);
+	len = xfs_rtxlen_to_extlen(sc->mp, rtxlen);
+	xfs_refcount_alloc_cow_extent(sc->tp, true, rtbno, len);
+
+	repl->fsbno = rtbno;
+	repl->len = len;
+	return 0;
+}
+
 /*
  * Look up the current CoW fork mapping so that we only allocate enough to
  * replace a single mapping.  If we don't find a mapping that covers the start
@@ -468,7 +620,10 @@ xrep_cow_replace_range(
 	 */
 	alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
 			  nextoff - startoff);
-	error = xrep_cow_alloc(sc, alloc_len, &repl);
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		error = xrep_cow_alloc_rt(sc, alloc_len, &repl);
+	else
+		error = xrep_cow_alloc(sc, alloc_len, &repl);
 	if (error)
 		return error;
 
@@ -484,8 +639,12 @@ xrep_cow_replace_range(
 		return error;
 
 	/* Note the old CoW staging extents; we'll reap them all later. */
-	error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
-			repl.len);
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks,
+				got.br_startblock, repl.len);
+	else
+		error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks,
+				got.br_startblock, repl.len);
 	if (error)
 		return error;
 
@@ -541,8 +700,12 @@ xrep_bmap_cow(
 	if (!ifp)
 		return 0;
 
-	/* realtime files aren't supported yet */
-	if (XFS_IS_REALTIME_INODE(sc->ip))
+	/*
+	 * Realtime files with large extent sizes are not supported because
+	 * we could encounter an CoW mapping that has been partially written
+	 * out *and* requires replacement, and there's no solution to that.
+	 */
+	if (xfs_inode_has_bigallocunit(sc->ip))
 		return -EOPNOTSUPP;
 
 	/*
@@ -563,7 +726,10 @@ xrep_bmap_cow(
 
 	xc->sc = sc;
 	xoff_bitmap_init(&xc->bad_fileoffs);
-	xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		xrtb_bitmap_init(&xc->old_cowfork_rtblocks);
+	else
+		xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
 
 	for_each_xfs_iext(ifp, &icur, &xc->irec) {
 		if (xchk_should_terminate(sc, &error))
@@ -586,7 +752,10 @@ xrep_bmap_cow(
 		if (xfs_bmap_is_written_extent(&xc->irec))
 			continue;
 
-		error = xrep_cow_find_bad(xc);
+		if (XFS_IS_REALTIME_INODE(sc->ip))
+			error = xrep_cow_find_bad_rt(xc);
+		else
+			error = xrep_cow_find_bad(xc);
 		if (error)
 			goto out_bitmap;
 	}
@@ -601,13 +770,20 @@ xrep_bmap_cow(
 	 * by the refcount btree, not the inode, so it is correct to treat them
 	 * like inode metadata.
 	 */
-	error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
-			&XFS_RMAP_OINFO_COW);
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks,
+				&XFS_RMAP_OINFO_COW);
+	else
+		error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+				&XFS_RMAP_OINFO_COW);
 	if (error)
 		goto out_bitmap;
 
 out_bitmap:
-	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks);
+	else
+		xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
 	xoff_bitmap_destroy(&xc->bad_fileoffs);
 	kmem_free(xc);
 	return error;
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index bb28c2d2b8780..b8166e19726a4 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -34,6 +34,8 @@
 #include "xfs_attr_remote.h"
 #include "xfs_defer.h"
 #include "xfs_imeta.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -41,6 +43,7 @@
 #include "scrub/bitmap.h"
 #include "scrub/agb_bitmap.h"
 #include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
 #include "scrub/reap.h"
 
 /*
@@ -680,6 +683,225 @@ xrep_reap_fsblocks(
 	return 0;
 }
 
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call.  Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed.  Units are rt blocks, not rt extents.
+ */
+STATIC int
+xreap_rgextent_select(
+	struct xreap_state	*rs,
+	xfs_rgblock_t		rgbno,
+	xfs_rgblock_t		rgbno_next,
+	bool			*crosslinked,
+	xfs_extlen_t		*rglenp)
+{
+	struct xfs_scrub	*sc = rs->sc;
+	struct xfs_btree_cur	*cur;
+	xfs_rgblock_t		bno = rgbno + 1;
+	xfs_extlen_t		len = 1;
+	int			error;
+
+	/*
+	 * Determine if there are any other rmap records covering the first
+	 * block of this extent.  If so, the block is crosslinked.
+	 */
+	cur = xfs_rtrmapbt_init_cursor(sc->mp, sc->tp, sc->sr.rtg,
+			sc->sr.rtg->rtg_rmapip);
+	error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
+			crosslinked);
+	if (error)
+		goto out_cur;
+
+	/*
+	 * Figure out how many of the subsequent blocks have the same crosslink
+	 * status.
+	 */
+	while (bno < rgbno_next) {
+		bool		also_crosslinked;
+
+		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+				&also_crosslinked);
+		if (error)
+			goto out_cur;
+
+		if (*crosslinked != also_crosslinked)
+			break;
+
+		len++;
+		bno++;
+	}
+
+	*rglenp = len;
+	trace_xreap_rgextent_select(sc->sr.rtg, rgbno, len, *crosslinked);
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this rtgroup extent as possible.
+ * The number of blocks disposed of will be returned in @rglenp.
+ */
+STATIC int
+xreap_rgextent_iter(
+	struct xreap_state	*rs,
+	xfs_rgblock_t		rgbno,
+	xfs_extlen_t		*rglenp,
+	bool			crosslinked)
+{
+	struct xfs_scrub	*sc = rs->sc;
+	xfs_rtblock_t		rtbno;
+	int			error;
+
+	/*
+	 * The only caller so far is CoW fork repair, so we only know how to
+	 * unlink or free CoW staging extents.  Here we don't have to worry
+	 * about invalidating buffers!
+	 */
+	if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
+		ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
+		return -EFSCORRUPTED;
+	}
+	ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+	rtbno = xfs_rgbno_to_rtb(sc->mp, sc->sr.rtg->rtg_rgno, rgbno);
+
+	/*
+	 * If there are other rmappings, this block is cross linked and must
+	 * not be freed.  Remove the forward and reverse mapping and move on.
+	 */
+	if (crosslinked) {
+		trace_xreap_dispose_unmap_rtextent(sc->sr.rtg, rgbno, *rglenp);
+
+		xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+		rs->deferred++;
+		return 0;
+	}
+
+	trace_xreap_dispose_free_rtextent(sc->sr.rtg, rgbno, *rglenp);
+
+	/*
+	 * The CoW staging extent is not crosslinked.  Use deferred work items
+	 * to remove the refcountbt records (which removes the rmap records)
+	 * and free the extent.  We're not worried about the system going down
+	 * here because log recovery walks the refcount btree to clean out the
+	 * CoW staging extents.
+	 */
+	xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+	error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
+			rs->resv,
+			XFS_FREE_EXTENT_REALTIME |
+			XFS_FREE_EXTENT_SKIP_DISCARD);
+	if (error)
+		return error;
+
+	rs->deferred++;
+	return 0;
+}
+
+#define XREAP_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP | \
+				 XFS_RTGLOCK_RMAP | \
+				 XFS_RTGLOCK_REFCOUNT)
+
+/*
+ * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately.  The extent must
+ * be aligned to a realtime extent.
+ */
+STATIC int
+xreap_rtmeta_extent(
+	uint64_t		rtbno,
+	uint64_t		len,
+	void			*priv)
+{
+	struct xreap_state	*rs = priv;
+	struct xfs_scrub	*sc = rs->sc;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno, &rgno);
+	xfs_rgblock_t		rgbno_next = rgbno + len;
+	int			error = 0;
+
+	ASSERT(sc->ip != NULL);
+	ASSERT(!sc->sr.rtg);
+
+	/*
+	 * We're reaping blocks after repairing file metadata, which means that
+	 * we have to init the xchk_ag structure ourselves.
+	 */
+	sc->sr.rtg = xfs_rtgroup_get(sc->mp, rgno);
+	if (!sc->sr.rtg)
+		return -EFSCORRUPTED;
+
+	xfs_rtgroup_lock(NULL, sc->sr.rtg, XREAP_RTGLOCK_ALL);
+
+	while (rgbno < rgbno_next) {
+		xfs_extlen_t	rglen;
+		bool		crosslinked;
+
+		error = xreap_rgextent_select(rs, rgbno, rgbno_next,
+				&crosslinked, &rglen);
+		if (error)
+			goto out_unlock;
+
+		error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
+		if (error)
+			goto out_unlock;
+
+		if (xreap_want_defer_finish(rs)) {
+			error = xfs_defer_finish(&sc->tp);
+			if (error)
+				goto out_unlock;
+			xreap_defer_finish_reset(rs);
+		} else if (xreap_want_roll(rs)) {
+			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+			if (error)
+				goto out_unlock;
+			xreap_reset(rs);
+		}
+
+		rgbno += rglen;
+	}
+
+out_unlock:
+	xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+	xfs_rtgroup_put(sc->sr.rtg);
+	sc->sr.rtg = NULL;
+	return error;
+}
+
+/*
+ * Dispose of every block of every rt metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_rtblocks(
+	struct xfs_scrub		*sc,
+	struct xrtb_bitmap		*bitmap,
+	const struct xfs_owner_info	*oinfo)
+{
+	struct xreap_state		rs = {
+		.sc			= sc,
+		.oinfo			= oinfo,
+		.resv			= XFS_AG_RESV_NONE,
+	};
+	int				error;
+
+	ASSERT(xfs_has_rmapbt(sc->mp));
+	ASSERT(sc->ip != NULL);
+
+	error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
+	if (error)
+		return error;
+
+	if (xreap_dirty(&rs))
+		return xrep_defer_finish(sc);
+
+	return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
 /*
  * Dispose of every block of an old metadata btree that used to be rooted in a
  * metadata directory file.
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 70e5e6bbb8d38..4c8f62701fb36 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -17,6 +17,13 @@ int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
 int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc,
 		struct xfsb_bitmap *bitmap);
 
+#ifdef CONFIG_XFS_RT
+int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap,
+		const struct xfs_owner_info *oinfo);
+#else
+# define xrep_reap_rtblocks(...)	(-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
 /* Buffer cache scan context. */
 struct xrep_bufscan {
 	/* Disk address for the buffers we want to scan. */
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 969317d429db6..801a9013f37f1 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -51,6 +51,7 @@ struct xbitmap;
 struct xagb_bitmap;
 struct xrgb_bitmap;
 struct xfsb_bitmap;
+struct xrtb_bitmap;
 
 int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
 
diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h
new file mode 100644
index 0000000000000..1313ef605511e
--- /dev/null
+++ b/fs/xfs/scrub/rtb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTB_BITMAP_H__
+#define __XFS_SCRUB_RTB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rtblock_t */
+
+struct xrtb_bitmap {
+	struct xbitmap64	rtbitmap;
+};
+
+static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap)
+{
+	xbitmap64_init(&bitmap->rtbitmap);
+}
+
+static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap)
+{
+	xbitmap64_destroy(&bitmap->rtbitmap);
+}
+
+static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap,
+		xfs_rtblock_t start, xfs_filblks_t len)
+{
+	return xbitmap64_set(&bitmap->rtbitmap, start, len);
+}
+
+static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap,
+		xbitmap64_walk_fn fn, void *priv)
+{
+	return xbitmap64_walk(&bitmap->rtbitmap, fn, priv);
+}
+
+#endif	/* __XFS_SCRUB_RTB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index aa943433cbb02..2c6f7e3b7578d 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2013,6 +2013,41 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
 DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
 DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xrep_rtgroup_extent_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+		 xfs_extlen_t len),
+	TP_ARGS(rtg, rgbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->rgbno = rgbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rgbno 0x%x fsbcount 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rgno,
+		  __entry->rgbno,
+		  __entry->len)
+);
+#define DEFINE_REPAIR_RTGROUP_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_rtgroup_extent_class, name, \
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, \
+		 xfs_extlen_t len), \
+	TP_ARGS(rtg, rgbno, len))
+DEFINE_REPAIR_RTGROUP_EXTENT_EVENT(xreap_dispose_unmap_rtextent);
+DEFINE_REPAIR_RTGROUP_EXTENT_EVENT(xreap_dispose_free_rtextent);
+#endif /* CONFIG_XFS_RT */
+
 DECLARE_EVENT_CLASS(xrep_reap_find_class,
 	TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
 		bool crosslinked),
@@ -2046,6 +2081,43 @@ DEFINE_EVENT(xrep_reap_find_class, name, \
 DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
 DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xrep_rtgroup_reap_find_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, xfs_extlen_t len,
+		 bool crosslinked),
+	TP_ARGS(rtg, rgbno, len, crosslinked),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, rtdev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+		__field(bool, crosslinked)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->rgbno = rgbno;
+		__entry->len = len;
+		__entry->crosslinked = crosslinked;
+	),
+	TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rgbno 0x%x fsbcount 0x%x crosslinked %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+		  __entry->rgno,
+		  __entry->rgbno,
+		  __entry->len,
+		  __entry->crosslinked ? 1 : 0)
+);
+#define DEFINE_REPAIR_RTGROUP_REAP_FIND_EVENT(name) \
+DEFINE_EVENT(xrep_rtgroup_reap_find_class, name, \
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, \
+		 xfs_extlen_t len, bool crosslinked), \
+	TP_ARGS(rtg, rgbno, len, crosslinked))
+DEFINE_REPAIR_RTGROUP_REAP_FIND_EVENT(xreap_rgextent_select);
+#endif /* CONFIG_XFS_RT */
+
 DECLARE_EVENT_CLASS(xrep_rmap_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 xfs_agblock_t agbno, xfs_extlen_t len,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 44/44] xfs: enable realtime reflink
  2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
                     ` (42 preceding siblings ...)
  2023-12-31 21:55   ` [PATCH 43/44] xfs: fix CoW forks for realtime files Darrick J. Wong
@ 2023-12-31 21:56   ` Darrick J. Wong
  43 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:56 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable reflink for realtime devices, sort of.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_super.c |   19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fbfd2963b2e2c..bf0c0ce9a54b9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1728,14 +1728,27 @@ xfs_fs_fill_super(
 "EXPERIMENTAL realtime allocation group feature in use. Use at your own risk!");
 
 	if (xfs_has_reflink(mp)) {
-		if (mp->m_sb.sb_rblocks) {
+		/*
+		 * Reflink doesn't support rt extent sizes larger than a single
+		 * block because we would have to perform unshare-around for
+		 * rtext-unaligned write requests.
+		 */
+		if (xfs_has_realtime(mp) && mp->m_sb.sb_rextsize != 1) {
 			xfs_alert(mp,
-	"reflink not compatible with realtime device!");
+	"reflink not compatible with realtime extent size %u!",
+					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;
 		}
 
-		if (xfs_globals.always_cow) {
+		/*
+		 * always-cow mode is not supported on filesystems with rt
+		 * extent sizes larger than a single block because we'd have
+		 * to perform write-around for unaligned writes because remap
+		 * requests must be aligned to an rt extent.
+		 */
+		if (xfs_globals.always_cow &&
+		    (!xfs_has_realtime(mp) || mp->m_sb.sb_rextsize == 1)) {
 			xfs_info(mp, "using DEBUG-only always_cow mode.");
 			mp->m_always_cow = true;
 		}


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
@ 2023-12-31 21:56   ` Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1 block Darrick J. Wong
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:56 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it so that filesystems can pass an explicit blocksize to the remap
prep function.  This enables filesystems whose fundamental allocation
units are /not/ the same as the blocksize to ensure that the remapping
checks are aligned properly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/dax.c           |    5 ++++-
 fs/remap_range.c   |   30 ++++++++++++++++++------------
 include/linux/fs.h |    3 ++-
 3 files changed, 24 insertions(+), 14 deletions(-)


diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bbb..06b631eafb46c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -2061,7 +2061,10 @@ int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 			      loff_t *len, unsigned int remap_flags,
 			      const struct iomap_ops *ops)
 {
+	unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize;
+
 	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
-					       pos_out, len, remap_flags, ops);
+					       pos_out, len, remap_flags, ops,
+					       blocksize);
 }
 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index f75ff15c94976..5d5b802f56086 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -30,18 +30,18 @@
  */
 static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
-				loff_t *req_count, unsigned int remap_flags)
+				loff_t *req_count, unsigned int remap_flags,
+				unsigned int blocksize)
 {
 	struct inode *inode_in = file_in->f_mapping->host;
 	struct inode *inode_out = file_out->f_mapping->host;
 	uint64_t count = *req_count;
 	uint64_t bcount;
 	loff_t size_in, size_out;
-	loff_t bs = inode_out->i_sb->s_blocksize;
 	int ret;
 
 	/* The start of both ranges must be aligned to an fs block. */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+	if (!IS_ALIGNED(pos_in, blocksize) || !IS_ALIGNED(pos_out, blocksize))
 		return -EINVAL;
 
 	/* Ensure offsets don't wrap. */
@@ -75,10 +75,10 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 	 */
 	if (pos_in + count == size_in &&
 	    (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
-		bcount = ALIGN(size_in, bs) - pos_in;
+		bcount = ALIGN(size_in, blocksize) - pos_in;
 	} else {
-		if (!IS_ALIGNED(count, bs))
-			count = ALIGN_DOWN(count, bs);
+		if (!IS_ALIGNED(count, blocksize))
+			count = ALIGN_DOWN(count, blocksize);
 		bcount = count;
 	}
 
@@ -128,9 +128,10 @@ static int generic_remap_check_len(struct inode *inode_in,
 				   struct inode *inode_out,
 				   loff_t pos_out,
 				   loff_t *len,
-				   unsigned int remap_flags)
+				   unsigned int remap_flags,
+				   unsigned int blocksize)
 {
-	u64 blkmask = i_blocksize(inode_in) - 1;
+	u64 blkmask = blocksize - 1;
 	loff_t new_len = *len;
 
 	if ((*len & blkmask) == 0)
@@ -271,7 +272,8 @@ int
 __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *len, unsigned int remap_flags,
-				const struct iomap_ops *dax_read_ops)
+				const struct iomap_ops *dax_read_ops,
+				unsigned int blocksize)
 {
 	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);
@@ -306,7 +308,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	/* Check that we don't violate system file offset limits. */
 	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-			remap_flags);
+			remap_flags, blocksize);
 	if (ret || *len == 0)
 		return ret;
 
@@ -347,7 +349,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	}
 
 	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
-			remap_flags);
+			remap_flags, blocksize);
 	if (ret || *len == 0)
 		return ret;
 
@@ -357,13 +359,17 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	return ret;
 }
+EXPORT_SYMBOL(__generic_remap_file_range_prep);
 
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t *len, unsigned int remap_flags)
 {
+	unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize;
+
 	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
-					       pos_out, len, remap_flags, NULL);
+					       pos_out, len, remap_flags, NULL,
+					       blocksize);
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d9badb63bfc29..aeead94a5b167 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2036,7 +2036,8 @@ int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    loff_t *len, unsigned int remap_flags,
-				    const struct iomap_ops *dax_read_ops);
+				    const struct iomap_ops *dax_read_ops,
+				    unsigned int block_size);
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t *count, unsigned int remap_flags);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1 block
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function Darrick J. Wong
@ 2023-12-31 21:56   ` Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt extent before sharing Darrick J. Wong
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:56 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Copy on write encounters a major plot twist when the file being CoW'd
lives on the realtime volume and the realtime extent size is larger than
a single filesystem block.  XFS can only unmap and remap full rt
extents, which means that allocations are always done in units of full
rt extents, and a request to unmap less than one extent is treated as a
request to convert an extent to unwritten status.

This behavioral quirk is not compatible with the existing CoW mechanism,
so we have to intercept every path through which files can be modified
to ensure that we dirty an entire rt extent at once so that we can remap
a full rt extent.  Use the existing VFS unshare functions to dirty the
page cache to set that up.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_file.c    |  224 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_file.h    |    3 +
 fs/xfs/xfs_inode.h   |    6 +
 fs/xfs/xfs_iops.c    |   22 +++++
 fs/xfs/xfs_reflink.c |   39 +++++++++
 fs/xfs/xfs_trace.h   |    1 
 6 files changed, 293 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index fdbeb6c3fbc44..ebdda286cb2a2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -358,6 +358,116 @@ xfs_file_splice_read(
 	return ret;
 }
 
+/*
+ * Decide if this file write requires COWing-around at either end of the write
+ * range.  This is only required if the file allocation unit is larger than
+ * 1FSB and the write range is not aligned with the allocation unit.
+ */
+static bool
+xfs_file_write_needs_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	long long int		count)
+{
+	/*
+	 * No COWing required if this inode doesn't do COW.
+	 *
+	 * If the allocation unit is 1FSB, we do not need to COW around the
+	 * edges of the operation range.  This applies to all files on the data
+	 * device and rt files that have an extent size of 1FSB.
+	 */
+	if (!xfs_inode_needs_cow_around(ip))
+		return false;
+
+	/*
+	 * Otherwise, check that the operation is aligned to the rt extent
+	 * size.  Any unaligned operation /must/ be COWed around since the
+	 * regular reflink code only handles extending writes up to fsblock
+	 * boundaries.
+	 */
+	return !xfs_is_falloc_aligned(ip, pos, count);
+}
+
+/* Do we need to COW-around at this offset to handle a truncate up or down? */
+bool
+xfs_truncate_needs_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos)
+{
+	return xfs_file_write_needs_cow_around(ip, pos, 0);
+}
+
+/* Does this file write require COWing around? */
+static inline bool
+xfs_iocb_needs_cow_around(
+	struct xfs_inode	*ip,
+	const struct kiocb	*iocb,
+	const struct iov_iter	*from)
+{
+	return xfs_file_write_needs_cow_around(ip, iocb->ki_pos,
+			iov_iter_count(from));
+}
+
+/* Unshare the allocation unit mapped to the given file position.  */
+inline int
+xfs_file_unshare_at(
+	struct xfs_inode	*ip,
+	loff_t			pos)
+{
+	loff_t			isize = i_size_read(VFS_I(ip));
+	unsigned int		extsize, len;
+	uint32_t		mod;
+
+	len = extsize = xfs_inode_alloc_unitsize(ip);
+
+	/* Open-coded rounddown_64 so that we can skip out if aligned */
+	div_u64_rem(pos, extsize, &mod);
+	if (mod == 0)
+		return 0;
+	pos -= mod;
+
+	/* Do not extend the file. */
+	if (pos >= isize)
+		return 0;
+	if (pos + len > isize)
+		len = isize - pos;
+
+	trace_xfs_file_cow_around(ip, pos, len);
+
+	if (IS_DAX(VFS_I(ip)))
+		return dax_file_unshare(VFS_I(ip), pos, len,
+				&xfs_dax_write_iomap_ops);
+	return iomap_file_unshare(VFS_I(ip), pos, len,
+			&xfs_buffered_write_iomap_ops);
+}
+
+/*
+ * Dirty the pages on either side of a write request as needed to satisfy
+ * alignment requirements if we're going to perform a copy-write.
+ *
+ * This is only needed for realtime files when the rt extent size is larger
+ * than 1 fs block, because we don't allow a logical rt extent in a file to map
+ * to multiple physical rt extents.  In other words, we can only map and unmap
+ * full rt extents.  Note that page cache doesn't exist above EOF, so be
+ * careful to stay below EOF.
+ */
+static int
+xfs_file_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	long long int		count)
+{
+	int			error;
+
+	/* Unshare at the start of the extent. */
+	error = xfs_file_unshare_at(ip,  pos);
+	if (error)
+		return error;
+
+	/* Unshare at the end. */
+	return xfs_file_unshare_at(ip, pos + count);
+}
+
 /*
  * Common pre-write limit and setup checks.
  *
@@ -397,9 +507,10 @@ xfs_file_write_checks(
 
 	/*
 	 * For changing security info in file_remove_privs() we need i_rwsem
-	 * exclusively.
+	 * exclusively.  We also need it to COW around the range being written.
 	 */
-	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+	if (*iolock == XFS_IOLOCK_SHARED &&
+	    (!IS_NOSEC(inode) || xfs_iocb_needs_cow_around(ip, iocb, from))) {
 		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
 		error = xfs_ilock_iocb(iocb, *iolock);
@@ -410,6 +521,22 @@ xfs_file_write_checks(
 		goto restart;
 	}
 
+	/*
+	 * The write is not aligned to the file's allocation unit.  If either
+	 * of the allocation units at the start or end of the write range are
+	 * shared, unshare them through the page cache.
+	 */
+	if (xfs_iocb_needs_cow_around(ip, iocb, from)) {
+		ASSERT(*iolock == XFS_IOLOCK_EXCL);
+
+		inode_dio_wait(VFS_I(ip));
+		drained_dio = true;
+
+		error = xfs_file_cow_around(ip, iocb->ki_pos, count);
+		if (error)
+			return error;
+	}
+
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -461,6 +588,17 @@ xfs_file_write_checks(
 			goto restart;
 		}
 
+		/*
+		 * If we're starting the write past EOF, COW the allocation
+		 * unit containing the current EOF before we start zeroing the
+		 * range between EOF and the start of the write.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, isize)) {
+			error = xfs_file_unshare_at(ip, isize);
+			if (error)
+				return error;
+		}
+
 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
 		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
 		if (error)
@@ -575,6 +713,16 @@ xfs_file_dio_write_aligned(
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret;
 
+	/*
+	 * If the range to write is not aligned to an allocation unit, we will
+	 * have to COW the allocation units on both ends of the write.  Because
+	 * this runs through the page cache, it requires IOLOCK_EXCL.  This
+	 * predicate performs an unlocked access of the rt and reflink inode
+	 * state.
+	 */
+	if (xfs_iocb_needs_cow_around(ip, iocb, from))
+		iolock = XFS_IOLOCK_EXCL;
+
 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
@@ -927,6 +1075,13 @@ xfs_file_fallocate(
 		goto out_unlock;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* Unshare around the region to punch, if needed. */
+		if (xfs_file_write_needs_cow_around(ip, offset, len)) {
+			error = xfs_file_cow_around(ip, offset, len);
+			if (error)
+				goto out_unlock;
+		}
+
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
@@ -997,6 +1152,13 @@ xfs_file_fallocate(
 
 			trace_xfs_zero_file_space(ip);
 
+			/* Unshare around the region to zero, if needed. */
+			if (xfs_file_write_needs_cow_around(ip, offset, len)) {
+				error = xfs_file_cow_around(ip, offset, len);
+				if (error)
+					goto out_unlock;
+			}
+
 			error = xfs_free_file_space(ip, offset, len);
 			if (error)
 				goto out_unlock;
@@ -1005,6 +1167,26 @@ xfs_file_fallocate(
 			      round_down(offset, blksize);
 			offset = round_down(offset, blksize);
 		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+			/*
+			 * Enlarge the unshare region to align to a full
+			 * allocation unit.
+			 */
+			if (xfs_inode_needs_cow_around(ip)) {
+				loff_t		isize = i_size_read(VFS_I(ip));
+				unsigned int	rextsize;
+				uint32_t	mod;
+
+				rextsize = xfs_inode_alloc_unitsize(ip);
+				div_u64_rem(offset, rextsize, &mod);
+				offset -= mod;
+				len += mod;
+
+				div_u64_rem(offset + len, rextsize, &mod);
+				if (mod)
+					len += rextsize - mod;
+				if (offset + len > isize)
+					len = isize - offset;
+			}
 			error = xfs_reflink_unshare(ip, offset, len);
 			if (error)
 				goto out_unlock;
@@ -1272,6 +1454,34 @@ xfs_dax_fault(
 }
 #endif
 
+static int
+xfs_filemap_fault_around(
+	struct vm_fault		*vmf,
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct folio		*folio = page_folio(vmf->page);
+	loff_t			pos;
+	ssize_t			len;
+
+	if (!xfs_inode_needs_cow_around(ip))
+		return 0;
+
+	folio_lock(folio);
+	len = folio_mkwrite_check_truncate(folio, inode);
+	if (len < 0) {
+		folio_unlock(folio);
+		return len;
+	}
+	pos = folio_pos(folio);
+	folio_unlock(folio);
+
+	if (!xfs_file_write_needs_cow_around(ip, pos, len))
+		return 0;
+
+	return xfs_file_cow_around(XFS_I(inode), pos, len);
+}
+
 /*
  * Locking for serialisation of IO during page faults. This results in a lock
  * ordering of:
@@ -1310,11 +1520,21 @@ __xfs_filemap_fault(
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
 	} else if (write_fault) {
+		/*
+		 * Unshare all the blocks in this rt extent surrounding
+		 * this page.
+		 */
+		int error = xfs_filemap_fault_around(vmf, inode);
+		if (error) {
+			ret = vmf_fs_error(error);
+			goto out_unlock;
+		}
 		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
 	} else {
 		ret = filemap_fault(vmf);
 	}
 
+out_unlock:
 	if (lock_mode)
 		xfs_iunlock(XFS_I(inode), lock_mode);
 
diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h
index 2ad91f755caf3..24490ea49e16c 100644
--- a/fs/xfs/xfs_file.h
+++ b/fs/xfs/xfs_file.h
@@ -12,4 +12,7 @@ extern const struct file_operations xfs_dir_file_operations;
 bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos,
 		long long int len);
 
+bool xfs_truncate_needs_cow_around(struct xfs_inode *ip, loff_t pos);
+int xfs_file_unshare_at(struct xfs_inode *ip, loff_t pos);
+
 #endif /* __XFS_FILE_H__ */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6013a97d02c5d..df8197fe4cb82 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -331,6 +331,12 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
 	return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
 }
 
+/* Decide if we need to unshare the blocks around a range that we're writing. */
+static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip)
+{
+	return xfs_is_cow_inode(ip) && xfs_inode_has_bigallocunit(ip);
+}
+
 /*
  * Return the buftarg used for data allocations on a given inode.
  */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 461e77dd54e38..71a4398fd36ac 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -27,6 +27,7 @@
 #include "xfs_xattr.h"
 #include "xfs_file.h"
 #include "xfs_bmap.h"
+#include "xfs_reflink.h"
 
 #include <linux/posix_acl.h>
 #include <linux/security.h>
@@ -877,10 +878,31 @@ xfs_setattr_size(
 	 * truncate.
 	 */
 	if (newsize > oldsize) {
+		/*
+		 * Extending the file size, so COW around the allocation unit
+		 * containing EOF before we zero the new range of the file.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, oldsize)) {
+			error = xfs_file_unshare_at(ip, oldsize);
+			if (error)
+				return error;
+		}
+
 		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
 		error = xfs_zero_range(ip, oldsize, newsize - oldsize,
 				&did_zeroing);
 	} else {
+		/*
+		 * Truncating the file, so COW around the new EOF allocation
+		 * unit before truncation zeroes the part of the EOF block
+		 * after the new EOF.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, newsize)) {
+			error = xfs_file_unshare_at(ip, newsize);
+			if (error)
+				return error;
+		}
+
 		/*
 		 * iomap won't detect a dirty page over an unwritten block (or a
 		 * cow block over a hole) and subsequently skips zeroing the
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0f3170c11c8b..d5773f9b7ec54 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -34,6 +34,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
 #include "xfs_imeta.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -297,9 +298,26 @@ xfs_reflink_convert_cow_locked(
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	got;
 	struct xfs_btree_cur	*dummy_cur = NULL;
+	struct xfs_mount	*mp = ip->i_mount;
 	int			dummy_logflags;
 	int			error = 0;
 
+	/*
+	 * We can only remap full rt extents, so make sure that we convert the
+	 * entire extent.  The caller must ensure that this is either a direct
+	 * write that's aligned to the rt extent size, or a buffered write for
+	 * which we've dirtied extra pages to make this work properly.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		xfs_fileoff_t	new_off;
+
+		new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb);
+		count_fsb += offset_fsb - new_off;
+		offset_fsb = new_off;
+
+		count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb);
+	}
+
 	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 		return 0;
 
@@ -635,11 +653,21 @@ xfs_reflink_cancel_cow_blocks(
 	bool				cancel_real)
 {
 	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_bmbt_irec		got, del;
 	struct xfs_iext_cursor		icur;
 	bool				isrt = XFS_IS_REALTIME_INODE(ip);
 	int				error = 0;
 
+	/*
+	 * Shrink the range that we're cancelling if they don't align to the
+	 * realtime extent size, since we can only free full extents.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb);
+		end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb);
+	}
+
 	if (!xfs_inode_has_cow_data(ip))
 		return 0;
 	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
@@ -946,6 +974,7 @@ xfs_reflink_end_cow(
 	xfs_off_t			offset,
 	xfs_off_t			count)
 {
+	struct xfs_mount		*mp = ip->i_mount;
 	xfs_fileoff_t			offset_fsb;
 	xfs_fileoff_t			end_fsb;
 	int				error = 0;
@@ -955,6 +984,16 @@ xfs_reflink_end_cow(
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
 
+	/*
+	 * Make sure the end is aligned with a rt extent (if desired), since
+	 * the end of the range could be EOF.  The _convert_cow function should
+	 * have set us up to swap only full rt extents.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb);
+		end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
+	}
+
 	/*
 	 * Walk forwards until we've remapped the I/O range.  The loop function
 	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f94f144f9a39d..643cffaf3add2 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3892,6 +3892,7 @@ TRACE_EVENT(xfs_ioctl_clone,
 
 /* unshare tracepoints */
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 #ifdef CONFIG_XFS_RT
 DEFINE_SIMPLE_IO_EVENT(xfs_convert_bigalloc_file_space);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt extent before sharing
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function Darrick J. Wong
  2023-12-31 21:56   ` [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1 block Darrick J. Wong
@ 2023-12-31 21:56   ` Darrick J. Wong
  2023-12-31 21:57   ` [PATCH 4/9] xfs: add some tracepoints for writeback Darrick J. Wong
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:56 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

As noted in the previous patch, XFS can only unmap and map full rt
extents.  This means that we cannot stop mid-extent for any reason,
including stepping around unwritten/written extents.  Second, the
reflink and CoW mechanisms were not designed to handle shared unwritten
extents, so we have to do something to get rid of them.

If the user asks us to remap two files, we must scan both ranges
beforehand to convert any unwritten extents that are not aligned to rt
extent boundaries into zeroed written extents before sharing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   19 +++++++++++++++++++
 1 file changed, 19 insertions(+)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d5773f9b7ec54..5d68603506f27 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1693,6 +1693,25 @@ xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * Now that we've marked both inodes for reflink, make sure that all
+	 * allocation units (AU) mapped into either files' ranges are either
+	 * wholly written, wholly unwritten, or holes.  The bmap code requires
+	 * that we align all unmap and remap requests to an AU.  We've already
+	 * flushed the page cache and finished directio for the range that's
+	 * being remapped, so we can convert the mappings directly.
+	 */
+	if (xfs_inode_has_bigallocunit(src)) {
+		ret = xfs_convert_bigalloc_file_space(src, pos_in, *len);
+		if (ret)
+			goto out_unlock;
+	}
+	if (xfs_inode_has_bigallocunit(dest)) {
+		ret = xfs_convert_bigalloc_file_space(dest, pos_out, *len);
+		if (ret)
+			goto out_unlock;
+	}
+
 	/*
 	 * If pos_out > EOF, we may have dirtied blocks between EOF and
 	 * pos_out. In that case, we need to extend the flush and unmap to cover


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/9] xfs: add some tracepoints for writeback
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:56   ` [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt extent before sharing Darrick J. Wong
@ 2023-12-31 21:57   ` Darrick J. Wong
  2023-12-31 21:57   ` [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly Darrick J. Wong
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:57 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a tracepoint so I can see where writeback is initiated.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_aops.c  |   19 ++++++++++++-------
 fs/xfs/xfs_trace.h |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3001ddf48d6c6..1217ce197ad98 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -497,10 +497,11 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
 
 STATIC int
 xfs_vm_writepages(
-	struct address_space	*mapping,
-	struct writeback_control *wbc)
+	struct address_space		*mapping,
+	struct writeback_control	*wbc)
 {
-	struct xfs_writepage_ctx wpc = { };
+	struct xfs_writepage_ctx	wpc = { };
+	struct xfs_inode		*ip = XFS_I(mapping->host);
 
 	/*
 	 * Writing back data in a transaction context can result in recursive
@@ -509,16 +510,20 @@ xfs_vm_writepages(
 	if (WARN_ON_ONCE(current->journal_info))
 		return 0;
 
-	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	trace_xfs_vm_writepages(ip, wbc);
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
 STATIC int
 xfs_dax_writepages(
-	struct address_space	*mapping,
-	struct writeback_control *wbc)
+	struct address_space		*mapping,
+	struct writeback_control	*wbc)
 {
-	struct xfs_inode	*ip = XFS_I(mapping->host);
+	struct xfs_inode		*ip = XFS_I(mapping->host);
+
+	trace_xfs_dax_writepages(ip, wbc);
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return dax_writeback_mapping_range(mapping,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 643cffaf3add2..39df20ae702c8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1552,6 +1552,40 @@ DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IMAP_EVENT(xfs_iomap_found);
 
+DECLARE_EVENT_CLASS(xfs_writeback_class,
+	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc),
+	TP_ARGS(ip, wbc),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, range_start)
+		__field(loff_t, range_end)
+		__field(long, nr_to_write)
+		__field(enum writeback_sync_modes, sync_mode)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->range_start = wbc->range_start;
+		__entry->range_end = wbc->range_end;
+		__entry->nr_to_write = wbc->nr_to_write;
+		__entry->sync_mode = wbc->sync_mode;
+	),
+	TP_printk("dev %d:%d ino 0x%llx range_start 0x%llx range_end 0x%llx nr_to_write %ld sync_mode %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->range_start,
+		   __entry->range_end,
+		   __entry->nr_to_write,
+		   __entry->sync_mode)
+);
+#define DEFINE_WRITEBACK_EVENT(name)	\
+DEFINE_EVENT(xfs_writeback_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \
+	TP_ARGS(ip, wbc))
+DEFINE_WRITEBACK_EVENT(xfs_vm_writepages);
+DEFINE_WRITEBACK_EVENT(xfs_dax_writepages);
+
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count),
 	TP_ARGS(ip, offset, count),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:57   ` [PATCH 4/9] xfs: add some tracepoints for writeback Darrick J. Wong
@ 2023-12-31 21:57   ` Darrick J. Wong
  2023-12-31 21:57   ` [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:57 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If we have shared realtime files and the rt extent size is larger than a
single fs block, we need to extend writeback requests to be aligned to
rt extent size granularity because we cannot share partial rt extents.
The front end should have set us up for this by dirtying the relevant
ranges.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_aops.c  |   38 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h |    1 +
 2 files changed, 39 insertions(+)


diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1217ce197ad98..b6ef76ee65f5e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,6 +495,38 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
 	.discard_folio		= xfs_discard_folio,
 };
 
+/*
+ * Extend the writeback range to allocation unit granularity and alignment.
+ * This is a requirement for blocksize > pagesize scenarios such as realtime
+ * copy on write, since we can only share full rt extents.
+ */
+static inline void
+xfs_vm_writepages_extend(
+	struct xfs_inode		*ip,
+	struct writeback_control	*wbc)
+{
+	unsigned int			bsize = xfs_inode_alloc_unitsize(ip);
+	long long int			pages_to_write;
+	loff_t				next = wbc->range_end + 1;
+
+	wbc->range_start = rounddown_64(wbc->range_start, bsize);
+	if (wbc->range_end != LLONG_MAX)
+		wbc->range_end = roundup_64(next, bsize) - 1;
+
+	if (wbc->nr_to_write != LONG_MAX) {
+		pgoff_t		pg_start = wbc->range_start >> PAGE_SHIFT;
+		pgoff_t		pg_next = (wbc->range_end + 1) >> PAGE_SHIFT;
+
+		pages_to_write = pg_next - pg_start;
+		if (pages_to_write >= LONG_MAX)
+			pages_to_write = LONG_MAX;
+		if (wbc->nr_to_write < pages_to_write)
+			wbc->nr_to_write = pages_to_write;
+	}
+
+	trace_xfs_vm_writepages_extend(ip, wbc);
+}
+
 STATIC int
 xfs_vm_writepages(
 	struct address_space		*mapping,
@@ -512,6 +544,9 @@ xfs_vm_writepages(
 
 	trace_xfs_vm_writepages(ip, wbc);
 
+	if (xfs_inode_needs_cow_around(ip))
+		xfs_vm_writepages_extend(ip, wbc);
+
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
@@ -525,6 +560,9 @@ xfs_dax_writepages(
 
 	trace_xfs_dax_writepages(ip, wbc);
 
+	if (xfs_inode_needs_cow_around(ip))
+		xfs_vm_writepages_extend(ip, wbc);
+
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return dax_writeback_mapping_range(mapping,
 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 39df20ae702c8..4767fc49c4641 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1584,6 +1584,7 @@ DEFINE_EVENT(xfs_writeback_class, name,	\
 	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \
 	TP_ARGS(ip, wbc))
 DEFINE_WRITEBACK_EVENT(xfs_vm_writepages);
+DEFINE_WRITEBACK_EVENT(xfs_vm_writepages_extend);
 DEFINE_WRITEBACK_EVENT(xfs_dax_writepages);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:57   ` [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly Darrick J. Wong
@ 2023-12-31 21:57   ` Darrick J. Wong
  2023-12-31 21:57   ` [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block Darrick J. Wong
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:57 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

CoW extent size hints are not allowed on filesystems that have large
realtime extents because we only want to perform the minimum required
amount of write-around (aka write amplification) for shared extents.

On filesystems where rtextsize > 1, allocations can only be done in
units of full rt extents, which means that we can only map an entire rt
extent's worth of blocks into the data fork.  Hole punch requests become
conversions to unwritten if the request isn't aligned properly.

Because a copy-write fundamentally requires remapping, this means that
we also can only do copy-writes of a full rt extent.  This is too
expensive for large hint sizes, since it's all or nothing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 41354bdbbc90f..c6ab6a38bc7cd 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6408,6 +6408,28 @@ xfs_get_cowextsz_hint(
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
 	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * For realtime files, the realtime extent is the fundamental
+		 * unit of allocation.  This means that data sharing and CoW
+		 * remapping can only be done in those units.  For filesystems
+		 * where the extent size is larger than one block, write
+		 * requests that are not aligned to an extent boundary employ
+		 * an unshare-around strategy to ensure that all pages for a
+		 * shared extent are fully dirtied.
+		 *
+		 * Because the remapping alignment requirement applies equally
+		 * to all CoW writes, any regular overwrites that could be
+		 * turned (by a speculative CoW preallocation) into a CoW write
+		 * must either employ this dirty-around strategy, or be smart
+		 * enough to ignore the CoW fork mapping unless the entire
+		 * extent is dirty or becomes shared by writeback time.  Doing
+		 * the first would dramatically increase write amplification,
+		 * and the second would require deeper insight into the state
+		 * of the page cache during a writeback request.  For now, we
+		 * ignore the hint.
+		 */
+		if (ip->i_mount->m_sb.sb_rextsize > 1)
+			return ip->i_mount->m_sb.sb_rextsize;
 		b = 0;
 		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
 			b = ip->i_extsize;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 21:57   ` [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
@ 2023-12-31 21:57   ` Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 8/9] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 9/9] xfs: support realtime reflink with an extent size that isn't a power of 2 Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:57 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the necessary tweaks to the reflink remapping code to support
remapping on the realtime volume when the rt extent size is larger than
a single rt block.  We need to check that the remap arguments from
userspace are aligned to a rt extent boundary, and that the length
is always aligned, even if the kernel tried to round it up to EOF for
us.  XFS can only map and remap full rt extents, so we have to be a
little more strict about the alignment there.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_rtalloc.c |    2 +
 fs/xfs/xfs_super.c   |   19 +++++++++---
 fs/xfs/xfs_trace.h   |    3 ++
 4 files changed, 93 insertions(+), 12 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5d68603506f27..d516f3a35df36 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1534,6 +1534,13 @@ xfs_reflink_remap_blocks(
 	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
 			XFS_MAX_FILEOFF);
 
+	/*
+	 * Make sure the end is aligned with an allocation unit, even if it's
+	 * past EOF.
+	 */
+	if (xfs_inode_has_bigallocunit(dest))
+		len = xfs_rtb_roundup_rtx(mp, len);
+
 	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
 
 	while (len > 0) {
@@ -1607,6 +1614,57 @@ xfs_reflink_zero_posteof(
 	return xfs_zero_range(ip, isize, pos - isize, NULL);
 }
 
+#ifdef CONFIG_XFS_RT
+/*
+ * Adjust the length of the remap operation to end on an allocation unit (AU)
+ * boundary.
+ */
+STATIC int
+xfs_reflink_adjust_bigalloc_len(
+	struct xfs_inode	*src,
+	loff_t			pos_in,
+	struct xfs_inode	*dest,
+	loff_t			pos_out,
+	loff_t			*len,
+	unsigned int		remap_flags)
+{
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(src);
+	uint32_t		mod;
+
+	div_u64_rem(*len, alloc_unit, &mod);
+
+	/*
+	 * We previously checked the AU alignment of both offsets, so we now
+	 * have to check the AU alignment of the length.  The VFS remap prep
+	 * function can change the length on us, so we can only make length
+	 * adjustments after that.  If the length is aligned to an AU, we're
+	 * good to go.
+	 *
+	 * Otherwise, the length is not aligned to an AU.  If the source file's
+	 * range ends at EOF, the VFS ensured that the dest file's range also
+	 * ends at EOF.  The actual remap function will round the (byte) length
+	 * up to the nearest AU, so we're ok here too.
+	 */
+	if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src)))
+		return 0;
+
+	/*
+	 * Otherwise, the only thing we can do is round the request length down
+	 * to an AU boundary.  If the caller doesn't allow that, we cannot move
+	 * forward.
+	 */
+	if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
+		return -EINVAL;
+
+	/* Back off by a single extent. */
+	(*len) -= mod;
+	trace_xfs_reflink_adjust_bigalloc_len(src, pos_in, *len, dest, pos_out);
+	return 0;
+}
+#else
+# define xfs_reflink_adjust_bigalloc_len(...)		(0)
+#endif /* CONFIG_XFS_RT */
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file will
@@ -1649,6 +1707,7 @@ xfs_reflink_remap_prep(
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
+	const struct iomap_ops	*dax_read_ops = NULL;
 	int			ret;
 
 	/* Lock both files against IO */
@@ -1666,15 +1725,25 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
-	if (!IS_DAX(inode_in))
-		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
-				pos_out, len, remap_flags);
-	else
-		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
-				pos_out, len, remap_flags, &xfs_read_iomap_ops);
+	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest)));
+
+	if (IS_DAX(inode_in))
+		dax_read_ops = &xfs_read_iomap_ops;
+
+	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
+			pos_out, len, remap_flags, dax_read_ops,
+			xfs_inode_alloc_unitsize(dest));
 	if (ret || *len == 0)
 		goto out_unlock;
 
+	/* Adjust the end to align to an allocation unit. */
+	if (xfs_inode_has_bigallocunit(src)) {
+		ret = xfs_reflink_adjust_bigalloc_len(src, pos_in, dest,
+				pos_out, len, remap_flags);
+		if (ret || *len == 0)
+			goto out_unlock;
+	}
+
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
 	if (ret)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 11b6645a5a534..c617c326125b3 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1253,7 +1253,7 @@ xfs_growfs_rt(
 		return -EOPNOTSUPP;
 	if (xfs_has_quota(mp))
 		return -EOPNOTSUPP;
-	if (xfs_has_reflink(mp) && in->extsize != 1)
+	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bf0c0ce9a54b9..c17e1d06820d1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1729,13 +1729,22 @@ xfs_fs_fill_super(
 
 	if (xfs_has_reflink(mp)) {
 		/*
-		 * Reflink doesn't support rt extent sizes larger than a single
-		 * block because we would have to perform unshare-around for
-		 * rtext-unaligned write requests.
+		 * Reflink doesn't support pagecache pages that span multiple
+		 * realtime extents because iomap doesn't track subpage dirty
+		 * state.  This means that we cannot dirty all the pages
+		 * backing an rt extent without dirtying the adjoining rt
+		 * extents.  If those rt extents are shared and extend into
+		 * other pages, this leads to crazy write amplification.  The
+		 * VFS remap_range checks assume power-of-two block sizes.
+		 *
+		 * Hence we only support rt extent sizes that are an integer
+		 * power of two because we know those will align with the page
+		 * size.
 		 */
-		if (xfs_has_realtime(mp) && mp->m_sb.sb_rextsize != 1) {
+		if (xfs_has_realtime(mp) &&
+		    !is_power_of_2(mp->m_sb.sb_rextsize)) {
 			xfs_alert(mp,
-	"reflink not compatible with realtime extent size %u!",
+	"reflink not compatible with non-power-of-2 realtime extent size %u!",
 					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4767fc49c4641..906e35eef223d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3887,6 +3887,9 @@ TRACE_EVENT(xfs_reflink_remap_blocks,
 		  __entry->dest_lblk)
 );
 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+#ifdef CONFIG_XFS_RT
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_adjust_bigalloc_len);
+#endif /* CONFIG_XFS_RT */
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/9] xfs: fix integer overflow when validating extent size hints
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 21:57   ` [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block Darrick J. Wong
@ 2023-12-31 21:58   ` Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 9/9] xfs: support realtime reflink with an extent size that isn't a power of 2 Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:58 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Both file extent size hints are stored as 32-bit quantities, in units of
filesystem blocks.  As part of validating the hints, we convert these
quantities to bytes to ensure that the hint is congruent with the file's
allocation size.

The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN).
If the file allocation unit is larger than 2048, the unit conversion
will exceed 32 bits in size, which overflows the uint32_t used to store
the value used in the comparison.  This isn't a problem for files on the
data device since the hint will always be a multiple of the block size.
However, this is a problem for realtime files because the rtextent size
can be any integer number of fs blocks, and truncation of upper bits
changes the outcome of division.

Eliminate the overflow by performing the congruency check in units of
blocks, not bytes.  Otherwise, we get errors like this:

$ truncate -s 500T /tmp/a
$ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k
illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |   20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 81a12ca8ec434..adc457da52ef0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -770,13 +770,11 @@ xfs_inode_validate_extsize(
 	bool				rt_flag;
 	bool				hint_flag;
 	bool				inherit_flag;
-	uint32_t			extsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
 	inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
-	extsize_bytes = XFS_FSB_TO_B(mp, extsize);
 
 	/*
 	 * This comment describes a historic gap in this verifier function.
@@ -805,9 +803,7 @@ xfs_inode_validate_extsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
 		return __this_address;
@@ -825,7 +821,7 @@ xfs_inode_validate_extsize(
 	if (mode && !(hint_flag || inherit_flag) && extsize != 0)
 		return __this_address;
 
-	if (extsize_bytes % blocksize_bytes)
+	if (extsize % alloc_unit)
 		return __this_address;
 
 	if (extsize > XFS_MAX_BMBT_EXTLEN)
@@ -860,12 +856,10 @@ xfs_inode_validate_cowextsize(
 {
 	bool				rt_flag;
 	bool				hint_flag;
-	uint32_t			cowextsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
-	cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
 
 	/*
 	 * Similar to extent size hints, a directory can be configured to
@@ -880,9 +874,7 @@ xfs_inode_validate_cowextsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if (hint_flag && !xfs_has_reflink(mp))
 		return __this_address;
@@ -897,7 +889,7 @@ xfs_inode_validate_cowextsize(
 	if (mode && !hint_flag && cowextsize != 0)
 		return __this_address;
 
-	if (cowextsize_bytes % blocksize_bytes)
+	if (cowextsize % alloc_unit)
 		return __this_address;
 
 	if (cowextsize > XFS_MAX_BMBT_EXTLEN)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 9/9] xfs: support realtime reflink with an extent size that isn't a power of 2
  2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 21:58   ` [PATCH 8/9] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
@ 2023-12-31 21:58   ` Darrick J. Wong
  8 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:58 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add the necessary alignment checking code to the reflink remap code to
ensure that remap requests are aligned to rt extent boundaries if the
realtime extent size isn't a power of two.  The VFS helpers assume that
they can use the usual (blocksize - 1) masking to avoid slow 64-bit
division, but since XFS is special we won't make everyone pay that cost
for our weird edge case.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_rtalloc.c |    3 +-
 fs/xfs/xfs_super.c   |   12 +++----
 3 files changed, 97 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d516f3a35df36..0c54522404963 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1665,6 +1665,83 @@ xfs_reflink_adjust_bigalloc_len(
 # define xfs_reflink_adjust_bigalloc_len(...)		(0)
 #endif /* CONFIG_XFS_RT */
 
+/*
+ * Check the alignment of a remap request when the allocation unit size isn't a
+ * power of two.  The VFS helpers use (fast) bitmask-based alignment checks,
+ * but here we have to use slow long division.
+ */
+static int
+xfs_reflink_remap_check_rtalign(
+	struct xfs_inode		*ip_in,
+	loff_t				pos_in,
+	struct xfs_inode		*ip_out,
+	loff_t				pos_out,
+	loff_t				*req_len,
+	unsigned int			remap_flags)
+{
+	struct xfs_mount		*mp = ip_in->i_mount;
+	uint32_t			rextbytes;
+	loff_t				in_size, out_size;
+	loff_t				new_length, length = *req_len;
+	loff_t				blen;
+
+	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+	in_size = i_size_read(VFS_I(ip_in));
+	out_size = i_size_read(VFS_I(ip_out));
+
+	/* The start of both ranges must be aligned to a rt extent. */
+	if (!isaligned_64(pos_in, rextbytes) ||
+	    !isaligned_64(pos_out, rextbytes))
+		return -EINVAL;
+
+	if (length == 0)
+		length = in_size - pos_in;
+
+	/*
+	 * If the user wanted us to exchange up to the infile's EOF, round up
+	 * to the next block boundary for this check.
+	 *
+	 * Otherwise, reject the range length if it's not extent aligned.  We
+	 * already confirmed the starting offsets' extent alignment.
+	 */
+	if (pos_in + length == in_size)
+		blen = roundup_64(in_size, rextbytes) - pos_in;
+	else
+		blen = rounddown_64(length, rextbytes);
+
+	/* Don't allow overlapped remappings within the same file. */
+	if (ip_in == ip_out &&
+	    pos_out + blen > pos_in &&
+	    pos_in + blen > pos_out)
+		return -EINVAL;
+
+	/*
+	 * Ensure that we don't exchange a partial EOF extent into the middle
+	 * of another file.
+	 */
+	if (isaligned_64(length, rextbytes))
+		return 0;
+
+	new_length = length;
+	if (pos_out + length < out_size)
+		new_length = rounddown_64(new_length, rextbytes);
+
+	if (new_length == length)
+		return 0;
+
+	/*
+	 * Return the shortened request if the caller permits it.  If the
+	 * request was shortened to zero rt extents, we know that the original
+	 * arguments weren't valid in the first place.
+	 */
+	if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) {
+		*req_len = new_length;
+		return 0;
+	}
+
+	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+}
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file will
@@ -1708,6 +1785,7 @@ xfs_reflink_remap_prep(
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
 	const struct iomap_ops	*dax_read_ops = NULL;
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(dest);
 	int			ret;
 
 	/* Lock both files against IO */
@@ -1725,14 +1803,22 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
-	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest)));
+	/* Check non-power of two alignment issues, if necessary. */
+	if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
+		ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,
+				pos_out, len, remap_flags);
+		if (ret)
+			goto out_unlock;
+
+		/* Do the VFS checks with the regular block alignment. */
+		alloc_unit = src->i_mount->m_sb.sb_blocksize;
+	}
 
 	if (IS_DAX(inode_in))
 		dax_read_ops = &xfs_read_iomap_ops;
 
 	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
-			pos_out, len, remap_flags, dax_read_ops,
-			xfs_inode_alloc_unitsize(dest));
+			pos_out, len, remap_flags, dax_read_ops, alloc_unit);
 	if (ret || *len == 0)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c617c326125b3..7917eaef911f6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1253,7 +1253,8 @@ xfs_growfs_rt(
 		return -EOPNOTSUPP;
 	if (xfs_has_quota(mp))
 		return -EOPNOTSUPP;
-	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize))
+	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize) &&
+	    (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c17e1d06820d1..b5291b0ea21d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1734,17 +1734,17 @@ xfs_fs_fill_super(
 		 * state.  This means that we cannot dirty all the pages
 		 * backing an rt extent without dirtying the adjoining rt
 		 * extents.  If those rt extents are shared and extend into
-		 * other pages, this leads to crazy write amplification.  The
-		 * VFS remap_range checks assume power-of-two block sizes.
+		 * other pages, this leads to crazy write amplification.
 		 *
 		 * Hence we only support rt extent sizes that are an integer
-		 * power of two because we know those will align with the page
-		 * size.
+		 * power of two or an integer multiple of the page size because
+		 * we know those will align with the page size.
 		 */
 		if (xfs_has_realtime(mp) &&
-		    !is_power_of_2(mp->m_sb.sb_rextsize)) {
+		    !is_power_of_2(mp->m_sb.sb_rextsize) &&
+		    (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK)) {
 			xfs_alert(mp,
-	"reflink not compatible with non-power-of-2 realtime extent size %u!",
+	"reflink not compatible with realtime extent size %u!",
 					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/6] xfs: attach dquots to rt metadata files when starting quota
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
@ 2023-12-31 21:58   ` Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 2/6] xfs: fix chown with rt quota Darrick J. Wong
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:58 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Attach dquots to the realtime metadata files when starting up quotas,
since the resources used by them are charged to the root dquot.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_mount.c   |    4 +++-
 fs/xfs/xfs_qm.c      |   20 +++++++++++++++++---
 fs/xfs/xfs_qm_bhv.c  |    2 +-
 fs/xfs/xfs_quota.h   |    4 ++--
 fs/xfs/xfs_rtalloc.c |   22 ++++++++++++++++++++++
 fs/xfs/xfs_rtalloc.h |    3 +++
 6 files changed, 48 insertions(+), 7 deletions(-)


diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b20e0e6410512..879945c305da4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1024,7 +1024,9 @@ xfs_mountfs(
 		ASSERT(mp->m_qflags == 0);
 		mp->m_qflags = quotaflags;
 
-		xfs_qm_mount_quotas(mp);
+		error = xfs_qm_mount_quotas(mp);
+		if (error)
+			goto out_rtunmount;
 	}
 
 	/*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index a6b5193190c4c..2771aef361a25 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -30,6 +30,7 @@
 #include "xfs_imeta.h"
 #include "xfs_imeta_utils.h"
 #include "xfs_da_format.h"
+#include "xfs_rtalloc.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -1525,7 +1526,7 @@ xfs_qm_quotacheck(
  * If we fail here, the mount will continue with quota turned off. We don't
  * need to inidicate success or failure at all.
  */
-void
+int
 xfs_qm_mount_quotas(
 	struct xfs_mount	*mp)
 {
@@ -1564,7 +1565,7 @@ xfs_qm_mount_quotas(
 		error = xfs_qm_quotacheck(mp);
 		if (error) {
 			/* Quotacheck failed and disabled quotas. */
-			return;
+			return 0;
 		}
 	}
 	/*
@@ -1605,8 +1606,21 @@ xfs_qm_mount_quotas(
 
 	if (error) {
 		xfs_warn(mp, "Failed to initialize disk quotas.");
-		return;
+		return 0;
 	}
+
+	/*
+	 * Attach dquots to realtime metadata files before we do anything that
+	 * could alter the resource usage of rt metadata (log recovery, normal
+	 * operation, etc).
+	 */
+	error = xfs_rtmount_dqattach(mp);
+	if (error) {
+		xfs_qm_unmount_quotas(mp);
+		return error;
+	}
+
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 271c1021c7335..df569a839d3f9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -119,7 +119,7 @@ xfs_qm_newmount(
 			 * mounting, and get on with the boring life
 			 * without disk quotas.
 			 */
-			xfs_qm_mount_quotas(mp);
+			return xfs_qm_mount_quotas(mp);
 		} else {
 			/*
 			 * Clear the quota flags, but remember them. This
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 165013f03db9e..b0cdbe1f1f275 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -125,7 +125,7 @@ extern void xfs_qm_dqdetach(struct xfs_inode *);
 extern void xfs_qm_dqrele(struct xfs_dquot *);
 extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
 extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
-extern void xfs_qm_mount_quotas(struct xfs_mount *);
+int xfs_qm_mount_quotas(struct xfs_mount *mp);
 extern void xfs_qm_unmount(struct xfs_mount *);
 extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
@@ -206,7 +206,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
 #define xfs_qm_dqrele(d)			do { (d) = (d); } while(0)
 #define xfs_qm_statvfs(ip, s)			do { } while(0)
 #define xfs_qm_newmount(mp, a, b)					(0)
-#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_mount_quotas(mp)						(0)
 #define xfs_qm_unmount(mp)
 #define xfs_qm_unmount_quotas(mp)
 #define xfs_inode_near_dquot_enforcement(ip, type)			(false)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 7917eaef911f6..cc83651636ecb 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -32,6 +32,7 @@
 #include "xfs_rtrmap_btree.h"
 #include "xfs_trace.h"
 #include "xfs_rtrefcount_btree.h"
+#include "xfs_quota.h"
 
 /*
  * Realtime metadata files are not quite regular files because userspace can't
@@ -2039,6 +2040,27 @@ xfs_rtmount_inodes(
 	return error;
 }
 
+/*
+ * Attach dquots for realtime metadata files.  Prior to the introduction of the
+ * metadata directory tree, the rtbitmap and rtsummary inodes were counted in
+ * the root dquot icount, so we must dqattach them to maintain correct counts.
+ */
+int
+xfs_rtmount_dqattach(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	if (xfs_has_metadir(mp))
+		return 0;
+
+	error = xfs_qm_dqattach(mp->m_rbmip);
+	if (error)
+		return error;
+
+	return xfs_qm_dqattach(mp->m_rsumip);
+}
+
 void
 xfs_rtunmount_inodes(
 	struct xfs_mount	*mp)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8a7b6cfa13cf0..a03796ea90eaf 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -46,6 +46,8 @@ void
 xfs_rtunmount_inodes(
 	struct xfs_mount	*mp);
 
+int xfs_rtmount_dqattach(struct xfs_mount *mp);
+
 /*
  * Get the bitmap and summary inodes into the mount structure
  * at mount time.
@@ -106,6 +108,7 @@ xfs_rtmount_init(
 # define xfs_rt_resv_free(mp)				((void)0)
 # define xfs_rt_resv_init(mp)				(0)
 # define xfs_growfs_check_rtgeom(mp, d, r, rs, rx, rb, rl)	(0)
+# define xfs_rtmount_dqattach(mp)			(0)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __XFS_RTALLOC_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/6] xfs: fix chown with rt quota
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 1/6] xfs: attach dquots to rt metadata files when starting quota Darrick J. Wong
@ 2023-12-31 21:58   ` Darrick J. Wong
  2023-12-31 21:59   ` [PATCH 3/6] xfs: fix rt growfs quota accounting Darrick J. Wong
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:58 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make chown's quota adjustments work with realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_qm.c    |   44 +++++++++++++++++++++++++++-----------------
 fs/xfs/xfs_trans.c |   31 +++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 19 deletions(-)


diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2771aef361a25..b89f0e8f80f9f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1221,8 +1221,8 @@ xfs_qm_dqusage_adjust(
 	void			*data)
 {
 	struct xfs_inode	*ip;
-	xfs_qcnt_t		nblks;
-	xfs_filblks_t		rtblks = 0;	/* total rt blks */
+	xfs_filblks_t		nblks, rtblks;
+	unsigned int		lock_mode;
 	int			error;
 
 	ASSERT(XFS_IS_QUOTA_ON(mp));
@@ -1263,18 +1263,17 @@ xfs_qm_dqusage_adjust(
 
 	ASSERT(ip->i_delayed_blks == 0);
 
+	lock_mode = xfs_ilock_data_map_shared(ip);
 	if (XFS_IS_REALTIME_INODE(ip)) {
-		struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
-
 		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
-		if (error)
+		if (error) {
+			xfs_iunlock(ip, lock_mode);
 			goto error0;
-
-		xfs_bmap_count_leaves(ifp, &rtblks);
+		}
 	}
-
-	nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+	xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
 	xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
+	xfs_iunlock(ip, lock_mode);
 
 	/*
 	 * Add the (disk blocks and inode) resources occupied by this
@@ -1921,9 +1920,8 @@ xfs_qm_vop_chown(
 	struct xfs_dquot	*newdq)
 {
 	struct xfs_dquot	*prevdq;
-	uint		bfield = XFS_IS_REALTIME_INODE(ip) ?
-				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
-
+	xfs_filblks_t		dblocks, rblocks;
+	bool			isrt = XFS_IS_REALTIME_INODE(ip);
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
@@ -1934,11 +1932,17 @@ xfs_qm_vop_chown(
 	ASSERT(prevdq);
 	ASSERT(prevdq != newdq);
 
-	xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+	xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
+
+	xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_BCOUNT,
+			-(xfs_qcnt_t)dblocks);
+	xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_RTBCOUNT,
+			-(xfs_qcnt_t)rblocks);
 	xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
 
 	/* the sparkling new dquot */
-	xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+	xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_BCOUNT, dblocks);
+	xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_RTBCOUNT, rblocks);
 	xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
 
 	/*
@@ -1948,7 +1952,8 @@ xfs_qm_vop_chown(
 	 * (having already bumped up the real counter) so that we don't have
 	 * any reservation to give back when we commit.
 	 */
-	xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+	xfs_trans_mod_dquot(tp, newdq,
+			isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS,
 			-ip->i_delayed_blks);
 
 	/*
@@ -1960,8 +1965,13 @@ xfs_qm_vop_chown(
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	xfs_dqlock(prevdq);
-	ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
-	prevdq->q_blk.reserved -= ip->i_delayed_blks;
+	if (isrt) {
+		ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks);
+		prevdq->q_rtb.reserved -= ip->i_delayed_blks;
+	} else {
+		ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+		prevdq->q_blk.reserved -= ip->i_delayed_blks;
+	}
 	xfs_dqunlock(prevdq);
 
 	/*
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d4952be8f2498..008380482777b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1420,11 +1420,26 @@ xfs_trans_alloc_ichange(
 	gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
 	pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
 	if (udqp || gdqp || pdqp) {
+		xfs_filblks_t	dblocks, rblocks;
 		unsigned int	qflags = XFS_QMOPT_RES_REGBLKS;
+		bool		isrt = XFS_IS_REALTIME_INODE(ip);
 
 		if (force)
 			qflags |= XFS_QMOPT_FORCE_RES;
 
+		if (isrt) {
+			error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+			if (error)
+				goto out_cancel;
+		}
+
+		xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
+
+		if (isrt)
+			rblocks += ip->i_delayed_blks;
+		else
+			dblocks += ip->i_delayed_blks;
+
 		/*
 		 * Reserve enough quota to handle blocks on disk and reserved
 		 * for a delayed allocation.  We'll actually transfer the
@@ -1432,8 +1447,20 @@ xfs_trans_alloc_ichange(
 		 * though that part is only semi-transactional.
 		 */
 		error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
-				pdqp, ip->i_nblocks + ip->i_delayed_blks,
-				1, qflags);
+				pdqp, dblocks, 1, qflags);
+		if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+			xfs_trans_cancel(tp);
+			xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+			retried = true;
+			goto retry;
+		}
+		if (error)
+			goto out_cancel;
+
+		/* Do the same for realtime. */
+		qflags = XFS_QMOPT_RES_RTBLKS | (qflags & XFS_QMOPT_FORCE_RES);
+		error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+				pdqp, rblocks, 0, qflags);
 		if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
 			xfs_trans_cancel(tp);
 			xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/6] xfs: fix rt growfs quota accounting
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 1/6] xfs: attach dquots to rt metadata files when starting quota Darrick J. Wong
  2023-12-31 21:58   ` [PATCH 2/6] xfs: fix chown with rt quota Darrick J. Wong
@ 2023-12-31 21:59   ` Darrick J. Wong
  2023-12-31 21:59   ` [PATCH 4/6] xfs: advertise realtime quota support in the xqm stat files Darrick J. Wong
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:59 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When growing the realtime bitmap or summary inodes, use
xfs_trans_alloc_inode to reserve quota for the blocks that could be
allocated to the file.  Although we never enforce limits against the
root dquot, making a reservation means that the bmap code will update
the quota block count, which is necessary for correct accounting.

Found by running xfs/521.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_rtalloc.c |   24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)


diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index cc83651636ecb..0af834897fad4 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -842,15 +842,10 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Reserve space & log for one extent added to the file.
 		 */
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
-				0, 0, &tp);
+		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_growrtalloc,
+				resblks, 0, false, &tp);
 		if (error)
 			return error;
-		/*
-		 * Lock the inode.
-		 */
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 		error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
 				XFS_IEXT_ADD_NOSPLIT_CNT);
@@ -874,6 +869,7 @@ xfs_growfs_rt_alloc(
 		 * Free any blocks freed up in the transaction, then commit.
 		 */
 		error = xfs_trans_commit(tp);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
 			return error;
 		/*
@@ -886,15 +882,11 @@ xfs_growfs_rt_alloc(
 			/*
 			 * Reserve log for one block zeroing.
 			 */
-			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
-					0, 0, 0, &tp);
+			error = xfs_trans_alloc_inode(ip,
+					&M_RES(mp)->tr_growrtzero, 0, 0, false,
+					&tp);
 			if (error)
 				return error;
-			/*
-			 * Lock the bitmap inode.
-			 */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 			error = xfs_growfs_init_rtbuf(tp, ip, fsbno, buf_type);
 			if (error)
@@ -904,6 +896,7 @@ xfs_growfs_rt_alloc(
 			 * Commit the transaction.
 			 */
 			error = xfs_trans_commit(tp);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			if (error)
 				return error;
 		}
@@ -917,6 +910,7 @@ xfs_growfs_rt_alloc(
 
 out_trans_cancel:
 	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -1252,8 +1246,6 @@ xfs_growfs_rt(
 	/* Unsupported realtime features. */
 	if (!xfs_has_rtgroups(mp) && (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)))
 		return -EOPNOTSUPP;
-	if (xfs_has_quota(mp))
-		return -EOPNOTSUPP;
 	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize) &&
 	    (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK))
 		return -EOPNOTSUPP;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/6] xfs: advertise realtime quota support in the xqm stat files
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 21:59   ` [PATCH 3/6] xfs: fix rt growfs quota accounting Darrick J. Wong
@ 2023-12-31 21:59   ` Darrick J. Wong
  2023-12-31 21:59   ` [PATCH 5/6] xfs: report realtime block quota limits on realtime directories Darrick J. Wong
  2023-12-31 21:59   ` [PATCH 6/6] xfs: enable realtime quota again Darrick J. Wong
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:59 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a fifth column to this (really old) stat file to advertise that the
kernel supports quota for realtime volumes.  This will be used by
fstests to detect kernel support.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_stats.c |   12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 90a77cd3ebade..ab682c2c2152c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -113,10 +113,16 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
 
 static int xqm_proc_show(struct seq_file *m, void *v)
 {
-	/* maximum; incore; ratio free to inuse; freelist */
-	seq_printf(m, "%d\t%d\t%d\t%u\n",
+	/* maximum; incore; ratio free to inuse; freelist; rtquota */
+	seq_printf(m, "%d\t%d\t%d\t%u\t%u\n",
 		   0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
-		   0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
+		   0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1),
+#ifdef CONFIG_XFS_RT
+		   1
+#else
+		   0
+#endif
+		   );
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/6] xfs: report realtime block quota limits on realtime directories
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 21:59   ` [PATCH 4/6] xfs: advertise realtime quota support in the xqm stat files Darrick J. Wong
@ 2023-12-31 21:59   ` Darrick J. Wong
  2023-12-31 21:59   ` [PATCH 6/6] xfs: enable realtime quota again Darrick J. Wong
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:59 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

On the data device, calling statvfs on a projinherit directory results
in the block and avail counts being curtailed to the project quota block
limits, if any are set.  Do the same for realtime files or directories,
only use the project quota rt block limits.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_qm_bhv.c |   18 ++++++++++++------
 fs/xfs/xfs_super.c  |   11 +++++------
 2 files changed, 17 insertions(+), 12 deletions(-)


diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index df569a839d3f9..341bf46eb380d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -20,18 +20,24 @@
 STATIC void
 xfs_fill_statvfs_from_dquot(
 	struct kstatfs		*statp,
+	struct xfs_inode	*ip,
 	struct xfs_dquot	*dqp)
 {
+	struct xfs_dquot_res	*blkres = &dqp->q_blk;
 	uint64_t		limit;
 
-	limit = dqp->q_blk.softlimit ?
-		dqp->q_blk.softlimit :
-		dqp->q_blk.hardlimit;
+	if (XFS_IS_REALTIME_MOUNT(ip->i_mount) &&
+	    (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME)))
+		blkres = &dqp->q_rtb;
+
+	limit = blkres->softlimit ?
+		blkres->softlimit :
+		blkres->hardlimit;
 	if (limit && statp->f_blocks > limit) {
 		statp->f_blocks = limit;
 		statp->f_bfree = statp->f_bavail =
-			(statp->f_blocks > dqp->q_blk.reserved) ?
-			 (statp->f_blocks - dqp->q_blk.reserved) : 0;
+			(statp->f_blocks > blkres->reserved) ?
+			 (statp->f_blocks - blkres->reserved) : 0;
 	}
 
 	limit = dqp->q_ino.softlimit ?
@@ -62,7 +68,7 @@ xfs_qm_statvfs(
 	struct xfs_dquot	*dqp;
 
 	if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
-		xfs_fill_statvfs_from_dquot(statp, dqp);
+		xfs_fill_statvfs_from_dquot(statp, ip, dqp);
 		xfs_qm_dqput(dqp);
 	}
 }
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b5291b0ea21d9..2aa91f9be05a6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -889,12 +889,6 @@ xfs_fs_statfs(
 	ffree = statp->f_files - (icount - ifree);
 	statp->f_ffree = max_t(int64_t, ffree, 0);
 
-
-	if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
-	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
-			      (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
-		xfs_qm_statvfs(ip, statp);
-
 	if (XFS_IS_REALTIME_MOUNT(mp) &&
 	    (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
 		s64	freertx;
@@ -904,6 +898,11 @@ xfs_fs_statfs(
 		statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
 	}
 
+	if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+			      (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
+		xfs_qm_statvfs(ip, statp);
+
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/6] xfs: enable realtime quota again
  2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 21:59   ` [PATCH 5/6] xfs: report realtime block quota limits on realtime directories Darrick J. Wong
@ 2023-12-31 21:59   ` Darrick J. Wong
  5 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 21:59 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable quotas for the realtime device.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_qm.c |   12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)


diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b89f0e8f80f9f..9ee319e7b13bb 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1532,15 +1532,9 @@ xfs_qm_mount_quotas(
 	int			error = 0;
 	uint			sbf;
 
-	/*
-	 * If quotas on realtime volumes is not supported, we disable
-	 * quotas immediately.
-	 */
-	if (mp->m_sb.sb_rextents) {
-		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
+	if (mp->m_sb.sb_rextents)
+		xfs_warn(mp,
+	"EXPERIMENTAL realtime quota feature in use. Use at your own risk!");
 
 	ASSERT(XFS_IS_QUOTA_ON(mp));
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/28] xfs: hoist extent size helpers to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
@ 2023-12-31 23:22   ` Darrick J. Wong
  2023-12-31 23:22   ` [PATCH 02/28] xfs: hoist inode flag conversion functions " Darrick J. Wong
                     ` (26 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:22 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the extent size helpers to xfs_bmap.c in libxfs since they're used
there already.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h  |    7 +++++++
 libxfs/libxfs_priv.h |    2 --
 libxfs/xfs_bmap.c    |   41 +++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_bmap.h    |    3 +++
 4 files changed, 51 insertions(+), 2 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 496d504747c..42db70f8144 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -340,6 +340,11 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
 	return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
 }
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+	return false;
+}
+
 /* Always set the child's GID to this value, even if the parent is setgid. */
 #define CRED_FORCE_GID	(1U << 0)
 struct cred {
@@ -365,4 +370,6 @@ extern int	libxfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 				uint, struct xfs_inode **);
 extern void	libxfs_irele(struct xfs_inode *ip);
 
+#define XFS_DEFAULT_COWEXTSZ_HINT 32
+
 #endif /* __XFS_INODE_H__ */
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 123e25d2f5e..90149b60c57 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -459,8 +459,6 @@ void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
 
 #define xfs_rotorstep				1
 #define xfs_bmap_rtalloc(a)			(-ENOSYS)
-#define xfs_get_extsz_hint(ip)			(0)
-#define xfs_get_cowextsz_hint(ip)		(0)
 #define xfs_inode_is_filestream(ip)		(0)
 #define xfs_filestream_lookup_ag(ip)		(0)
 #define xfs_filestream_new_ag(ip,ag)		(0)
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 5c69720e19e..15553659612 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -6357,3 +6357,44 @@ xfs_bmap_query_all(
 
 	return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
 }
+
+/* Helper function to extract extent size hint from inode */
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
+{
+	/*
+	 * No point in aligning allocations if we need to COW to actually
+	 * write to them.
+	 */
+	if (xfs_is_always_cow_inode(ip))
+		return 0;
+	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+		return ip->i_extsize;
+	if (XFS_IS_REALTIME_INODE(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+	return 0;
+}
+
+/*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two.  If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+	struct xfs_inode	*ip)
+{
+	xfs_extlen_t		a, b;
+
+	a = 0;
+	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+		a = ip->i_cowextsize;
+	b = xfs_get_extsz_hint(ip);
+
+	a = max(a, b);
+	if (a == 0)
+		return XFS_DEFAULT_COWEXTSZ_HINT;
+	return a;
+}
diff --git a/libxfs/xfs_bmap.h b/libxfs/xfs_bmap.h
index c9e297dba88..bd7f936262c 100644
--- a/libxfs/xfs_bmap.h
+++ b/libxfs/xfs_bmap.h
@@ -294,4 +294,7 @@ typedef int (*xfs_bmap_query_range_fn)(
 int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
 		void *priv);
 
+xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
+
 #endif	/* __XFS_BMAP_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/28] xfs: hoist inode flag conversion functions to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
  2023-12-31 23:22   ` [PATCH 01/28] xfs: hoist extent size helpers " Darrick J. Wong
@ 2023-12-31 23:22   ` Darrick J. Wong
  2023-12-31 23:23   ` [PATCH 03/28] xfs: hoist project id get/set " Darrick J. Wong
                     ` (25 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:22 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Hoist the inode flag conversion functions into libxfs so that we can
keep them in sync.  Do this by creating a new xfs_inode_util.c file in
libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h        |    1 
 include/xfs_inode.h     |    1 
 libxfs/Makefile         |    2 +
 libxfs/util.c           |   60 -----------------------
 libxfs/xfs_bmap.c       |    1 
 libxfs/xfs_inode_util.c |  124 +++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h |   14 +++++
 7 files changed, 143 insertions(+), 60 deletions(-)
 create mode 100644 libxfs/xfs_inode_util.c
 create mode 100644 libxfs/xfs_inode_util.h


diff --git a/include/libxfs.h b/include/libxfs.h
index 425112b0693..0d60b9f9f51 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -73,6 +73,7 @@ struct iomap;
 #include "xfs_attr_sf.h"
 #include "xfs_inode_fork.h"
 #include "xfs_inode_buf.h"
+#include "xfs_inode_util.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 42db70f8144..d75269cb414 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -10,6 +10,7 @@
 /* These match kernel side includes */
 #include "xfs_inode_buf.h"
 #include "xfs_inode_fork.h"
+#include "xfs_inode_util.h"
 
 struct xfs_trans;
 struct xfs_mount;
diff --git a/libxfs/Makefile b/libxfs/Makefile
index 42dce62ecf9..bb6c2d5616c 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -51,6 +51,7 @@ HFILES = \
 	xfs_ialloc_btree.h \
 	xfs_inode_buf.h \
 	xfs_inode_fork.h \
+	xfs_inode_util.h \
 	xfs_parent.h \
 	xfs_quota_defs.h \
 	xfs_refcount.h \
@@ -102,6 +103,7 @@ CFILES = cache.c \
 	xfs_iext_tree.c \
 	xfs_inode_buf.c \
 	xfs_inode_fork.c \
+	xfs_inode_util.c \
 	xfs_ialloc_btree.c \
 	xfs_log_rlimit.c \
 	xfs_parent.c \
diff --git a/libxfs/util.c b/libxfs/util.c
index 5106a6433da..dedf6c2a7b7 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -150,66 +150,6 @@ current_time(struct inode *inode)
 	return tv;
 }
 
-STATIC uint16_t
-xfs_flags2diflags(
-	struct xfs_inode	*ip,
-	unsigned int		xflags)
-{
-	/* can't set PREALLOC this way, just preserve it */
-	uint16_t		di_flags =
-		(ip->i_diflags & XFS_DIFLAG_PREALLOC);
-
-	if (xflags & FS_XFLAG_IMMUTABLE)
-		di_flags |= XFS_DIFLAG_IMMUTABLE;
-	if (xflags & FS_XFLAG_APPEND)
-		di_flags |= XFS_DIFLAG_APPEND;
-	if (xflags & FS_XFLAG_SYNC)
-		di_flags |= XFS_DIFLAG_SYNC;
-	if (xflags & FS_XFLAG_NOATIME)
-		di_flags |= XFS_DIFLAG_NOATIME;
-	if (xflags & FS_XFLAG_NODUMP)
-		di_flags |= XFS_DIFLAG_NODUMP;
-	if (xflags & FS_XFLAG_NODEFRAG)
-		di_flags |= XFS_DIFLAG_NODEFRAG;
-	if (xflags & FS_XFLAG_FILESTREAM)
-		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if (S_ISDIR(VFS_I(ip)->i_mode)) {
-		if (xflags & FS_XFLAG_RTINHERIT)
-			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (xflags & FS_XFLAG_NOSYMLINKS)
-			di_flags |= XFS_DIFLAG_NOSYMLINKS;
-		if (xflags & FS_XFLAG_EXTSZINHERIT)
-			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-		if (xflags & FS_XFLAG_PROJINHERIT)
-			di_flags |= XFS_DIFLAG_PROJINHERIT;
-	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
-		if (xflags & FS_XFLAG_REALTIME)
-			di_flags |= XFS_DIFLAG_REALTIME;
-		if (xflags & FS_XFLAG_EXTSIZE)
-			di_flags |= XFS_DIFLAG_EXTSIZE;
-	}
-
-	return di_flags;
-}
-
-STATIC uint64_t
-xfs_flags2diflags2(
-	struct xfs_inode	*ip,
-	unsigned int		xflags)
-{
-	uint64_t		di_flags2 =
-		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
-				   XFS_DIFLAG2_BIGTIME |
-				   XFS_DIFLAG2_NREXT64));
-
-	if (xflags & FS_XFLAG_DAX)
-		di_flags2 |= XFS_DIFLAG2_DAX;
-	if (xflags & FS_XFLAG_COWEXTSIZE)
-		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
-
-	return di_flags2;
-}
-
 /* Propagate di_flags from a parent inode to a child inode. */
 static void
 xfs_inode_propagate_flags(
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 15553659612..ed50bb90e6a 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -33,6 +33,7 @@
 #include "xfs_health.h"
 #include "defer_item.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_inode_util.h"
 
 struct kmem_cache		*xfs_bmap_intent_cache;
 
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
new file mode 100644
index 00000000000..868a77cafa6
--- /dev/null
+++ b/libxfs/xfs_inode_util.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_inode_util.h"
+
+uint16_t
+xfs_flags2diflags(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	/* can't set PREALLOC this way, just preserve it */
+	uint16_t		di_flags =
+		(ip->i_diflags & XFS_DIFLAG_PREALLOC);
+
+	if (xflags & FS_XFLAG_IMMUTABLE)
+		di_flags |= XFS_DIFLAG_IMMUTABLE;
+	if (xflags & FS_XFLAG_APPEND)
+		di_flags |= XFS_DIFLAG_APPEND;
+	if (xflags & FS_XFLAG_SYNC)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if (xflags & FS_XFLAG_NOATIME)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if (xflags & FS_XFLAG_NODUMP)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if (xflags & FS_XFLAG_NODEFRAG)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (xflags & FS_XFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		if (xflags & FS_XFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (xflags & FS_XFLAG_NOSYMLINKS)
+			di_flags |= XFS_DIFLAG_NOSYMLINKS;
+		if (xflags & FS_XFLAG_EXTSZINHERIT)
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+		if (xflags & FS_XFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
+	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
+		if (xflags & FS_XFLAG_REALTIME)
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (xflags & FS_XFLAG_EXTSIZE)
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+	}
+
+	return di_flags;
+}
+
+uint64_t
+xfs_flags2diflags2(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	uint64_t		di_flags2 =
+		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
+				   XFS_DIFLAG2_BIGTIME |
+				   XFS_DIFLAG2_NREXT64));
+
+	if (xflags & FS_XFLAG_DAX)
+		di_flags2 |= XFS_DIFLAG2_DAX;
+	if (xflags & FS_XFLAG_COWEXTSIZE)
+		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+
+	return di_flags2;
+}
+
+uint32_t
+xfs_ip2xflags(
+	struct xfs_inode	*ip)
+{
+	uint32_t		flags = 0;
+
+	if (ip->i_diflags & XFS_DIFLAG_ANY) {
+		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
+			flags |= FS_XFLAG_REALTIME;
+		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
+			flags |= FS_XFLAG_PREALLOC;
+		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
+			flags |= FS_XFLAG_IMMUTABLE;
+		if (ip->i_diflags & XFS_DIFLAG_APPEND)
+			flags |= FS_XFLAG_APPEND;
+		if (ip->i_diflags & XFS_DIFLAG_SYNC)
+			flags |= FS_XFLAG_SYNC;
+		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
+			flags |= FS_XFLAG_NOATIME;
+		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
+			flags |= FS_XFLAG_NODUMP;
+		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
+			flags |= FS_XFLAG_RTINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+			flags |= FS_XFLAG_PROJINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
+			flags |= FS_XFLAG_NOSYMLINKS;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+			flags |= FS_XFLAG_EXTSIZE;
+		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
+			flags |= FS_XFLAG_EXTSZINHERIT;
+		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
+			flags |= FS_XFLAG_NODEFRAG;
+		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
+			flags |= FS_XFLAG_FILESTREAM;
+	}
+
+	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
+		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+			flags |= FS_XFLAG_DAX;
+		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+			flags |= FS_XFLAG_COWEXTSIZE;
+	}
+
+	if (xfs_inode_has_attr_fork(ip))
+		flags |= FS_XFLAG_HASATTR;
+	return flags;
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
new file mode 100644
index 00000000000..6ad1898a0f7
--- /dev/null
+++ b/libxfs/xfs_inode_util.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef	__XFS_INODE_UTIL_H__
+#define	__XFS_INODE_UTIL_H__
+
+uint16_t	xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags);
+uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
+uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
+uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
+
+#endif /* __XFS_INODE_UTIL_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/28] xfs: hoist project id get/set functions to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
  2023-12-31 23:22   ` [PATCH 01/28] xfs: hoist extent size helpers " Darrick J. Wong
  2023-12-31 23:22   ` [PATCH 02/28] xfs: hoist inode flag conversion functions " Darrick J. Wong
@ 2023-12-31 23:23   ` Darrick J. Wong
  2023-12-31 23:23   ` [PATCH 04/28] libxfs: put all the inode functions in a single file Darrick J. Wong
                     ` (24 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the project id get and set functions into libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    2 ++
 libxfs/xfs_inode_util.c  |   11 +++++++++++
 libxfs/xfs_inode_util.h  |    2 ++
 3 files changed, 15 insertions(+)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 49372a44029..3455c014832 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -150,6 +150,7 @@
 #define xfs_free_extent_later		libxfs_free_extent_later
 #define xfs_free_perag			libxfs_free_perag
 #define xfs_fs_geometry			libxfs_fs_geometry
+#define xfs_get_projid			libxfs_get_projid
 #define xfs_highbit32			libxfs_highbit32
 #define xfs_highbit64			libxfs_highbit64
 #define xfs_ialloc_calc_rootino		libxfs_ialloc_calc_rootino
@@ -252,6 +253,7 @@
 #define xfs_sb_read_secondary		libxfs_sb_read_secondary
 #define xfs_sb_to_disk			libxfs_sb_to_disk
 #define xfs_sb_version_to_features	libxfs_sb_version_to_features
+#define xfs_set_projid			libxfs_set_projid
 #define xfs_symlink_blocks		libxfs_symlink_blocks
 #define xfs_symlink_hdr_ok		libxfs_symlink_hdr_ok
 #define xfs_symlink_write_target	libxfs_symlink_write_target
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index 868a77cafa6..89fb58807a1 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -122,3 +122,14 @@ xfs_ip2xflags(
 		flags |= FS_XFLAG_HASATTR;
 	return flags;
 }
+
+#define XFS_PROJID_DEFAULT	0
+
+prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+	if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
+		return dp->i_projid;
+
+	return XFS_PROJID_DEFAULT;
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index 6ad1898a0f7..f7e4d5a8235 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -11,4 +11,6 @@ uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
 uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
 uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
 
+prid_t		xfs_get_initial_prid(struct xfs_inode *dp);
+
 #endif /* __XFS_INODE_UTIL_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/28] libxfs: put all the inode functions in a single file
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 23:23   ` [PATCH 03/28] xfs: hoist project id get/set " Darrick J. Wong
@ 2023-12-31 23:23   ` Darrick J. Wong
  2023-12-31 23:23   ` [PATCH 05/28] libxfs: pass IGET flags through to xfs_iread Darrick J. Wong
                     ` (23 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move all the inode functions into a single source code file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/Makefile |    1 
 libxfs/inode.c  |  380 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/rdwr.c   |   95 --------------
 libxfs/util.c   |  254 -------------------------------------
 4 files changed, 381 insertions(+), 349 deletions(-)
 create mode 100644 libxfs/inode.c


diff --git a/libxfs/Makefile b/libxfs/Makefile
index bb6c2d5616c..6007353ade2 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -69,6 +69,7 @@ HFILES = \
 CFILES = cache.c \
 	defer_item.c \
 	init.c \
+	inode.c \
 	kmem.c \
 	logitem.c \
 	rdwr.c \
diff --git a/libxfs/inode.c b/libxfs/inode.c
new file mode 100644
index 00000000000..2bed1c2022e
--- /dev/null
+++ b/libxfs/inode.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+
+#include "libxfs_priv.h"
+#include "libxfs.h"
+#include "libxfs_io.h"
+#include "init.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
+
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_propagate_flags(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	unsigned int		di_flags = 0;
+	umode_t			mode = VFS_I(ip)->i_mode;
+
+	if ((mode & S_IFMT) == S_IFDIR) {
+		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+			ip->i_extsize = pip->i_extsize;
+		}
+	} else {
+		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+		    xfs_has_realtime(ip->i_mount))
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+			ip->i_extsize = pip->i_extsize;
+		}
+	}
+	if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+		di_flags |= XFS_DIFLAG_PROJINHERIT;
+	ip->i_diflags |= di_flags;
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+void
+libxfs_bumplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	if (inode->i_nlink != XFS_NLINK_PINNED)
+		inc_nlink(inode);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Initialise a newly allocated inode and return the in-core inode to the
+ * caller locked exclusively.
+ */
+static int
+libxfs_init_new_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*pip,
+	xfs_ino_t		ino,
+	umode_t			mode,
+	xfs_nlink_t		nlink,
+	dev_t			rdev,
+	struct cred		*cr,
+	struct fsxattr		*fsx,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_inode	*ip;
+	unsigned int		flags;
+	int			error;
+
+	error = libxfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, &ip);
+	if (error != 0)
+		return error;
+	ASSERT(ip != NULL);
+
+	VFS_I(ip)->i_mode = mode;
+	set_nlink(VFS_I(ip), nlink);
+	i_uid_write(VFS_I(ip), cr->cr_uid);
+	i_gid_write(VFS_I(ip), cr->cr_gid);
+	ip->i_projid = pip ? 0 : fsx->fsx_projid;
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD);
+
+	if (pip && (VFS_I(pip)->i_mode & S_ISGID)) {
+		if (!(cr->cr_flags & CRED_FORCE_GID))
+			VFS_I(ip)->i_gid = VFS_I(pip)->i_gid;
+		if ((VFS_I(pip)->i_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR)
+			VFS_I(ip)->i_mode |= S_ISGID;
+	}
+
+	ip->i_disk_size = 0;
+	ip->i_df.if_nextents = 0;
+	ASSERT(ip->i_nblocks == 0);
+	ip->i_extsize = pip ? 0 : fsx->fsx_extsize;
+	ip->i_diflags = pip ? 0 : xfs_flags2diflags(ip, fsx->fsx_xflags);
+
+	if (xfs_has_v3inodes(ip->i_mount)) {
+		VFS_I(ip)->i_version = 1;
+		ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
+		if (!pip)
+			ip->i_diflags2 = xfs_flags2diflags2(ip,
+							fsx->fsx_xflags);
+		ip->i_crtime = VFS_I(ip)->__i_mtime; /* struct copy */
+		ip->i_cowextsize = pip ? 0 : fsx->fsx_cowextsize;
+	}
+
+	flags = XFS_ILOG_CORE;
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFSOCK:
+		/* doesn't make sense to set an rdev for these */
+		rdev = 0;
+		/* FALLTHROUGH */
+	case S_IFCHR:
+	case S_IFBLK:
+		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+		flags |= XFS_ILOG_DEV;
+		VFS_I(ip)->i_rdev = rdev;
+		break;
+	case S_IFREG:
+	case S_IFDIR:
+		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+			xfs_inode_propagate_flags(ip, pip);
+		/* FALLTHROUGH */
+	case S_IFLNK:
+		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_bytes = 0;
+		ip->i_df.if_u1.if_root = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	/*
+	 * If we need to create attributes immediately after allocating the
+	 * inode, initialise an empty attribute fork right now. We use the
+	 * default fork offset for attributes here as we don't know exactly what
+	 * size or how many attributes we might be adding. We can do this
+	 * safely here because we know the data fork is completely empty and
+	 * this saves us from needing to run a separate transaction to set the
+	 * fork offset in the immediate future.
+	 */
+	if (xfs_has_parent(tp->t_mountp) && xfs_has_attr(tp->t_mountp)) {
+		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+	}
+
+	/*
+	 * Log the new values stuffed into the inode.
+	 */
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, flags);
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * Writes a modified inode's changes out to the inode's on disk home.
+ * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
+ */
+int
+libxfs_iflush_int(
+	struct xfs_inode		*ip,
+	struct xfs_buf			*bp)
+{
+	struct xfs_inode_log_item	*iip;
+	struct xfs_dinode		*dip;
+	struct xfs_mount		*mp;
+
+	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+		ip->i_df.if_nextents > ip->i_df.if_ext_max);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+
+	/* set *dip = inode's place in the buffer */
+	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
+
+	if (XFS_ISREG(ip)) {
+		ASSERT( (ip->i_df.if_format == XFS_DINODE_FMT_EXTENTS) ||
+			(ip->i_df.if_format == XFS_DINODE_FMT_BTREE) );
+	} else if (XFS_ISDIR(ip)) {
+		ASSERT( (ip->i_df.if_format == XFS_DINODE_FMT_EXTENTS) ||
+			(ip->i_df.if_format == XFS_DINODE_FMT_BTREE)   ||
+			(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) );
+	}
+	ASSERT(ip->i_df.if_nextents+ip.i_af->if_nextents <= ip->i_nblocks);
+	ASSERT(ip->i_forkoff <= mp->m_sb.sb_inodesize);
+
+	/* bump the change count on v3 inodes */
+	if (xfs_has_v3inodes(mp))
+		VFS_I(ip)->i_version++;
+
+	/*
+	 * If there are inline format data / attr forks attached to this inode,
+	 * make sure they are not corrupt.
+	 */
+	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
+	    xfs_ifork_verify_local_data(ip))
+		return -EFSCORRUPTED;
+	if (xfs_inode_has_attr_fork(ip) &&
+	    ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
+	    xfs_ifork_verify_local_attr(ip))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Copy the dirty parts of the inode into the on-disk
+	 * inode.  We always copy out the core of the inode,
+	 * because if the inode is dirty at all the core must
+	 * be.
+	 */
+	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
+
+	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+	if (xfs_inode_has_attr_fork(ip))
+		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
+
+	/* generate the checksum. */
+	xfs_dinode_calc_crc(mp, dip);
+
+	return 0;
+}
+
+/*
+ * Wrapper around call to libxfs_ialloc. Takes care of committing and
+ * allocating a new transaction as needed.
+ *
+ * Originally there were two copies of this code - one in mkfs, the
+ * other in repair - now there is just the one.
+ */
+int
+libxfs_dir_ialloc(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp,
+	mode_t			mode,
+	nlink_t			nlink,
+	xfs_dev_t		rdev,
+	struct cred		*cr,
+	struct fsxattr		*fsx,
+	struct xfs_inode	**ipp)
+{
+	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
+	xfs_ino_t		ino;
+	int			error;
+
+	/*
+	 * Call the space management code to pick the on-disk inode to be
+	 * allocated.
+	 */
+	error = xfs_dialloc(tpp, parent_ino, mode, &ino);
+	if (error)
+		return error;
+
+	return libxfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, cr,
+				fsx, ipp);
+}
+
+/*
+ * Inode cache stubs.
+ */
+
+struct kmem_cache		*xfs_inode_cache;
+extern struct kmem_cache	*xfs_ili_cache;
+
+int
+libxfs_iget(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	uint			lock_flags,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_inode	*ip;
+	struct xfs_buf		*bp;
+	struct xfs_perag	*pag;
+	int			error = 0;
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return -EINVAL;
+
+	ip = kmem_cache_zalloc(xfs_inode_cache, 0);
+	if (!ip)
+		return -ENOMEM;
+
+	VFS_I(ip)->i_count = 1;
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+	spin_lock_init(&VFS_I(ip)->i_lock);
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, 0);
+	xfs_perag_put(pag);
+
+	if (error)
+		goto out_destroy;
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
+	if (error)
+		goto out_destroy;
+
+	error = xfs_inode_from_disk(ip,
+			xfs_buf_offset(bp, ip->i_imap.im_boffset));
+	if (!error)
+		xfs_buf_set_ref(bp, XFS_INO_REF);
+	xfs_trans_brelse(tp, bp);
+
+	if (error)
+		goto out_destroy;
+
+	*ipp = ip;
+	return 0;
+
+out_destroy:
+	kmem_cache_free(xfs_inode_cache, ip);
+	*ipp = NULL;
+	return error;
+}
+
+static void
+libxfs_idestroy(
+	struct xfs_inode	*ip)
+{
+	switch (VFS_I(ip)->i_mode & S_IFMT) {
+		case S_IFREG:
+		case S_IFDIR:
+		case S_IFLNK:
+			libxfs_idestroy_fork(&ip->i_df);
+			break;
+	}
+
+	libxfs_ifork_zap_attr(ip);
+
+	if (ip->i_cowfp) {
+		libxfs_idestroy_fork(ip->i_cowfp);
+		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
+	}
+}
+
+void
+libxfs_irele(
+	struct xfs_inode	*ip)
+{
+	VFS_I(ip)->i_count--;
+
+	if (VFS_I(ip)->i_count == 0) {
+		ASSERT(ip->i_itemp == NULL);
+		libxfs_idestroy(ip);
+		kmem_cache_free(xfs_inode_cache, ip);
+	}
+}
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index f352225f23d..ecf10f7946b 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -1117,101 +1117,6 @@ xfs_verify_magic16(
 	return dmagic == bp->b_ops->magic16[idx];
 }
 
-/*
- * Inode cache stubs.
- */
-
-struct kmem_cache		*xfs_inode_cache;
-extern struct kmem_cache	*xfs_ili_cache;
-
-int
-libxfs_iget(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	xfs_ino_t		ino,
-	uint			lock_flags,
-	struct xfs_inode	**ipp)
-{
-	struct xfs_inode	*ip;
-	struct xfs_buf		*bp;
-	struct xfs_perag	*pag;
-	int			error = 0;
-
-	/* reject inode numbers outside existing AGs */
-	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return -EINVAL;
-
-	ip = kmem_cache_zalloc(xfs_inode_cache, 0);
-	if (!ip)
-		return -ENOMEM;
-
-	VFS_I(ip)->i_count = 1;
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
-	spin_lock_init(&VFS_I(ip)->i_lock);
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, 0);
-	xfs_perag_put(pag);
-
-	if (error)
-		goto out_destroy;
-
-	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
-	if (error)
-		goto out_destroy;
-
-	error = xfs_inode_from_disk(ip,
-			xfs_buf_offset(bp, ip->i_imap.im_boffset));
-	if (!error)
-		xfs_buf_set_ref(bp, XFS_INO_REF);
-	xfs_trans_brelse(tp, bp);
-
-	if (error)
-		goto out_destroy;
-
-	*ipp = ip;
-	return 0;
-
-out_destroy:
-	kmem_cache_free(xfs_inode_cache, ip);
-	*ipp = NULL;
-	return error;
-}
-
-static void
-libxfs_idestroy(xfs_inode_t *ip)
-{
-	switch (VFS_I(ip)->i_mode & S_IFMT) {
-		case S_IFREG:
-		case S_IFDIR:
-		case S_IFLNK:
-			libxfs_idestroy_fork(&ip->i_df);
-			break;
-	}
-
-	libxfs_ifork_zap_attr(ip);
-
-	if (ip->i_cowfp) {
-		libxfs_idestroy_fork(ip->i_cowfp);
-		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
-	}
-}
-
-void
-libxfs_irele(
-	struct xfs_inode	*ip)
-{
-	VFS_I(ip)->i_count--;
-
-	if (VFS_I(ip)->i_count == 0) {
-		ASSERT(ip->i_itemp == NULL);
-		libxfs_idestroy(ip);
-		kmem_cache_free(xfs_inode_cache, ip);
-	}
-}
-
 /*
  * Flush everything dirty in the kernel and disk write caches to stable media.
  * Returns 0 for success or a negative error code.
diff --git a/libxfs/util.c b/libxfs/util.c
index dedf6c2a7b7..7aa92c0e4a6 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -150,226 +150,6 @@ current_time(struct inode *inode)
 	return tv;
 }
 
-/* Propagate di_flags from a parent inode to a child inode. */
-static void
-xfs_inode_propagate_flags(
-	struct xfs_inode	*ip,
-	const struct xfs_inode	*pip)
-{
-	unsigned int		di_flags = 0;
-	umode_t			mode = VFS_I(ip)->i_mode;
-
-	if ((mode & S_IFMT) == S_IFDIR) {
-		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
-			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-			ip->i_extsize = pip->i_extsize;
-		}
-	} else {
-		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
-		    xfs_has_realtime(ip->i_mount))
-			di_flags |= XFS_DIFLAG_REALTIME;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSIZE;
-			ip->i_extsize = pip->i_extsize;
-		}
-	}
-	if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
-		di_flags |= XFS_DIFLAG_PROJINHERIT;
-	ip->i_diflags |= di_flags;
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-void
-libxfs_bumplink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	if (inode->i_nlink != XFS_NLINK_PINNED)
-		inc_nlink(inode);
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
-/*
- * Initialise a newly allocated inode and return the in-core inode to the
- * caller locked exclusively.
- */
-static int
-libxfs_init_new_inode(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*pip,
-	xfs_ino_t		ino,
-	umode_t			mode,
-	xfs_nlink_t		nlink,
-	dev_t			rdev,
-	struct cred		*cr,
-	struct fsxattr		*fsx,
-	struct xfs_inode	**ipp)
-{
-	struct xfs_inode	*ip;
-	unsigned int		flags;
-	int			error;
-
-	error = libxfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, &ip);
-	if (error != 0)
-		return error;
-	ASSERT(ip != NULL);
-
-	VFS_I(ip)->i_mode = mode;
-	set_nlink(VFS_I(ip), nlink);
-	i_uid_write(VFS_I(ip), cr->cr_uid);
-	i_gid_write(VFS_I(ip), cr->cr_gid);
-	ip->i_projid = pip ? 0 : fsx->fsx_projid;
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD);
-
-	if (pip && (VFS_I(pip)->i_mode & S_ISGID)) {
-		if (!(cr->cr_flags & CRED_FORCE_GID))
-			VFS_I(ip)->i_gid = VFS_I(pip)->i_gid;
-		if ((VFS_I(pip)->i_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR)
-			VFS_I(ip)->i_mode |= S_ISGID;
-	}
-
-	ip->i_disk_size = 0;
-	ip->i_df.if_nextents = 0;
-	ASSERT(ip->i_nblocks == 0);
-	ip->i_extsize = pip ? 0 : fsx->fsx_extsize;
-	ip->i_diflags = pip ? 0 : xfs_flags2diflags(ip, fsx->fsx_xflags);
-
-	if (xfs_has_v3inodes(ip->i_mount)) {
-		VFS_I(ip)->i_version = 1;
-		ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
-		if (!pip)
-			ip->i_diflags2 = xfs_flags2diflags2(ip,
-							fsx->fsx_xflags);
-		ip->i_crtime = VFS_I(ip)->__i_mtime; /* struct copy */
-		ip->i_cowextsize = pip ? 0 : fsx->fsx_cowextsize;
-	}
-
-	flags = XFS_ILOG_CORE;
-	switch (mode & S_IFMT) {
-	case S_IFIFO:
-	case S_IFSOCK:
-		/* doesn't make sense to set an rdev for these */
-		rdev = 0;
-		/* FALLTHROUGH */
-	case S_IFCHR:
-	case S_IFBLK:
-		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
-		flags |= XFS_ILOG_DEV;
-		VFS_I(ip)->i_rdev = rdev;
-		break;
-	case S_IFREG:
-	case S_IFDIR:
-		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
-			xfs_inode_propagate_flags(ip, pip);
-		/* FALLTHROUGH */
-	case S_IFLNK:
-		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-		ip->i_df.if_bytes = 0;
-		ip->i_df.if_u1.if_root = NULL;
-		break;
-	default:
-		ASSERT(0);
-	}
-
-	/*
-	 * If we need to create attributes immediately after allocating the
-	 * inode, initialise an empty attribute fork right now. We use the
-	 * default fork offset for attributes here as we don't know exactly what
-	 * size or how many attributes we might be adding. We can do this
-	 * safely here because we know the data fork is completely empty and
-	 * this saves us from needing to run a separate transaction to set the
-	 * fork offset in the immediate future.
-	 */
-	if (xfs_has_parent(tp->t_mountp) && xfs_has_attr(tp->t_mountp)) {
-		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
-		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
-	}
-
-	/*
-	 * Log the new values stuffed into the inode.
-	 */
-	xfs_trans_ijoin(tp, ip, 0);
-	xfs_trans_log_inode(tp, ip, flags);
-	*ipp = ip;
-	return 0;
-}
-
-/*
- * Writes a modified inode's changes out to the inode's on disk home.
- * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
- */
-int
-libxfs_iflush_int(
-	xfs_inode_t			*ip,
-	struct xfs_buf			*bp)
-{
-	struct xfs_inode_log_item	*iip;
-	struct xfs_dinode		*dip;
-	xfs_mount_t			*mp;
-
-	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
-		ip->i_df.if_nextents > ip->i_df.if_ext_max);
-
-	iip = ip->i_itemp;
-	mp = ip->i_mount;
-
-	/* set *dip = inode's place in the buffer */
-	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
-
-	if (XFS_ISREG(ip)) {
-		ASSERT( (ip->i_df.if_format == XFS_DINODE_FMT_EXTENTS) ||
-			(ip->i_df.if_format == XFS_DINODE_FMT_BTREE) );
-	} else if (XFS_ISDIR(ip)) {
-		ASSERT( (ip->i_df.if_format == XFS_DINODE_FMT_EXTENTS) ||
-			(ip->i_df.if_format == XFS_DINODE_FMT_BTREE)   ||
-			(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) );
-	}
-	ASSERT(ip->i_df.if_nextents+ip.i_af->if_nextents <= ip->i_nblocks);
-	ASSERT(ip->i_forkoff <= mp->m_sb.sb_inodesize);
-
-	/* bump the change count on v3 inodes */
-	if (xfs_has_v3inodes(mp))
-		VFS_I(ip)->i_version++;
-
-	/*
-	 * If there are inline format data / attr forks attached to this inode,
-	 * make sure they are not corrupt.
-	 */
-	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
-	    xfs_ifork_verify_local_data(ip))
-		return -EFSCORRUPTED;
-	if (xfs_inode_has_attr_fork(ip) &&
-	    ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
-	    xfs_ifork_verify_local_attr(ip))
-		return -EFSCORRUPTED;
-
-	/*
-	 * Copy the dirty parts of the inode into the on-disk
-	 * inode.  We always copy out the core of the inode,
-	 * because if the inode is dirty at all the core must
-	 * be.
-	 */
-	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
-
-	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
-	if (xfs_inode_has_attr_fork(ip))
-		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
-
-	/* generate the checksum. */
-	xfs_dinode_calc_crc(mp, dip);
-
-	return 0;
-}
-
 int
 libxfs_mod_incore_sb(
 	struct xfs_mount *mp,
@@ -474,40 +254,6 @@ libxfs_alloc_file_space(
 	return error;
 }
 
-/*
- * Wrapper around call to libxfs_ialloc. Takes care of committing and
- * allocating a new transaction as needed.
- *
- * Originally there were two copies of this code - one in mkfs, the
- * other in repair - now there is just the one.
- */
-int
-libxfs_dir_ialloc(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp,
-	mode_t			mode,
-	nlink_t			nlink,
-	xfs_dev_t		rdev,
-	struct cred		*cr,
-	struct fsxattr		*fsx,
-	struct xfs_inode	**ipp)
-{
-	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
-	xfs_ino_t		ino;
-	int			error;
-
-	/*
-	 * Call the space management code to pick the on-disk inode to be
-	 * allocated.
-	 */
-	error = xfs_dialloc(tpp, parent_ino, mode, &ino);
-	if (error)
-		return error;
-
-	return libxfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, cr,
-				fsx, ipp);
-}
-
 void
 cmn_err(int level, char *fmt, ...)
 {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/28] libxfs: pass IGET flags through to xfs_iread
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 23:23   ` [PATCH 04/28] libxfs: put all the inode functions in a single file Darrick J. Wong
@ 2023-12-31 23:23   ` Darrick J. Wong
  2023-12-31 23:23   ` [PATCH 06/28] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
                     ` (22 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Change the lock_flags parameter to iget_flags so that we can supply
XFS_IGET_ flags in future patches.  All callers of libxfs_iget and
libxfs_trans_iget pass zero for this parameter and there are no inode
locks in xfsprogs, so there's no behavior change here.

Port the kernel's version of the xfs_inode_from_disk callsite.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c |   40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index 2bed1c2022e..68e68ee83e3 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -295,11 +295,10 @@ libxfs_iget(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
 	xfs_ino_t		ino,
-	uint			lock_flags,
+	uint			flags,
 	struct xfs_inode	**ipp)
 {
 	struct xfs_inode	*ip;
-	struct xfs_buf		*bp;
 	struct xfs_perag	*pag;
 	int			error = 0;
 
@@ -324,18 +323,35 @@ libxfs_iget(
 	if (error)
 		goto out_destroy;
 
-	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
-	if (error)
-		goto out_destroy;
+	/*
+	 * For version 5 superblocks, if we are initialising a new inode and we
+	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
+	 * simply build the new inode core with a random generation number.
+	 *
+	 * For version 4 (and older) superblocks, log recovery is dependent on
+	 * the di_flushiter field being initialised from the current on-disk
+	 * value and hence we must also read the inode off disk even when
+	 * initializing new inodes.
+	 */
+	if (xfs_has_v3inodes(mp) &&
+	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+		VFS_I(ip)->i_generation = get_random_u32();
+	} else {
+		struct xfs_buf		*bp;
 
-	error = xfs_inode_from_disk(ip,
-			xfs_buf_offset(bp, ip->i_imap.im_boffset));
-	if (!error)
-		xfs_buf_set_ref(bp, XFS_INO_REF);
-	xfs_trans_brelse(tp, bp);
+		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
+		if (error)
+			goto out_destroy;
 
-	if (error)
-		goto out_destroy;
+		error = xfs_inode_from_disk(ip,
+				xfs_buf_offset(bp, ip->i_imap.im_boffset));
+		if (!error)
+			xfs_buf_set_ref(bp, XFS_INO_REF);
+		xfs_trans_brelse(tp, bp);
+
+		if (error)
+			goto out_destroy;
+	}
 
 	*ipp = ip;
 	return 0;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/28] xfs: pack icreate initialization parameters into a separate structure
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 23:23   ` [PATCH 05/28] libxfs: pass IGET flags through to xfs_iread Darrick J. Wong
@ 2023-12-31 23:23   ` Darrick J. Wong
  2023-12-31 23:24   ` [PATCH 07/28] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
                     ` (21 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:23 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Callers that want to create an inode currently pass all possible file
attribute values for the new inode into xfs_init_new_inode as ten
separate parameters.  This causes two code maintenance issues: first, we
have large multi-line call sites which programmers must read carefully
to make sure they did not accidentally invert a value.  Second, all
three file id parameters must be passed separately to the quota
functions; any discrepancy results in quota count errors.

Clean this up by creating a new icreate_args structure to hold all this
information, some helpers to initialize them properly, and make the
callers pass this structure through to the creation function, whose name
we shorten to xfs_icreate.  This eliminates the issues, enables us to
keep the inode init code in sync with userspace via libxfs, and is
needed for future metadata directory tree management.

(A subsequent cleanup will also fix the quota alloc calls.)

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h     |   37 +++++++++++++++++++---
 libxfs/inode.c          |   78 +++++++++++++++++++++++++++++------------------
 libxfs/xfs_inode_util.h |   32 +++++++++++++++++++
 3 files changed, 111 insertions(+), 36 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index d75269cb414..04d971e744b 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -7,6 +7,31 @@
 #ifndef __XFS_INODE_H__
 #define __XFS_INODE_H__
 
+/*
+ * Borrow the kernel's uid/gid types.  These are used by xfs_inode_util.h, so
+ * they must come first in the header file.
+ */
+
+typedef struct {
+	uid_t val;
+} kuid_t;
+
+typedef struct {
+	gid_t val;
+} kgid_t;
+
+static inline kuid_t make_kuid(uid_t uid)
+{
+	kuid_t	v = { .val = uid };
+	return v;
+}
+
+static inline kgid_t make_kgid(gid_t gid)
+{
+	kgid_t	v = { .val = gid };
+	return v;
+}
+
 /* These match kernel side includes */
 #include "xfs_inode_buf.h"
 #include "xfs_inode_fork.h"
@@ -34,8 +59,8 @@ static inline bool IS_I_VERSION(const struct inode *inode) { return false; }
  */
 struct inode {
 	mode_t			i_mode;
-	uint32_t		i_uid;
-	uint32_t		i_gid;
+	kuid_t			i_uid;
+	kgid_t			i_gid;
 	uint32_t		i_nlink;
 	xfs_dev_t		i_rdev;	 /* This actually holds xfs_dev_t */
 	unsigned int		i_count;
@@ -50,19 +75,19 @@ struct inode {
 
 static inline uint32_t i_uid_read(struct inode *inode)
 {
-	return inode->i_uid;
+	return inode->i_uid.val;
 }
 static inline uint32_t i_gid_read(struct inode *inode)
 {
-	return inode->i_gid;
+	return inode->i_gid.val;
 }
 static inline void i_uid_write(struct inode *inode, uint32_t uid)
 {
-	inode->i_uid = uid;
+	inode->i_uid.val = uid;
 }
 static inline void i_gid_write(struct inode *inode, uint32_t gid)
 {
-	inode->i_gid = gid;
+	inode->i_gid.val = gid;
 }
 
 static inline void ihold(struct inode *inode)
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 68e68ee83e3..433755a18ba 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -82,17 +82,13 @@ libxfs_bumplink(
  * caller locked exclusively.
  */
 static int
-libxfs_init_new_inode(
+libxfs_icreate(
 	struct xfs_trans	*tp,
-	struct xfs_inode	*pip,
 	xfs_ino_t		ino,
-	umode_t			mode,
-	xfs_nlink_t		nlink,
-	dev_t			rdev,
-	struct cred		*cr,
-	struct fsxattr		*fsx,
+	const struct xfs_icreate_args *args,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_inode	*pip = args->pip;
 	struct xfs_inode	*ip;
 	unsigned int		flags;
 	int			error;
@@ -102,48 +98,41 @@ libxfs_init_new_inode(
 		return error;
 	ASSERT(ip != NULL);
 
-	VFS_I(ip)->i_mode = mode;
-	set_nlink(VFS_I(ip), nlink);
-	i_uid_write(VFS_I(ip), cr->cr_uid);
-	i_gid_write(VFS_I(ip), cr->cr_gid);
-	ip->i_projid = pip ? 0 : fsx->fsx_projid;
+	VFS_I(ip)->i_mode = args->mode;
+	set_nlink(VFS_I(ip), args->nlink);
+	VFS_I(ip)->i_uid = args->uid;
+	ip->i_projid = args->prid;
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD);
 
 	if (pip && (VFS_I(pip)->i_mode & S_ISGID)) {
-		if (!(cr->cr_flags & CRED_FORCE_GID))
+		if (!(args->flags & XFS_ICREATE_ARGS_FORCE_GID))
 			VFS_I(ip)->i_gid = VFS_I(pip)->i_gid;
-		if ((VFS_I(pip)->i_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR)
+		if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(args->mode))
 			VFS_I(ip)->i_mode |= S_ISGID;
-	}
+	} else
+		VFS_I(ip)->i_gid = args->gid;
 
 	ip->i_disk_size = 0;
 	ip->i_df.if_nextents = 0;
 	ASSERT(ip->i_nblocks == 0);
-	ip->i_extsize = pip ? 0 : fsx->fsx_extsize;
-	ip->i_diflags = pip ? 0 : xfs_flags2diflags(ip, fsx->fsx_xflags);
-
+	ip->i_extsize = 0;
+	ip->i_diflags = 0;
 	if (xfs_has_v3inodes(ip->i_mount)) {
 		VFS_I(ip)->i_version = 1;
 		ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
-		if (!pip)
-			ip->i_diflags2 = xfs_flags2diflags2(ip,
-							fsx->fsx_xflags);
-		ip->i_crtime = VFS_I(ip)->__i_mtime; /* struct copy */
-		ip->i_cowextsize = pip ? 0 : fsx->fsx_cowextsize;
+		ip->i_crtime = VFS_I(ip)->__i_mtime;
+		ip->i_cowextsize = 0;
 	}
 
 	flags = XFS_ILOG_CORE;
-	switch (mode & S_IFMT) {
+	switch (args->mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFSOCK:
-		/* doesn't make sense to set an rdev for these */
-		rdev = 0;
-		/* FALLTHROUGH */
 	case S_IFCHR:
 	case S_IFBLK:
 		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
 		flags |= XFS_ILOG_DEV;
-		VFS_I(ip)->i_rdev = rdev;
+		VFS_I(ip)->i_rdev = args->rdev;
 		break;
 	case S_IFREG:
 	case S_IFDIR:
@@ -267,10 +256,25 @@ libxfs_dir_ialloc(
 	struct fsxattr		*fsx,
 	struct xfs_inode	**ipp)
 {
+	struct xfs_icreate_args	args = {
+		.pip		= dp,
+		.uid		= make_kuid(cr->cr_uid),
+		.gid		= make_kgid(cr->cr_gid),
+		.nlink		= nlink,
+		.rdev		= rdev,
+		.mode		= mode,
+	};
+	struct xfs_inode	*ip;
 	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
 	xfs_ino_t		ino;
 	int			error;
 
+	if (cr->cr_flags & CRED_FORCE_GID)
+		args.flags |= XFS_ICREATE_ARGS_FORCE_GID;
+
+	if (dp && xfs_has_parent(dp->i_mount))
+		args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
+
 	/*
 	 * Call the space management code to pick the on-disk inode to be
 	 * allocated.
@@ -279,8 +283,22 @@ libxfs_dir_ialloc(
 	if (error)
 		return error;
 
-	return libxfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, cr,
-				fsx, ipp);
+	error = libxfs_icreate(*tpp, ino, &args, ipp);
+	if (error || dp)
+		return error;
+
+	/* If there is no parent dir, initialize the file from fsxattr data. */
+	ip = *ipp;
+	ip->i_projid = fsx->fsx_projid;
+	ip->i_extsize = fsx->fsx_extsize;
+	ip->i_diflags = xfs_flags2diflags(ip, fsx->fsx_xflags);
+
+	if (xfs_has_v3inodes(ip->i_mount)) {
+		ip->i_diflags2 = xfs_flags2diflags2(ip, fsx->fsx_xflags);
+		ip->i_cowextsize = fsx->fsx_cowextsize;
+	}
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+	return 0;
 }
 
 /*
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index f7e4d5a8235..a494f7c4a3f 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -13,4 +13,36 @@ uint32_t	xfs_ip2xflags(struct xfs_inode *ip);
 
 prid_t		xfs_get_initial_prid(struct xfs_inode *dp);
 
+/*
+ * Initial ids, link count, device number, and mode of a new inode.
+ *
+ * Due to our only partial reliance on the VFS to propagate uid and gid values
+ * according to accepted Unix behaviors, callers must initialize mnt_userns to
+ * the appropriate namespace, uid to fsuid_into_mnt(), and gid to
+ * fsgid_into_mnt() to get the correct inheritance behaviors when
+ * XFS_MOUNT_GRPID is set.  Use the xfs_ialloc_inherit_args() helper.
+ *
+ * To override the default ids, use the FORCE flags defined below.
+ */
+struct xfs_icreate_args {
+	struct mnt_idmap	*idmap;
+
+	struct xfs_inode	*pip;	/* parent inode or null */
+
+	kuid_t			uid;
+	kgid_t			gid;
+	prid_t			prid;
+
+	xfs_nlink_t		nlink;
+	dev_t			rdev;
+
+	umode_t			mode;
+
+#define XFS_ICREATE_ARGS_FORCE_UID	(1 << 0)
+#define XFS_ICREATE_ARGS_FORCE_GID	(1 << 1)
+#define XFS_ICREATE_ARGS_FORCE_MODE	(1 << 2)
+#define XFS_ICREATE_ARGS_INIT_XATTRS	(1 << 3)
+	uint16_t		flags;
+};
+
 #endif /* __XFS_INODE_UTIL_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/28] xfs: implement atime updates in xfs_trans_ichgtime
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 23:23   ` [PATCH 06/28] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
@ 2023-12-31 23:24   ` Darrick J. Wong
  2023-12-31 23:24   ` [PATCH 08/28] libxfs: rearrange libxfs_trans_ichgtime call when creating inodes Darrick J. Wong
                     ` (20 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable xfs_trans_ichgtime to change the inode access time so that we can
use this function to set inode times when allocating inodes instead of
open-coding it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_shared.h      |    1 +
 libxfs/xfs_trans_inode.c |    2 ++
 2 files changed, 3 insertions(+)


diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index 7509c1406a3..1d327685f6e 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -136,6 +136,7 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
 #define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
 #define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
 
 /* Computed inode geometry for the filesystem. */
 struct xfs_ino_geometry {
diff --git a/libxfs/xfs_trans_inode.c b/libxfs/xfs_trans_inode.c
index c171a525cef..c2e5cb26004 100644
--- a/libxfs/xfs_trans_inode.c
+++ b/libxfs/xfs_trans_inode.c
@@ -65,6 +65,8 @@ xfs_trans_ichgtime(
 		inode_set_mtime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CHG)
 		inode_set_ctime_to_ts(inode, tv);
+	if (flags & XFS_ICHGTIME_ACCESS)
+		inode_set_atime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CREATE)
 		ip->i_crtime = tv;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/28] libxfs: rearrange libxfs_trans_ichgtime call when creating inodes
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 23:24   ` [PATCH 07/28] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
@ 2023-12-31 23:24   ` Darrick J. Wong
  2023-12-31 23:24   ` [PATCH 09/28] libxfs: set access time when creating files Darrick J. Wong
                     ` (19 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Rearrange the libxfs_trans_ichgtime call in libxfs_ialloc so that we
call it once with the flags we want.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index 433755a18ba..8a2f7dfff4d 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -91,6 +91,7 @@ libxfs_icreate(
 	struct xfs_inode	*pip = args->pip;
 	struct xfs_inode	*ip;
 	unsigned int		flags;
+	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 	int			error;
 
 	error = libxfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, &ip);
@@ -102,7 +103,6 @@ libxfs_icreate(
 	set_nlink(VFS_I(ip), args->nlink);
 	VFS_I(ip)->i_uid = args->uid;
 	ip->i_projid = args->prid;
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD);
 
 	if (pip && (VFS_I(pip)->i_mode & S_ISGID)) {
 		if (!(args->flags & XFS_ICREATE_ARGS_FORCE_GID))
@@ -120,10 +120,12 @@ libxfs_icreate(
 	if (xfs_has_v3inodes(ip->i_mount)) {
 		VFS_I(ip)->i_version = 1;
 		ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
-		ip->i_crtime = VFS_I(ip)->__i_mtime;
 		ip->i_cowextsize = 0;
+		times |= XFS_ICHGTIME_CREATE;
 	}
 
+	xfs_trans_ichgtime(tp, ip, times);
+
 	flags = XFS_ILOG_CORE;
 	switch (args->mode & S_IFMT) {
 	case S_IFIFO:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/28] libxfs: set access time when creating files
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 23:24   ` [PATCH 08/28] libxfs: rearrange libxfs_trans_ichgtime call when creating inodes Darrick J. Wong
@ 2023-12-31 23:24   ` Darrick J. Wong
  2023-12-31 23:24   ` [PATCH 10/28] libxfs: when creating a file in a directory, set the project id based on the parent Darrick J. Wong
                     ` (18 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Set the access time on files that we're creating, to match the behavior
of the kernel.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index 8a2f7dfff4d..a29c1500776 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -91,7 +91,8 @@ libxfs_icreate(
 	struct xfs_inode	*pip = args->pip;
 	struct xfs_inode	*ip;
 	unsigned int		flags;
-	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
+					XFS_ICHGTIME_ACCESS;
 	int			error;
 
 	error = libxfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, &ip);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/28] libxfs: when creating a file in a directory, set the project id based on the parent
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 23:24   ` [PATCH 09/28] libxfs: set access time when creating files Darrick J. Wong
@ 2023-12-31 23:24   ` Darrick J. Wong
  2023-12-31 23:25   ` [PATCH 11/28] libxfs: pass flags2 from parent to child when creating files Darrick J. Wong
                     ` (17 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:24 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When we're creating a file as a child of an existing directory, use
xfs_get_initial_prid to have the child inherit the project id of the
directory if the directory has PROJINHERIT set, just like the kernel
does.  This fixes mkfs project id propagation with -d projinherit=X when
protofiles are in use.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c           |    1 +
 libxfs/libxfs_api_defs.h |    1 +
 2 files changed, 2 insertions(+)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index a29c1500776..e34b8e4b194 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -263,6 +263,7 @@ libxfs_dir_ialloc(
 		.pip		= dp,
 		.uid		= make_kuid(cr->cr_uid),
 		.gid		= make_kgid(cr->cr_gid),
+		.prid		= dp ? libxfs_get_initial_prid(dp) : 0,
 		.nlink		= nlink,
 		.rdev		= rdev,
 		.mode		= mode,
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 3455c014832..575bf45a211 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -151,6 +151,7 @@
 #define xfs_free_perag			libxfs_free_perag
 #define xfs_fs_geometry			libxfs_fs_geometry
 #define xfs_get_projid			libxfs_get_projid
+#define xfs_get_initial_prid		libxfs_get_initial_prid
 #define xfs_highbit32			libxfs_highbit32
 #define xfs_highbit64			libxfs_highbit64
 #define xfs_ialloc_calc_rootino		libxfs_ialloc_calc_rootino


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/28] libxfs: pass flags2 from parent to child when creating files
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 23:24   ` [PATCH 10/28] libxfs: when creating a file in a directory, set the project id based on the parent Darrick J. Wong
@ 2023-12-31 23:25   ` Darrick J. Wong
  2023-12-31 23:25   ` [PATCH 12/28] xfs: split new inode creation into two pieces Darrick J. Wong
                     ` (16 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When mkfs creates a new file as a child of an existing directory, we
should propagate the flags2 field from parent to child like the kernel
does.  This ensures that mkfs propagates cowextsize hints properly when
protofiles are in use.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c |   16 ++++++++++++++++
 1 file changed, 16 insertions(+)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index e34b8e4b194..518c8b45371 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -59,6 +59,20 @@ xfs_inode_propagate_flags(
 	ip->i_diflags |= di_flags;
 }
 
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = pip->i_cowextsize;
+	}
+	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+}
+
 /*
  * Increment the link count on an inode & log the change.
  */
@@ -141,6 +155,8 @@ libxfs_icreate(
 	case S_IFDIR:
 		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
 			xfs_inode_propagate_flags(ip, pip);
+		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+			xfs_inode_inherit_flags2(ip, pip);
 		/* FALLTHROUGH */
 	case S_IFLNK:
 		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/28] xfs: split new inode creation into two pieces
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 23:25   ` [PATCH 11/28] libxfs: pass flags2 from parent to child when creating files Darrick J. Wong
@ 2023-12-31 23:25   ` Darrick J. Wong
  2023-12-31 23:25   ` [PATCH 13/28] libxfs: backport inode init code from the kernel Darrick J. Wong
                     ` (15 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

There are two parts to initializing a newly allocated inode: setting up
the incore structures, and initializing the new inode core based on the
parent inode and the current user's environment.  The initialization
code is not specific to the kernel, so we would like to share that with
userspace by hoisting it to libxfs.  Therefore, split xfs_icreate into
separate functions to prepare for the next few patches.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c      |   48 +++++++++++++++++++++++++++++-------------------
 libxfs/xfs_ialloc.c |   15 +++++++++++++++
 2 files changed, 44 insertions(+), 19 deletions(-)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index 518c8b45371..b61ad0f9e09 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -91,28 +91,17 @@ libxfs_bumplink(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
-/*
- * Initialise a newly allocated inode and return the in-core inode to the
- * caller locked exclusively.
- */
-static int
-libxfs_icreate(
+/* Initialise an inode's attributes. */
+static void
+xfs_inode_init(
 	struct xfs_trans	*tp,
-	xfs_ino_t		ino,
 	const struct xfs_icreate_args *args,
-	struct xfs_inode	**ipp)
+	struct xfs_inode	*ip)
 {
 	struct xfs_inode	*pip = args->pip;
-	struct xfs_inode	*ip;
 	unsigned int		flags;
 	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
 					XFS_ICHGTIME_ACCESS;
-	int			error;
-
-	error = libxfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, &ip);
-	if (error != 0)
-		return error;
-	ASSERT(ip != NULL);
 
 	VFS_I(ip)->i_mode = args->mode;
 	set_nlink(VFS_I(ip), args->nlink);
@@ -181,11 +170,32 @@ libxfs_icreate(
 		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	}
 
-	/*
-	 * Log the new values stuffed into the inode.
-	 */
-	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, flags);
+}
+
+/*
+ * Initialise a newly allocated inode and return the in-core inode to the
+ * caller locked exclusively.
+ */
+static int
+libxfs_icreate(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	const struct xfs_icreate_args *args,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	error = libxfs_iget(mp, tp, ino, XFS_IGET_CREATE, &ip);
+	if (error)
+		return error;
+
+	ASSERT(ip != NULL);
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_inode_init(tp, args, ip);
+
 	*ipp = ip;
 	return 0;
 }
diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c
index 46d4515baba..2c941603986 100644
--- a/libxfs/xfs_ialloc.c
+++ b/libxfs/xfs_ialloc.c
@@ -1896,6 +1896,21 @@ xfs_dialloc(
 		}
 		return -ENOSPC;
 	}
+
+	/*
+	 * Protect against obviously corrupt allocation btree records. Later
+	 * xfs_iget checks will catch re-allocation of other active in-memory
+	 * and on-disk inodes. If we don't catch reallocating the parent inode
+	 * here we will deadlock in xfs_iget() so we have to do these checks
+	 * first.
+	 */
+	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
+		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+				XFS_SICK_AG_INOBT);
+		return -EFSCORRUPTED;
+	}
+
 	*new_ino = ino;
 	return 0;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/28] libxfs: backport inode init code from the kernel
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 23:25   ` [PATCH 12/28] xfs: split new inode creation into two pieces Darrick J. Wong
@ 2023-12-31 23:25   ` Darrick J. Wong
  2023-12-31 23:25   ` [PATCH 14/28] libxfs: remove libxfs_dir_ialloc Darrick J. Wong
                     ` (14 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Reorganize the userspace inode initialization code to more closely resemble
its kernel counterpart.  This is preparation to hoist the initialization
routines to libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h  |    2 +
 include/xfs_mount.h  |    1 +
 libxfs/inode.c       |   80 +++++++++++++++++++++++++++++++++++++++-----------
 libxfs/libxfs_priv.h |    6 ++++
 4 files changed, 71 insertions(+), 18 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 04d971e744b..45d53d1eb00 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -398,4 +398,6 @@ extern void	libxfs_irele(struct xfs_inode *ip);
 
 #define XFS_DEFAULT_COWEXTSZ_HINT 32
 
+#define XFS_INHERIT_GID(pip)		(VFS_I(pip)->i_mode & S_ISGID)
+
 #endif /* __XFS_INODE_H__ */
diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index ec63c525fd4..da621f3b992 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -232,6 +232,7 @@ __XFS_UNSUPP_FEAT(ikeep)
 __XFS_UNSUPP_FEAT(swalloc)
 __XFS_UNSUPP_FEAT(small_inums)
 __XFS_UNSUPP_FEAT(readonly)
+__XFS_UNSUPP_FEAT(grpid)
 
 /* Operational mount state flags */
 #define XFS_OPSTATE_INODE32		0	/* inode32 allocator active */
diff --git a/libxfs/inode.c b/libxfs/inode.c
index b61ad0f9e09..3d92e888a16 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -31,7 +31,7 @@
 
 /* Propagate di_flags from a parent inode to a child inode. */
 static void
-xfs_inode_propagate_flags(
+xfs_inode_inherit_flags(
 	struct xfs_inode	*ip,
 	const struct xfs_inode	*pip)
 {
@@ -99,31 +99,47 @@ xfs_inode_init(
 	struct xfs_inode	*ip)
 {
 	struct xfs_inode	*pip = args->pip;
+	struct inode		*dir = pip ? VFS_I(pip) : NULL;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct inode		*inode = VFS_I(ip);
 	unsigned int		flags;
 	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
 					XFS_ICHGTIME_ACCESS;
 
-	VFS_I(ip)->i_mode = args->mode;
-	set_nlink(VFS_I(ip), args->nlink);
-	VFS_I(ip)->i_uid = args->uid;
+	set_nlink(inode, args->nlink);
+	inode->i_rdev = args->rdev;
 	ip->i_projid = args->prid;
 
-	if (pip && (VFS_I(pip)->i_mode & S_ISGID)) {
-		if (!(args->flags & XFS_ICREATE_ARGS_FORCE_GID))
-			VFS_I(ip)->i_gid = VFS_I(pip)->i_gid;
-		if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(args->mode))
-			VFS_I(ip)->i_mode |= S_ISGID;
-	} else
-		VFS_I(ip)->i_gid = args->gid;
+	if (dir && !(dir->i_mode & S_ISGID) &&
+	    xfs_has_grpid(mp)) {
+		inode->i_uid = args->uid;
+		inode->i_gid = dir->i_gid;
+		inode->i_mode = args->mode;
+	} else {
+		inode_init_owner(args->idmap, inode, dir, args->mode);
+	}
+
+	/* struct copies */
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
+		inode->i_uid = args->uid;
+	else
+		ASSERT(uid_eq(inode->i_uid, args->uid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
+		inode->i_gid = args->gid;
+	else if (!pip || !XFS_INHERIT_GID(pip))
+		ASSERT(gid_eq(inode->i_gid, args->gid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
+		inode->i_mode = args->mode;
 
 	ip->i_disk_size = 0;
 	ip->i_df.if_nextents = 0;
 	ASSERT(ip->i_nblocks == 0);
+
 	ip->i_extsize = 0;
 	ip->i_diflags = 0;
+
 	if (xfs_has_v3inodes(ip->i_mount)) {
 		VFS_I(ip)->i_version = 1;
-		ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
 		ip->i_cowextsize = 0;
 		times |= XFS_ICHGTIME_CREATE;
 	}
@@ -138,12 +154,11 @@ xfs_inode_init(
 	case S_IFBLK:
 		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
 		flags |= XFS_ILOG_DEV;
-		VFS_I(ip)->i_rdev = args->rdev;
 		break;
 	case S_IFREG:
 	case S_IFDIR:
 		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
-			xfs_inode_propagate_flags(ip, pip);
+			xfs_inode_inherit_flags(ip, pip);
 		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
 			xfs_inode_inherit_flags2(ip, pip);
 		/* FALLTHROUGH */
@@ -165,7 +180,8 @@ xfs_inode_init(
 	 * this saves us from needing to run a separate transaction to set the
 	 * fork offset in the immediate future.
 	 */
-	if (xfs_has_parent(tp->t_mountp) && xfs_has_attr(tp->t_mountp)) {
+	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
+	    xfs_has_attr(mp)) {
 		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
 		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	}
@@ -293,15 +309,15 @@ libxfs_dir_ialloc(
 		.nlink		= nlink,
 		.rdev		= rdev,
 		.mode		= mode,
+		.flags		= XFS_ICREATE_ARGS_FORCE_UID |
+				  XFS_ICREATE_ARGS_FORCE_GID |
+				  XFS_ICREATE_ARGS_FORCE_MODE,
 	};
 	struct xfs_inode	*ip;
 	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
 	xfs_ino_t		ino;
 	int			error;
 
-	if (cr->cr_flags & CRED_FORCE_GID)
-		args.flags |= XFS_ICREATE_ARGS_FORCE_GID;
-
 	if (dp && xfs_has_parent(dp->i_mount))
 		args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
 
@@ -361,6 +377,7 @@ libxfs_iget(
 	VFS_I(ip)->i_count = 1;
 	ip->i_ino = ino;
 	ip->i_mount = mp;
+	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
 	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
 	spin_lock_init(&VFS_I(ip)->i_lock);
 
@@ -442,3 +459,30 @@ libxfs_irele(
 		kmem_cache_free(xfs_inode_cache, ip);
 	}
 }
+
+static inline void inode_fsuid_set(struct inode *inode,
+				   struct mnt_idmap *idmap)
+{
+	inode->i_uid = make_kuid(0);
+}
+
+static inline void inode_fsgid_set(struct inode *inode,
+				   struct mnt_idmap *idmap)
+{
+	inode->i_gid = make_kgid(0);
+}
+
+void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
+		      const struct inode *dir, umode_t mode)
+{
+	inode_fsuid_set(inode, idmap);
+	if (dir && dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+
+		/* Directories are special, and always inherit S_ISGID */
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode_fsgid_set(inode, idmap);
+	inode->i_mode = mode;
+}
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 90149b60c57..1609a8fd03f 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -220,6 +220,12 @@ static inline bool WARN_ON(bool expr) {
 	(inode)->i_version = (version);	\
 } while (0)
 
+struct inode;
+struct mnt_idmap;
+
+void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
+		      const struct inode *dir, umode_t mode);
+
 #define __must_check	__attribute__((__warn_unused_result__))
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/28] libxfs: remove libxfs_dir_ialloc
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 23:25   ` [PATCH 13/28] libxfs: backport inode init code from the kernel Darrick J. Wong
@ 2023-12-31 23:25   ` Darrick J. Wong
  2023-12-31 23:26   ` [PATCH 15/28] libxfs: implement get_random_u32 Darrick J. Wong
                     ` (13 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:25 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

This function no longer exists in the kernel, and it's not really needed
in userspace either.  There are two users of it: repair and mkfs.
Repair passes in zeroed cred and fsxattr structures so it can call
libxfs_dialloc and libxfs_icreate directly.  For mkfs we'll move the
guts of libxfs_dir_ialloc into proto.c as a creatproto function that
takes care of all that, and move struct cred to mkfs since it's now the
only user.

This gets us ready to hoist the rest of the inode initialization code to
libxfs for metadata directories.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/iunlink.c             |   18 +++++++--
 include/xfs_inode.h      |   16 ++------
 libxfs/inode.c           |   86 ++++++++++-------------------------------
 libxfs/libxfs_api_defs.h |    1 
 mkfs/proto.c             |   97 +++++++++++++++++++++++++++++++++++++---------
 repair/phase6.c          |   50 +++++++++++++-----------
 6 files changed, 147 insertions(+), 121 deletions(-)


diff --git a/db/iunlink.c b/db/iunlink.c
index d87562e3b0a..af452d028bd 100644
--- a/db/iunlink.c
+++ b/db/iunlink.c
@@ -309,10 +309,15 @@ static int
 create_unlinked(
 	struct xfs_mount	*mp)
 {
-	struct cred		cr = { };
-	struct fsxattr		fsx = { };
+	struct xfs_icreate_args	args = {
+		.mode		= S_IFREG | 0600,
+		.flags		= XFS_ICREATE_ARGS_FORCE_UID |
+				  XFS_ICREATE_ARGS_FORCE_GID |
+				  XFS_ICREATE_ARGS_FORCE_MODE,
+	};
 	struct xfs_inode	*ip;
 	struct xfs_trans	*tp;
+	xfs_ino_t		ino;
 	unsigned int		resblks;
 	int			error;
 
@@ -324,8 +329,13 @@ create_unlinked(
 		return error;
 	}
 
-	error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG | 0600, 0, 0, &cr, &fsx,
-			&ip);
+	error = -libxfs_dialloc(&tp, 0, args.mode, &ino);
+	if (error) {
+		dbprintf(_("alloc inode: %s\n"), strerror(error));
+		goto out_cancel;
+	}
+
+	error = -libxfs_icreate(tp, ino, &args, &ip);
 	if (error) {
 		dbprintf(_("create inode: %s\n"), strerror(error));
 		goto out_cancel;
diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 45d53d1eb00..e0bf5dcc2c2 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -371,17 +371,6 @@ static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
 	return false;
 }
 
-/* Always set the child's GID to this value, even if the parent is setgid. */
-#define CRED_FORCE_GID	(1U << 0)
-struct cred {
-	uid_t		cr_uid;
-	gid_t		cr_gid;
-	unsigned int	cr_flags;
-};
-
-extern int	libxfs_dir_ialloc (struct xfs_trans **, struct xfs_inode *,
-				mode_t, nlink_t, xfs_dev_t, struct cred *,
-				struct fsxattr *, struct xfs_inode **);
 extern void	libxfs_trans_inode_alloc_buf (struct xfs_trans *,
 				struct xfs_buf *);
 
@@ -391,6 +380,11 @@ extern int	libxfs_iflush_int (struct xfs_inode *, struct xfs_buf *);
 
 void libxfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
 
+int libxfs_icreate(struct xfs_trans *tp, xfs_ino_t ino,
+		const struct xfs_icreate_args *args, struct xfs_inode **ipp);
+void libxfs_icreate_args_rootfile(struct xfs_icreate_args *args,
+		struct xfs_mount *mp, umode_t mode, bool init_xattrs);
+
 /* Inode Cache Interfaces */
 extern int	libxfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 				uint, struct xfs_inode **);
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 3d92e888a16..17bdc0bffb2 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -193,7 +193,7 @@ xfs_inode_init(
  * Initialise a newly allocated inode and return the in-core inode to the
  * caller locked exclusively.
  */
-static int
+int
 libxfs_icreate(
 	struct xfs_trans	*tp,
 	xfs_ino_t		ino,
@@ -216,6 +216,26 @@ libxfs_icreate(
 	return 0;
 }
 
+/* Set up inode attributes for newly created internal files. */
+void
+libxfs_icreate_args_rootfile(
+	struct xfs_icreate_args	*args,
+	struct xfs_mount	*mp,
+	umode_t			mode,
+	bool			init_xattrs)
+{
+	args->idmap = NULL;
+	args->uid = make_kuid(0);
+	args->gid = make_kgid(0);
+	args->prid = 0;
+	args->mode = mode;
+	args->flags = XFS_ICREATE_ARGS_FORCE_UID |
+		      XFS_ICREATE_ARGS_FORCE_GID |
+		      XFS_ICREATE_ARGS_FORCE_MODE;
+	if (init_xattrs)
+		args->flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
+}
+
 /*
  * Writes a modified inode's changes out to the inode's on disk home.
  * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
@@ -283,70 +303,6 @@ libxfs_iflush_int(
 	return 0;
 }
 
-/*
- * Wrapper around call to libxfs_ialloc. Takes care of committing and
- * allocating a new transaction as needed.
- *
- * Originally there were two copies of this code - one in mkfs, the
- * other in repair - now there is just the one.
- */
-int
-libxfs_dir_ialloc(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp,
-	mode_t			mode,
-	nlink_t			nlink,
-	xfs_dev_t		rdev,
-	struct cred		*cr,
-	struct fsxattr		*fsx,
-	struct xfs_inode	**ipp)
-{
-	struct xfs_icreate_args	args = {
-		.pip		= dp,
-		.uid		= make_kuid(cr->cr_uid),
-		.gid		= make_kgid(cr->cr_gid),
-		.prid		= dp ? libxfs_get_initial_prid(dp) : 0,
-		.nlink		= nlink,
-		.rdev		= rdev,
-		.mode		= mode,
-		.flags		= XFS_ICREATE_ARGS_FORCE_UID |
-				  XFS_ICREATE_ARGS_FORCE_GID |
-				  XFS_ICREATE_ARGS_FORCE_MODE,
-	};
-	struct xfs_inode	*ip;
-	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
-	xfs_ino_t		ino;
-	int			error;
-
-	if (dp && xfs_has_parent(dp->i_mount))
-		args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
-
-	/*
-	 * Call the space management code to pick the on-disk inode to be
-	 * allocated.
-	 */
-	error = xfs_dialloc(tpp, parent_ino, mode, &ino);
-	if (error)
-		return error;
-
-	error = libxfs_icreate(*tpp, ino, &args, ipp);
-	if (error || dp)
-		return error;
-
-	/* If there is no parent dir, initialize the file from fsxattr data. */
-	ip = *ipp;
-	ip->i_projid = fsx->fsx_projid;
-	ip->i_extsize = fsx->fsx_extsize;
-	ip->i_diflags = xfs_flags2diflags(ip, fsx->fsx_xflags);
-
-	if (xfs_has_v3inodes(ip->i_mount)) {
-		ip->i_diflags2 = xfs_flags2diflags2(ip, fsx->fsx_xflags);
-		ip->i_cowextsize = fsx->fsx_cowextsize;
-	}
-	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-	return 0;
-}
-
 /*
  * Inode cache stubs.
  */
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 575bf45a211..7e4d4c008cf 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -101,6 +101,7 @@
 #define xfs_da_shrink_inode		libxfs_da_shrink_inode
 #define xfs_defer_cancel		libxfs_defer_cancel
 #define xfs_defer_finish		libxfs_defer_finish
+#define xfs_dialloc			libxfs_dialloc
 #define xfs_dinode_calc_crc		libxfs_dinode_calc_crc
 #define xfs_dinode_good_version		libxfs_dinode_good_version
 #define xfs_dinode_verify		libxfs_dinode_verify
diff --git a/mkfs/proto.c b/mkfs/proto.c
index cc06bdfaf57..bb262390536 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -402,6 +402,68 @@ newpptr(
 	return ret;
 }
 
+struct cred {
+	uid_t		cr_uid;
+	gid_t		cr_gid;
+};
+
+static int
+creatproto(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp,
+	mode_t			mode,
+	nlink_t			nlink,
+	xfs_dev_t		rdev,
+	struct cred		*cr,
+	struct fsxattr		*fsx,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_icreate_args	args = {
+		.pip		= dp,
+		.uid		= make_kuid(cr->cr_uid),
+		.gid		= make_kgid(cr->cr_gid),
+		.prid		= dp ? libxfs_get_initial_prid(dp) : 0,
+		.nlink		= nlink,
+		.rdev		= rdev,
+		.mode		= mode,
+		.flags		= XFS_ICREATE_ARGS_FORCE_UID |
+				  XFS_ICREATE_ARGS_FORCE_GID |
+				  XFS_ICREATE_ARGS_FORCE_MODE,
+	};
+	struct xfs_inode	*ip;
+	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
+	xfs_ino_t		ino;
+	int			error;
+
+	if (dp && xfs_has_parent(dp->i_mount))
+		args.flags |= XFS_ICREATE_ARGS_INIT_XATTRS;
+
+	/*
+	 * Call the space management code to pick the on-disk inode to be
+	 * allocated.
+	 */
+	error = -libxfs_dialloc(tpp, parent_ino, mode, &ino);
+	if (error)
+		return error;
+
+	error = -libxfs_icreate(*tpp, ino, &args, ipp);
+	if (error || dp)
+		return error;
+
+	/* If there is no parent dir, initialize the file from fsxattr data. */
+	ip = *ipp;
+	ip->i_projid = fsx->fsx_projid;
+	ip->i_extsize = fsx->fsx_extsize;
+	ip->i_diflags = xfs_flags2diflags(ip, fsx->fsx_xflags);
+
+	if (xfs_has_v3inodes(ip->i_mount)) {
+		ip->i_diflags2 = xfs_flags2diflags2(ip, fsx->fsx_xflags);
+		ip->i_cowextsize = fsx->fsx_cowextsize;
+	}
+	libxfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
 static void
 parseproto(
 	xfs_mount_t	*mp,
@@ -502,7 +564,6 @@ parseproto(
 	mode |= val;
 	creds.cr_uid = (int)getnum(getstr(pp), 0, 0, false);
 	creds.cr_gid = (int)getnum(getstr(pp), 0, 0, false);
-	creds.cr_flags = CRED_FORCE_GID;
 	xname.name = (unsigned char *)name;
 	xname.len = name ? strlen(name) : 0;
 	xname.type = 0;
@@ -512,8 +573,8 @@ parseproto(
 		buf = newregfile(pp, &len);
 		tp = getres(mp, XFS_B_TO_FSB(mp, len));
 		ppargs = newpptr(mp);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFREG, 1, 0,
-					   &creds, fsxp, &ip);
+		error = creatproto(&tp, pip, mode | S_IFREG, 1, 0, &creds,
+				fsxp, &ip);
 		if (error)
 			fail(_("Inode allocation failed"), error);
 		writefile(tp, ip, buf, len);
@@ -536,8 +597,8 @@ parseproto(
 		}
 		tp = getres(mp, XFS_B_TO_FSB(mp, llen));
 		ppargs = newpptr(mp);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFREG, 1, 0,
-					  &creds, fsxp, &ip);
+		error = creatproto(&tp, pip, mode | S_IFREG, 1, 0, &creds,
+				fsxp, &ip);
 		if (error)
 			fail(_("Inode pre-allocation failed"), error);
 
@@ -559,7 +620,7 @@ parseproto(
 		ppargs = newpptr(mp);
 		majdev = getnum(getstr(pp), 0, 0, false);
 		mindev = getnum(getstr(pp), 0, 0, false);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFBLK, 1,
+		error = creatproto(&tp, pip, mode | S_IFBLK, 1,
 				IRIX_MKDEV(majdev, mindev), &creds, fsxp, &ip);
 		if (error) {
 			fail(_("Inode allocation failed"), error);
@@ -575,7 +636,7 @@ parseproto(
 		ppargs = newpptr(mp);
 		majdev = getnum(getstr(pp), 0, 0, false);
 		mindev = getnum(getstr(pp), 0, 0, false);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFCHR, 1,
+		error = creatproto(&tp, pip, mode | S_IFCHR, 1,
 				IRIX_MKDEV(majdev, mindev), &creds, fsxp, &ip);
 		if (error)
 			fail(_("Inode allocation failed"), error);
@@ -588,8 +649,8 @@ parseproto(
 	case IF_FIFO:
 		tp = getres(mp, 0);
 		ppargs = newpptr(mp);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFIFO, 1, 0,
-				&creds, fsxp, &ip);
+		error = creatproto(&tp, pip, mode | S_IFIFO, 1, 0, &creds,
+				fsxp, &ip);
 		if (error)
 			fail(_("Inode allocation failed"), error);
 		libxfs_trans_ijoin(tp, pip, 0);
@@ -601,8 +662,8 @@ parseproto(
 		len = (int)strlen(buf);
 		tp = getres(mp, XFS_B_TO_FSB(mp, len));
 		ppargs = newpptr(mp);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFLNK, 1, 0,
-				&creds, fsxp, &ip);
+		error = creatproto(&tp, pip, mode | S_IFLNK, 1, 0, &creds,
+				fsxp, &ip);
 		if (error)
 			fail(_("Inode allocation failed"), error);
 		writesymlink(tp, ip, buf, len);
@@ -612,8 +673,8 @@ parseproto(
 		break;
 	case IF_DIRECTORY:
 		tp = getres(mp, 0);
-		error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFDIR, 1, 0,
-				&creds, fsxp, &ip);
+		error = creatproto(&tp, pip, mode | S_IFDIR, 1, 0, &creds,
+				fsxp, &ip);
 		if (error)
 			fail(_("Inode allocation failed"), error);
 		libxfs_bumplink(tp, ip);		/* account for . */
@@ -711,14 +772,14 @@ rtinit(
 
 	memset(&creds, 0, sizeof(creds));
 	memset(&fsxattrs, 0, sizeof(fsxattrs));
-	error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
-					&creds, &fsxattrs, &rbmip);
+	error = creatproto(&tp, NULL, S_IFREG, 1, 0, &creds, &fsxattrs,
+			&rbmip);
 	if (error) {
 		fail(_("Realtime bitmap inode allocation failed"), error);
 	}
 	/*
 	 * Do our thing with rbmip before allocating rsumip,
-	 * because the next call to ialloc() may
+	 * because the next call to createproto may
 	 * commit the transaction in which rbmip was allocated.
 	 */
 	mp->m_sb.sb_rbmino = rbmip->i_ino;
@@ -728,8 +789,8 @@ rtinit(
 	libxfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
 	libxfs_log_sb(tp);
 	mp->m_rbmip = rbmip;
-	error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
-					&creds, &fsxattrs, &rsumip);
+	error = creatproto(&tp, NULL, S_IFREG, 1, 0, &creds, &fsxattrs,
+			&rsumip);
 	if (error) {
 		fail(_("Realtime summary inode allocation failed"), error);
 	}
diff --git a/repair/phase6.c b/repair/phase6.c
index 5e95dabbe09..51e1ea92dd6 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -20,8 +20,6 @@
 #include "versions.h"
 #include "repair/pptr.h"
 
-static struct cred		zerocr;
-static struct fsxattr 		zerofsx;
 static xfs_ino_t		orphanage_ino;
 
 /*
@@ -890,26 +888,32 @@ mk_root_dir(xfs_mount_t *mp)
  * orphanage name == lost+found
  */
 static xfs_ino_t
-mk_orphanage(xfs_mount_t *mp)
+mk_orphanage(
+	struct xfs_mount	*mp)
 {
-	xfs_ino_t	ino;
-	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
-	xfs_inode_t	*pip;
-	ino_tree_node_t	*irec;
-	int		ino_offset = 0;
-	int		i;
-	int		error;
-	const int	mode = 0755;
-	int		nres;
-	struct xfs_name	xname;
-	struct xfs_parent_args *ppargs;
+	struct xfs_icreate_args	args = {
+		.nlink		= 2,
+	};
+	struct xfs_trans	*tp;
+	struct xfs_inode	*ip;
+	struct xfs_inode	*pip;
+	struct ino_tree_node	*irec;
+	xfs_ino_t		ino;
+	int			ino_offset = 0;
+	int			i;
+	int			error;
+	int			nres;
+	const umode_t		mode = S_IFDIR | 0755;
+	struct xfs_name		xname;
+	struct xfs_parent_args	*ppargs;
 
 	i = -libxfs_parent_start(mp, &ppargs);
 	if (i)
 		do_error(_("%d - couldn't allocate parent pointer for %s\n"),
 			i, ORPHANAGE);
 
+	libxfs_icreate_args_rootfile(&args, mp, mode, xfs_has_parent(mp));
+
 	/*
 	 * check for an existing lost+found first, if it exists, return
 	 * its inode. Otherwise, we can create it. Bad lost+found inodes
@@ -921,6 +925,7 @@ mk_orphanage(xfs_mount_t *mp)
 		do_error(_("%d - couldn't iget root inode to obtain %s\n"),
 			i, ORPHANAGE);
 
+	args.pip = pip;
 	xname.name = (unsigned char *)ORPHANAGE;
 	xname.len = strlen(ORPHANAGE);
 	xname.type = XFS_DIR3_FT_DIR;
@@ -945,14 +950,15 @@ mk_orphanage(xfs_mount_t *mp)
 		do_error(_("%d - couldn't iget root inode to make %s\n"),
 			i, ORPHANAGE);*/
 
-	error = -libxfs_dir_ialloc(&tp, pip, mode|S_IFDIR,
-					1, 0, &zerocr, &zerofsx, &ip);
-	if (error) {
+	error = -libxfs_dialloc(&tp, mp->m_sb.sb_rootino, mode, &ino);
+	if (error)
 		do_error(_("%s inode allocation failed %d\n"),
 			ORPHANAGE, error);
-	}
-	libxfs_bumplink(tp, ip);		/* account for . */
-	ino = ip->i_ino;
+
+	error = -libxfs_icreate(tp, ino, &args, &ip);
+	if (error)
+		do_error(_("%s inode initialization failed %d\n"),
+			ORPHANAGE, error);
 
 	irec = find_inode_rec(mp,
 			XFS_INO_TO_AGNO(mp, ino),
@@ -3331,8 +3337,6 @@ phase6(xfs_mount_t *mp)
 
 	parent_ptr_init(mp);
 
-	memset(&zerocr, 0, sizeof(struct cred));
-	memset(&zerofsx, 0, sizeof(struct fsxattr));
 	orphanage_ino = 0;
 
 	do_log(_("Phase 6 - check inode connectivity...\n"));


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/28] libxfs: implement get_random_u32
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 23:25   ` [PATCH 14/28] libxfs: remove libxfs_dir_ialloc Darrick J. Wong
@ 2023-12-31 23:26   ` Darrick J. Wong
  2023-12-31 23:26   ` [PATCH 16/28] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
                     ` (12 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Actually query the kernel for some random bytes instead of returning
zero, if that's possible.  The most noticeable effect of this is that
mkfs will now create the rtbitmap file, the rtsummary file, and children
of the root directory with a nonzero generation.  Apparently xfsdump
requires that the root directory have a generation number of zero.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 configure.ac          |    1 +
 include/builddefs.in  |    1 +
 libxfs/Makefile       |    3 +++
 libxfs/libxfs_priv.h  |   11 +++++++----
 libxfs/util.c         |   19 +++++++++++++++++++
 m4/package_libcdev.m4 |   15 +++++++++++++++
 mkfs/proto.c          |    3 +++
 7 files changed, 49 insertions(+), 4 deletions(-)


diff --git a/configure.ac b/configure.ac
index 38b62619a7a..3b36d769eac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -194,6 +194,7 @@ AC_HAVE_MAP_SYNC
 AC_HAVE_DEVMAPPER
 AC_HAVE_MALLINFO
 AC_HAVE_MALLINFO2
+AC_HAVE_GETRANDOM_NONBLOCK
 AC_PACKAGE_WANT_ATTRIBUTES_H
 AC_HAVE_LIBATTR
 if test "$enable_scrub" = "yes"; then
diff --git a/include/builddefs.in b/include/builddefs.in
index daac1b5d18a..6668e9bbe8b 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -120,6 +120,7 @@ HAVE_MAP_SYNC = @have_map_sync@
 HAVE_DEVMAPPER = @have_devmapper@
 HAVE_MALLINFO = @have_mallinfo@
 HAVE_MALLINFO2 = @have_mallinfo2@
+HAVE_GETRANDOM_NONBLOCK = @have_getrandom_nonblock@
 HAVE_LIBATTR = @have_libattr@
 HAVE_LIBICU = @have_libicu@
 HAVE_OPENAT = @have_openat@
diff --git a/libxfs/Makefile b/libxfs/Makefile
index 6007353ade2..a251322d0f6 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -141,6 +141,9 @@ endif
 ifeq ($(HAVE_MKOSTEMP_CLOEXEC),yes)
 	LCFLAGS += -DHAVE_MKOSTEMP_CLOEXEC
 endif
+ifeq ($(HAVE_GETRANDOM_NONBLOCK),yes)
+	LCFLAGS += -DHAVE_GETRANDOM_NONBLOCK
+endif
 
 FCFLAGS = -I.
 
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 1609a8fd03f..e9c6bbf16ee 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -64,6 +64,9 @@
 #include "libfrog/crc32c.h"
 
 #include <sys/xattr.h>
+#ifdef HAVE_GETRANDOM_NONBLOCK
+#include <sys/random.h>
+#endif
 
 /* Zones used in libxfs allocations that aren't in shared header files */
 extern struct kmem_cache *xfs_buf_item_cache;
@@ -207,11 +210,11 @@ static inline bool WARN_ON(bool expr) {
 #define percpu_counter_read_positive(x)	((*x) > 0 ? (*x) : 0)
 #define percpu_counter_sum(x)		(*x)
 
-/*
- * get_random_u32 is used for di_gen inode allocation, it must be zero for
- * libxfs or all sorts of badness can occur!
- */
+#ifdef HAVE_GETRANDOM_NONBLOCK
+uint32_t get_random_u32(void);
+#else
 #define get_random_u32()	(0)
+#endif
 
 #define PAGE_SIZE		getpagesize()
 
diff --git a/libxfs/util.c b/libxfs/util.c
index 7aa92c0e4a6..a3f3ad29933 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -462,3 +462,22 @@ void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork) { }
 void xfs_da_mark_sick(struct xfs_da_args *args) { }
 void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask) { }
 void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask) { }
+
+#ifdef HAVE_GETRANDOM_NONBLOCK
+uint32_t
+get_random_u32(void)
+{
+	uint32_t	ret;
+	ssize_t		sz;
+
+	/*
+	 * Try to extract a u32 of randomness from /dev/urandom.  If that
+	 * fails, fall back to returning zero like we used to do.
+	 */
+	sz = getrandom(&ret, sizeof(ret), GRND_NONBLOCK);
+	if (sz != sizeof(ret))
+		return 0;
+
+	return ret;
+}
+#endif
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index c81a7a031d2..2228697a7a3 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -440,6 +440,21 @@ test = mallinfo2();
     AC_SUBST(have_mallinfo2)
   ])
 
+#
+# Check if we have a getrandom syscall with a GRND_NONBLOCK flag
+#
+AC_DEFUN([AC_HAVE_GETRANDOM_NONBLOCK],
+  [ AC_MSG_CHECKING([for getrandom and GRND_NONBLOCK])
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+#include <sys/random.h>
+    ]], [[
+         unsigned int moo;
+         return getrandom(&moo, sizeof(moo), GRND_NONBLOCK);
+    ]])],[have_getrandom_nonblock=yes
+       AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)])
+    AC_SUBST(have_getrandom_nonblock)
+  ])
+
 #
 # Check if we have a openat call
 #
diff --git a/mkfs/proto.c b/mkfs/proto.c
index bb262390536..f9b0f837ed9 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -460,6 +460,9 @@ creatproto(
 		ip->i_diflags2 = xfs_flags2diflags2(ip, fsx->fsx_xflags);
 		ip->i_cowextsize = fsx->fsx_cowextsize;
 	}
+	/* xfsdump breaks if the root dir has a nonzero generation */
+	if (!dp)
+		VFS_I(ip)->i_generation = 0;
 	libxfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
 	return 0;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/28] xfs: hoist new inode initialization functions to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 23:26   ` [PATCH 15/28] libxfs: implement get_random_u32 Darrick J. Wong
@ 2023-12-31 23:26   ` Darrick J. Wong
  2023-12-31 23:26   ` [PATCH 17/28] xfs: hoist xfs_iunlink " Darrick J. Wong
                     ` (11 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move all the code that initializes a new inode's attributes from the
icreate_args structure and the parent directory into libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h     |   20 +++++
 libxfs/inode.c          |  148 -----------------------------------
 libxfs/libxfs_priv.h    |    4 +
 libxfs/xfs_inode_util.c |  201 +++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h |   12 +++
 libxfs/xfs_shared.h     |    8 --
 repair/phase6.c         |    3 -
 7 files changed, 239 insertions(+), 157 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index e0bf5dcc2c2..2af5f79e31e 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -187,6 +187,20 @@ static inline struct timespec64 inode_set_ctime_current(struct inode *inode)
 	return now;
 }
 
+static inline void
+inode_fsuid_set(
+	struct inode		*inode,
+	struct mnt_idmap	*idmap)
+{
+	inode->i_uid = make_kuid(0);
+}
+
+static inline void
+inode_set_iversion(struct inode *inode, uint64_t version)
+{
+	inode->i_version = version;
+}
+
 typedef struct xfs_inode {
 	struct cache_node	i_node;
 	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
@@ -394,4 +408,10 @@ extern void	libxfs_irele(struct xfs_inode *ip);
 
 #define XFS_INHERIT_GID(pip)		(VFS_I(pip)->i_mode & S_ISGID)
 
+#define xfs_inherit_noatime	(false)
+#define xfs_inherit_nodump	(false)
+#define xfs_inherit_sync	(false)
+#define xfs_inherit_nosymlinks	(false)
+#define xfs_inherit_nodefrag	(false)
+
 #endif /* __XFS_INODE_H__ */
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 17bdc0bffb2..ec005bbbf5e 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -29,50 +29,6 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
 
-/* Propagate di_flags from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags(
-	struct xfs_inode	*ip,
-	const struct xfs_inode	*pip)
-{
-	unsigned int		di_flags = 0;
-	umode_t			mode = VFS_I(ip)->i_mode;
-
-	if ((mode & S_IFMT) == S_IFDIR) {
-		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
-			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-			ip->i_extsize = pip->i_extsize;
-		}
-	} else {
-		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
-		    xfs_has_realtime(ip->i_mount))
-			di_flags |= XFS_DIFLAG_REALTIME;
-		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
-			di_flags |= XFS_DIFLAG_EXTSIZE;
-			ip->i_extsize = pip->i_extsize;
-		}
-	}
-	if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
-		di_flags |= XFS_DIFLAG_PROJINHERIT;
-	ip->i_diflags |= di_flags;
-}
-
-/* Propagate di_flags2 from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags2(
-	struct xfs_inode	*ip,
-	const struct xfs_inode	*pip)
-{
-	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
-		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
-		ip->i_cowextsize = pip->i_cowextsize;
-	}
-	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
-		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
-}
-
 /*
  * Increment the link count on an inode & log the change.
  */
@@ -91,104 +47,6 @@ libxfs_bumplink(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
-/* Initialise an inode's attributes. */
-static void
-xfs_inode_init(
-	struct xfs_trans	*tp,
-	const struct xfs_icreate_args *args,
-	struct xfs_inode	*ip)
-{
-	struct xfs_inode	*pip = args->pip;
-	struct inode		*dir = pip ? VFS_I(pip) : NULL;
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct inode		*inode = VFS_I(ip);
-	unsigned int		flags;
-	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
-					XFS_ICHGTIME_ACCESS;
-
-	set_nlink(inode, args->nlink);
-	inode->i_rdev = args->rdev;
-	ip->i_projid = args->prid;
-
-	if (dir && !(dir->i_mode & S_ISGID) &&
-	    xfs_has_grpid(mp)) {
-		inode->i_uid = args->uid;
-		inode->i_gid = dir->i_gid;
-		inode->i_mode = args->mode;
-	} else {
-		inode_init_owner(args->idmap, inode, dir, args->mode);
-	}
-
-	/* struct copies */
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
-		inode->i_uid = args->uid;
-	else
-		ASSERT(uid_eq(inode->i_uid, args->uid));
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
-		inode->i_gid = args->gid;
-	else if (!pip || !XFS_INHERIT_GID(pip))
-		ASSERT(gid_eq(inode->i_gid, args->gid));
-	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
-		inode->i_mode = args->mode;
-
-	ip->i_disk_size = 0;
-	ip->i_df.if_nextents = 0;
-	ASSERT(ip->i_nblocks == 0);
-
-	ip->i_extsize = 0;
-	ip->i_diflags = 0;
-
-	if (xfs_has_v3inodes(ip->i_mount)) {
-		VFS_I(ip)->i_version = 1;
-		ip->i_cowextsize = 0;
-		times |= XFS_ICHGTIME_CREATE;
-	}
-
-	xfs_trans_ichgtime(tp, ip, times);
-
-	flags = XFS_ILOG_CORE;
-	switch (args->mode & S_IFMT) {
-	case S_IFIFO:
-	case S_IFSOCK:
-	case S_IFCHR:
-	case S_IFBLK:
-		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
-		flags |= XFS_ILOG_DEV;
-		break;
-	case S_IFREG:
-	case S_IFDIR:
-		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
-			xfs_inode_inherit_flags(ip, pip);
-		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
-			xfs_inode_inherit_flags2(ip, pip);
-		/* FALLTHROUGH */
-	case S_IFLNK:
-		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-		ip->i_df.if_bytes = 0;
-		ip->i_df.if_u1.if_root = NULL;
-		break;
-	default:
-		ASSERT(0);
-	}
-
-	/*
-	 * If we need to create attributes immediately after allocating the
-	 * inode, initialise an empty attribute fork right now. We use the
-	 * default fork offset for attributes here as we don't know exactly what
-	 * size or how many attributes we might be adding. We can do this
-	 * safely here because we know the data fork is completely empty and
-	 * this saves us from needing to run a separate transaction to set the
-	 * fork offset in the immediate future.
-	 */
-	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
-	    xfs_has_attr(mp)) {
-		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
-		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
-	}
-
-	xfs_trans_log_inode(tp, ip, flags);
-}
-
 /*
  * Initialise a newly allocated inode and return the in-core inode to the
  * caller locked exclusively.
@@ -416,12 +274,6 @@ libxfs_irele(
 	}
 }
 
-static inline void inode_fsuid_set(struct inode *inode,
-				   struct mnt_idmap *idmap)
-{
-	inode->i_uid = make_kuid(0);
-}
-
 static inline void inode_fsgid_set(struct inode *inode,
 				   struct mnt_idmap *idmap)
 {
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index e9c6bbf16ee..8faf63a8a67 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -612,4 +612,8 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 /* xfs_inode.h */
 #define xfs_iflags_set(ip, flags)	do { } while (0)
 
+/* xfs_linux.h */
+#define irix_sgid_inherit		(0)
+#define vfsgid_in_group_p(...)		(false)
+
 #endif	/* __LIBXFS_INTERNAL_XFS_H__ */
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index 89fb58807a1..184da10db71 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -13,6 +13,10 @@
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_inode_util.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -133,3 +137,200 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 
 	return XFS_PROJID_DEFAULT;
 }
+
+/* Propagate di_flags from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	unsigned int		di_flags = 0;
+	xfs_failaddr_t		failaddr;
+	umode_t			mode = VFS_I(ip)->i_mode;
+
+	if (S_ISDIR(mode)) {
+		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+			ip->i_extsize = pip->i_extsize;
+		}
+		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
+	} else if (S_ISREG(mode)) {
+		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+		    xfs_has_realtime(ip->i_mount))
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+			ip->i_extsize = pip->i_extsize;
+		}
+	}
+	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
+	    xfs_inherit_noatime)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
+	    xfs_inherit_nodump)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
+	    xfs_inherit_sync)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
+	    xfs_inherit_nosymlinks)
+		di_flags |= XFS_DIFLAG_NOSYMLINKS;
+	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
+	    xfs_inherit_nodefrag)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+
+	ip->i_diflags |= di_flags;
+
+	/*
+	 * Inode verifiers on older kernels only check that the extent size
+	 * hint is an integer multiple of the rt extent size on realtime files.
+	 * They did not check the hint alignment on a directory with both
+	 * rtinherit and extszinherit flags set.  If the misaligned hint is
+	 * propagated from a directory into a new realtime file, new file
+	 * allocations will fail due to math errors in the rt allocator and/or
+	 * trip the verifiers.  Validate the hint settings in the new file so
+	 * that we don't let broken hints propagate.
+	 */
+	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+			VFS_I(ip)->i_mode, ip->i_diflags);
+	if (failaddr) {
+		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+				   XFS_DIFLAG_EXTSZINHERIT);
+		ip->i_extsize = 0;
+	}
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags2(
+	struct xfs_inode	*ip,
+	const struct xfs_inode	*pip)
+{
+	xfs_failaddr_t		failaddr;
+
+	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = pip->i_cowextsize;
+	}
+	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+	/* Don't let invalid cowextsize hints propagate. */
+	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+	if (failaddr) {
+		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = 0;
+	}
+}
+
+/* Initialise an inode's attributes. */
+void
+xfs_inode_init(
+	struct xfs_trans	*tp,
+	const struct xfs_icreate_args *args,
+	struct xfs_inode	*ip)
+{
+	struct xfs_inode	*pip = args->pip;
+	struct inode		*dir = pip ? VFS_I(pip) : NULL;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct inode		*inode = VFS_I(ip);
+	unsigned int		flags;
+	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
+					XFS_ICHGTIME_ACCESS;
+
+	set_nlink(inode, args->nlink);
+	inode->i_rdev = args->rdev;
+	ip->i_projid = args->prid;
+
+	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
+		inode_fsuid_set(inode, args->idmap);
+		inode->i_gid = dir->i_gid;
+		inode->i_mode = args->mode;
+	} else {
+		inode_init_owner(args->idmap, inode, dir, args->mode);
+	}
+
+	/*
+	 * If the group ID of the new file does not match the effective group
+	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
+	 * (and only if the irix_sgid_inherit compatibility variable is set).
+	 */
+	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+	    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
+		inode->i_mode &= ~S_ISGID;
+
+	/* struct copies */
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_UID)
+		inode->i_uid = args->uid;
+	else
+		ASSERT(uid_eq(inode->i_uid, args->uid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_GID)
+		inode->i_gid = args->gid;
+	else if (!pip || !XFS_INHERIT_GID(pip))
+		ASSERT(gid_eq(inode->i_gid, args->gid));
+	if (args->flags & XFS_ICREATE_ARGS_FORCE_MODE)
+		inode->i_mode = args->mode;
+
+	ip->i_disk_size = 0;
+	ip->i_df.if_nextents = 0;
+	ASSERT(ip->i_nblocks == 0);
+
+	ip->i_extsize = 0;
+	ip->i_diflags = 0;
+
+	if (xfs_has_v3inodes(mp)) {
+		inode_set_iversion(inode, 1);
+		ip->i_cowextsize = 0;
+		times |= XFS_ICHGTIME_CREATE;
+	}
+
+	xfs_trans_ichgtime(tp, ip, times);
+
+	flags = XFS_ILOG_CORE;
+	switch (args->mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+		flags |= XFS_ILOG_DEV;
+		break;
+	case S_IFREG:
+	case S_IFDIR:
+		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+			xfs_inode_inherit_flags(ip, pip);
+		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+			xfs_inode_inherit_flags2(ip, pip);
+		fallthrough;
+	case S_IFLNK:
+		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_bytes = 0;
+		ip->i_df.if_u1.if_root = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	/*
+	 * If we need to create attributes immediately after allocating the
+	 * inode, initialise an empty attribute fork right now. We use the
+	 * default fork offset for attributes here as we don't know exactly what
+	 * size or how many attributes we might be adding. We can do this
+	 * safely here because we know the data fork is completely empty and
+	 * this saves us from needing to run a separate transaction to set the
+	 * fork offset in the immediate future.
+	 */
+	if ((args->flags & XFS_ICREATE_ARGS_INIT_XATTRS) &&
+	    xfs_has_attr(mp)) {
+		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+	}
+
+	xfs_trans_log_inode(tp, ip, flags);
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index a494f7c4a3f..54d96e1aa9e 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -45,4 +45,16 @@ struct xfs_icreate_args {
 	uint16_t		flags;
 };
 
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
+void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
+
+void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
+		struct xfs_inode *ip);
+
 #endif /* __XFS_INODE_UTIL_H__ */
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index 1d327685f6e..2cecebe0188 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -130,14 +130,6 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_REFC_BTREE_REF	1
 #define	XFS_SSB_REF		0
 
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
-#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
-#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
-#define	XFS_ICHGTIME_ACCESS	0x8	/* last access timestamp */
-
 /* Computed inode geometry for the filesystem. */
 struct xfs_ino_geometry {
 	/* Maximum inode count in this filesystem. */
diff --git a/repair/phase6.c b/repair/phase6.c
index 51e1ea92dd6..13f2fb2290b 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -841,7 +841,8 @@ mk_root_dir(xfs_mount_t *mp)
 	}
 
 	/*
-	 * take care of the core -- initialization from xfs_ialloc()
+	 * take care of the core since we didn't call the libxfs ialloc function
+	 * (comment changed to avoid tangling xfs/437)
 	 */
 	reset_inode_fields(ip);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/28] xfs: hoist xfs_iunlink to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 23:26   ` [PATCH 16/28] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
@ 2023-12-31 23:26   ` Darrick J. Wong
  2023-12-31 23:26   ` [PATCH 18/28] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
                     ` (10 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move xfs_iunlink and xfs_iunlink_remove to libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h     |    1 
 include/xfs_trace.h     |    7 +
 libxfs/Makefile         |    2 
 libxfs/inode.c          |    2 
 libxfs/iunlink.c        |  163 +++++++++++++++++++++++++++
 libxfs/iunlink.h        |   24 ++++
 libxfs/libxfs_priv.h    |   28 +++++
 libxfs/xfs_inode_util.c |  281 +++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h |    4 +
 9 files changed, 512 insertions(+)
 create mode 100644 libxfs/iunlink.c
 create mode 100644 libxfs/iunlink.h


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 2af5f79e31e..a48cb7eaf77 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -228,6 +228,7 @@ typedef struct xfs_inode {
 
 	/* unlinked list pointers */
 	xfs_agino_t		i_next_unlinked;
+	xfs_agino_t		i_prev_unlinked;
 
 	xfs_extnum_t		i_cnextents;	/* # of extents in cow fork */
 	unsigned int		i_cformat;	/* format of cow fork */
diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index a8f3ecac7f6..7dcf88b7900 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -360,4 +360,11 @@
 
 #define trace_xlog_intent_recovery_failed(...)	((void) 0)
 
+#define trace_xfs_iunlink_update_bucket(...)	((void) 0)
+#define trace_xfs_iunlink_update_dinode(...)	((void) 0)
+#define trace_xfs_iunlink(...)			((void) 0)
+#define trace_xfs_iunlink_reload_next(...)	((void) 0)
+#define trace_xfs_iunlink_remove(...)		((void) 0)
+#define trace_xfs_iunlink_map_prev_fallback(...)	((void) 0)
+
 #endif /* __TRACE_H__ */
diff --git a/libxfs/Makefile b/libxfs/Makefile
index a251322d0f6..26781ceb913 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -25,6 +25,7 @@ HFILES = \
 	libxfs_io.h \
 	libxfs_api_defs.h \
 	init.h \
+	iunlink.h \
 	libxfs_priv.h \
 	linux-err.h \
 	topology.h \
@@ -70,6 +71,7 @@ CFILES = cache.c \
 	defer_item.c \
 	init.c \
 	inode.c \
+	iunlink.c \
 	kmem.c \
 	logitem.c \
 	rdwr.c \
diff --git a/libxfs/inode.c b/libxfs/inode.c
index ec005bbbf5e..73eecebae96 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -193,6 +193,8 @@ libxfs_iget(
 	ip->i_mount = mp;
 	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
 	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+	ip->i_next_unlinked = NULLAGINO;
+	ip->i_prev_unlinked = NULLAGINO;
 	spin_lock_init(&VFS_I(ip)->i_lock);
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
diff --git a/libxfs/iunlink.c b/libxfs/iunlink.c
new file mode 100644
index 00000000000..6d055453599
--- /dev/null
+++ b/libxfs/iunlink.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+
+#include "libxfs_priv.h"
+#include "libxfs.h"
+#include "libxfs_io.h"
+#include "init.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "iunlink.h"
+#include "xfs_trace.h"
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+	struct xfs_inode	*ip;
+	struct xfs_perag	*pag;
+	xfs_agino_t		next_agino;
+	xfs_agino_t		old_agino;
+};
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+	struct xfs_trans	*tp,
+	struct xfs_iunlink_item	*iup)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip = iup->ip;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*ibp;
+	int			offset;
+	int			error;
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+	if (error)
+		return error;
+	/*
+	 * Don't log the unlinked field on stale buffers as this may be the
+	 * transaction that frees the inode cluster and relogging the buffer
+	 * here will incorrectly remove the stale state.
+	 */
+	if (ibp->b_flags & LIBXFS_B_STALE)
+		goto out;
+
+	dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+	/* Make sure the old pointer isn't garbage. */
+	if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+				sizeof(*dip), __this_address);
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+			XFS_INO_TO_AGINO(mp, ip->i_ino),
+			be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+	dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+	offset = ip->i_imap.im_boffset +
+			offsetof(struct xfs_dinode, di_next_unlinked);
+
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_inode_buf(tp, ibp);
+	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+out:
+	xfs_trans_brelse(tp, ibp);
+	return error;
+}
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_iunlink_item	iup = {
+		.ip		= ip,
+		.pag		= pag,
+		.next_agino	= next_agino,
+		.old_agino	= ip->i_next_unlinked,
+	};
+
+	ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+	ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+	/*
+	 * Since we're updating a linked list, we should never find that the
+	 * current pointer is the same as the new value, unless we're
+	 * terminating the list.
+	 */
+	if (ip->i_next_unlinked == next_agino) {
+		if (next_agino != NULLAGINO)
+			return -EFSCORRUPTED;
+		return 0;
+	}
+
+	return xfs_iunlink_log_dinode(tp, &iup);
+}
+
+/*
+ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
+ * to @prev_agino.  Caller must hold the AGI to synchronize with other changes
+ * to the unlinked list.
+ */
+int
+xfs_iunlink_reload_next(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agibp,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_perag	*pag = agibp->b_pag;
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_inode	*next_ip = NULL;
+	xfs_ino_t		ino;
+	int			error;
+
+	ASSERT(next_agino != NULLAGINO);
+
+	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+	error = libxfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, &next_ip);
+	if (error)
+		return error;
+
+	/* If this is not an unlinked inode, something is very wrong. */
+	if (VFS_I(next_ip)->i_nlink != 0) {
+		error = -EFSCORRUPTED;
+		goto rele;
+	}
+
+	next_ip->i_prev_unlinked = prev_agino;
+	trace_xfs_iunlink_reload_next(next_ip);
+rele:
+	xfs_irele(next_ip);
+	return error;
+}
diff --git a/libxfs/iunlink.h b/libxfs/iunlink.h
new file mode 100644
index 00000000000..8d8032cf932
--- /dev/null
+++ b/libxfs/iunlink.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H	1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+static inline struct xfs_inode *
+xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino)
+{
+	return NULL;
+}
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+			struct xfs_perag *pag, xfs_agino_t next_agino);
+int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp,
+		xfs_agino_t prev_agino, xfs_agino_t next_agino);
+
+#endif	/* XFS_IUNLINK_ITEM_H */
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 8faf63a8a67..a457e127d8a 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -473,6 +473,8 @@ void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
 #define xfs_filestream_new_ag(ip,ag)		(0)
 #define xfs_filestream_select_ag(...)		(-ENOSYS)
 
+#define xfs_trans_inode_buf(tp, bp)		((void) 0)
+
 /* quota bits */
 #define xfs_trans_mod_dquot_byino(t,i,f,d)		((void) 0)
 #define xfs_trans_reserve_quota_nblks(t,i,b,n,f)	(0)
@@ -595,6 +597,32 @@ unsigned int hweight64(__u64 w);
 static inline int xfs_iunlink_init(struct xfs_perag *pag) { return 0; }
 static inline void xfs_iunlink_destroy(struct xfs_perag *pag) { }
 
+static inline xfs_agino_t
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	return NULLAGINO;
+}
+
+static inline int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	return 0;
+}
+
+static inline int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	return 0;
+}
+
 xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *mp,
 		xfs_agnumber_t agcount);
 
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index 184da10db71..a51d0342b98 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -17,6 +17,9 @@
 #include "xfs_ialloc.h"
 #include "xfs_health.h"
 #include "xfs_bmap.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "iunlink.h"
 
 uint16_t
 xfs_flags2diflags(
@@ -334,3 +337,281 @@ xfs_inode_init(
 
 	xfs_trans_log_inode(tp, ip, flags);
 }
+
+/*
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
+ *
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
+ */
+
+/*
+ * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode
+ * is not in cache.
+ */
+static int
+xfs_iunlink_update_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_inode	*ip;
+
+	/* No update necessary if we are at the end of the list. */
+	if (next_agino == NULLAGINO)
+		return 0;
+
+	ip = xfs_iunlink_lookup(pag, next_agino);
+	if (!ip)
+		return -ENOLINK;
+
+	ip->i_prev_unlinked = prev_agino;
+	return 0;
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	unsigned int		bucket_index,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		old_value;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
+
+	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
+			old_value, new_agino);
+
+	/*
+	 * We should never find the head of the list already set to the value
+	 * passed in because either we're adding or removing ourselves from the
+	 * head of the list.
+	 */
+	if (old_value == new_agino) {
+		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+	offset = offsetof(struct xfs_agi, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+}
+
+static int
+xfs_iunlink_insert_inode(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		next_agino;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
+
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
+	 */
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    !xfs_verify_agino_or_null(pag, next_agino)) {
+		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Update the prev pointer in the next inode to point back to this
+	 * inode.
+	 */
+	error = xfs_iunlink_update_backref(pag, agino, next_agino);
+	if (error == -ENOLINK)
+		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
+	if (error)
+		return error;
+
+	if (next_agino != NULLAGINO) {
+		/*
+		 * There is already another inode in the bucket, so point this
+		 * inode to the current head of the list.
+		 */
+		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+		if (error)
+			return error;
+		ip->i_next_unlinked = next_agino;
+	}
+
+	/* Point the head of the list to point to this inode. */
+	ip->i_prev_unlinked = NULLAGINO;
+	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI.  It will be pulled from this
+ * list when the inode is freed.
+ */
+int
+xfs_iunlink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_perag	*pag;
+	struct xfs_buf		*agibp;
+	int			error;
+
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(pag, tp, &agibp);
+	if (error)
+		goto out;
+
+	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
+static int
+xfs_iunlink_remove_inode(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	xfs_agino_t		head_agino;
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
+
+	trace_xfs_iunlink_remove(ip);
+
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
+	 */
+	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(pag, head_agino)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				agi, sizeof(*agi));
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Set our inode's next_unlinked pointer to NULL and then return
+	 * the old pointer value so that we can update whatever was previous
+	 * to us in the list to point to whatever was next in the list.
+	 */
+	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+	if (error)
+		return error;
+
+	/*
+	 * Update the prev pointer in the next inode to point back to previous
+	 * inode in the chain.
+	 */
+	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+			ip->i_next_unlinked);
+	if (error == -ENOLINK)
+		error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+				ip->i_next_unlinked);
+	if (error)
+		return error;
+
+	if (head_agino != agino) {
+		struct xfs_inode	*prev_ip;
+
+		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+		if (!prev_ip) {
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+			return -EFSCORRUPTED;
+		}
+
+		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+				ip->i_next_unlinked);
+		prev_ip->i_next_unlinked = ip->i_next_unlinked;
+	} else {
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+				ip->i_next_unlinked);
+	}
+
+	ip->i_next_unlinked = NULLAGINO;
+	ip->i_prev_unlinked = 0;
+	return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+int
+xfs_iunlink_remove(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	struct xfs_buf		*agibp;
+	int			error;
+
+	trace_xfs_iunlink_remove(ip);
+
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(pag, tp, &agibp);
+	if (error)
+		return error;
+
+	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index 54d96e1aa9e..0406401d21b 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -57,4 +57,8 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
 void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 		struct xfs_inode *ip);
 
+int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+		struct xfs_inode *ip);
+
 #endif /* __XFS_INODE_UTIL_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/28] xfs: hoist xfs_{bump,drop}link to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 23:26   ` [PATCH 17/28] xfs: hoist xfs_iunlink " Darrick J. Wong
@ 2023-12-31 23:26   ` Darrick J. Wong
  2023-12-31 23:27   ` [PATCH 19/28] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
                     ` (9 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:26 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move xfs_bumplink and xfs_droplink to libxfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/namei.c               |   23 +++++---------------
 include/xfs_inode.h      |    2 --
 libxfs/inode.c           |   18 ----------------
 libxfs/libxfs_api_defs.h |    1 +
 libxfs/libxfs_priv.h     |    1 +
 libxfs/xfs_inode_util.c  |   53 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h  |    2 ++
 7 files changed, 63 insertions(+), 37 deletions(-)


diff --git a/db/namei.c b/db/namei.c
index 196da7f90e9..e75179b2c67 100644
--- a/db/namei.c
+++ b/db/namei.c
@@ -1154,21 +1154,6 @@ unlink_help(void)
 	));
 }
 
-static void
-droplink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	libxfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	if (inode->i_nlink != XFS_NLINK_PINNED)
-		drop_nlink(VFS_I(ip));
-
-	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
 static int
 remove_child(
 	struct xfs_mount	*mp,
@@ -1218,13 +1203,17 @@ remove_child(
 
 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
 		/* drop ip's dotdot link to dp */
-		droplink(tp, dp);
+		error = -libxfs_droplink(tp, dp);
+		if (error)
+			goto out_trans;
 	} else {
 		libxfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	}
 
 	/* drop dp's link to ip */
-	droplink(tp, ip);
+	error = -libxfs_droplink(tp, ip);
+	if (error)
+		goto out_trans;
 
 	error = -libxfs_dir_removename(tp, dp, &xname, ip->i_ino, resblks);
 	if (error)
diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index a48cb7eaf77..5d7bc69e3ff 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -393,8 +393,6 @@ extern void	libxfs_trans_ichgtime(struct xfs_trans *,
 				struct xfs_inode *, int);
 extern int	libxfs_iflush_int (struct xfs_inode *, struct xfs_buf *);
 
-void libxfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
-
 int libxfs_icreate(struct xfs_trans *tp, xfs_ino_t ino,
 		const struct xfs_icreate_args *args, struct xfs_inode **ipp);
 void libxfs_icreate_args_rootfile(struct xfs_icreate_args *args,
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 73eecebae96..47c9b9d6bd7 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -29,24 +29,6 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
 
-/*
- * Increment the link count on an inode & log the change.
- */
-void
-libxfs_bumplink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	if (inode->i_nlink != XFS_NLINK_PINNED)
-		inc_nlink(inode);
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
 /*
  * Initialise a newly allocated inode and return the in-core inode to the
  * caller locked exclusively.
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 7e4d4c008cf..de59d6cdb5d 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -88,6 +88,7 @@
 #define xfs_buf_read_uncached		libxfs_buf_read_uncached
 #define xfs_buf_relse			libxfs_buf_relse
 #define xfs_buf_unlock			libxfs_buf_unlock
+#define xfs_bumplink			libxfs_bumplink
 #define xfs_bunmapi			libxfs_bunmapi
 #define xfs_bwrite			libxfs_bwrite
 #define xfs_calc_dquots_per_chunk	libxfs_calc_dquots_per_chunk
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index a457e127d8a..17e62c9fbb7 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -133,6 +133,7 @@ extern void cmn_err(int, char *, ...);
 enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC };
 
 #define xfs_info(mp,fmt,args...)	cmn_err(CE_CONT, _(fmt), ## args)
+#define xfs_info_ratelimited(mp,fmt,args...) cmn_err(CE_CONT, _(fmt), ## args)
 #define xfs_notice(mp,fmt,args...)	cmn_err(CE_NOTE, _(fmt), ## args)
 #define xfs_warn(mp,fmt,args...)	cmn_err((mp) ? CE_WARN : CE_WARN, _(fmt), ## args)
 #define xfs_err(mp,fmt,args...)		cmn_err(CE_ALERT, _(fmt), ## args)
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index a51d0342b98..8f053e3941c 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -615,3 +615,56 @@ xfs_iunlink_remove(
 
 	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
 }
+
+/*
+ * Decrement the link count on an inode & log the change.  If this causes the
+ * link count to go to zero, move the inode to AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
+ */
+int
+xfs_droplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	if (inode->i_nlink == 0) {
+		xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count dropped below zero.  Pinning link count.",
+				ip->i_ino);
+		set_nlink(inode, XFS_NLINK_PINNED);
+	}
+	if (inode->i_nlink != XFS_NLINK_PINNED)
+		drop_nlink(inode);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (inode->i_nlink)
+		return 0;
+
+	return xfs_iunlink(tp, ip);
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+void
+xfs_bumplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+	if (inode->i_nlink == XFS_NLINK_PINNED - 1)
+		xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count exceeded maximum.  Pinning link count.",
+				ip->i_ino);
+	if (inode->i_nlink != XFS_NLINK_PINNED)
+		inc_nlink(inode);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index 0406401d21b..a2a1796a72f 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -60,5 +60,7 @@ void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
 int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
 		struct xfs_inode *ip);
+int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
 
 #endif /* __XFS_INODE_UTIL_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/28] xfs: create libxfs helper to link a new inode into a directory
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 23:26   ` [PATCH 18/28] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
@ 2023-12-31 23:27   ` Darrick J. Wong
  2023-12-31 23:27   ` [PATCH 20/28] xfs: create libxfs helper to link an existing " Darrick J. Wong
                     ` (8 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to link a newly created inode into a
directory.  The upcoming metadata directory feature will need this to
create a metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_dir2.h |   12 ++++++++++++
 2 files changed, 60 insertions(+)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index b906f39e0fe..3b7c828fec1 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -18,6 +18,10 @@
 #include "xfs_errortag.h"
 #include "xfs_trace.h"
 #include "xfs_health.h"
+#include "xfs_shared.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_parent.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -781,3 +785,47 @@ xfs_dir2_compname(
 		return xfs_ascii_ci_compname(args, name, len);
 	return xfs_da_compname(args, name, len);
 }
+
+/*
+ * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip
+ * into @dp under the given @name.  If @ip is a directory, it will be
+ * initialized.  Both inodes must have the ILOCK held and the transaction must
+ * have sufficient blocks reserved.
+ */
+int
+xfs_dir_create_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+	if (error) {
+		ASSERT(error != -ENOSPC);
+		return error;
+	}
+
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			return error;
+
+		xfs_bumplink(tp, dp);
+	}
+
+	/*
+	 * If we have parent pointers, we need to add the attribute containing
+	 * the parent information now.
+	 */
+	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+}
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index ca1949ed4f5..71a8d8e8a8e 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -293,4 +293,16 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
 	return c;
 }
 
+struct xfs_parent_args;
+
+struct xfs_dir_update {
+	struct xfs_inode	*dp;
+	const struct xfs_name	*name;
+	struct xfs_inode	*ip;
+	struct xfs_parent_args	*ppargs;
+};
+
+int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
+
 #endif	/* __XFS_DIR2_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/28] xfs: create libxfs helper to link an existing inode into a directory
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 23:27   ` [PATCH 19/28] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
@ 2023-12-31 23:27   ` Darrick J. Wong
  2023-12-31 23:27   ` [PATCH 21/28] xfs: hoist inode free function to libxfs Darrick J. Wong
                     ` (7 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to link an existing inode into a directory.
The upcoming metadata directory feature will need this to create a
metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |   65 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 libxfs/xfs_dir2.h |    4 ++-
 2 files changed, 65 insertions(+), 4 deletions(-)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index 3b7c828fec1..b2f978e657c 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -22,6 +22,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_parent.h"
+#include "xfs_ag.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -562,9 +563,9 @@ xfs_dir_replace(
  */
 int
 xfs_dir_canenter(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name)		/* name of entry to add */
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name)		/* name of entry to add */
 {
 	return xfs_dir_createname(tp, dp, name, 0, 0);
 }
@@ -829,3 +830,61 @@ xfs_dir_create_child(
 	 */
 	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Given a directory @dp, an existing non-directory inode @ip, and a @name,
+ * link @ip into @dp under the given @name.  Both inodes must have the ILOCK
+ * held.
+ */
+int
+xfs_dir_add_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+	ASSERT(!S_ISDIR(VFS_I(ip)->i_mode));
+
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, name);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Handle initial link state of O_TMPFILE inode
+	 */
+	if (VFS_I(ip)->i_nlink == 0) {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+		error = xfs_iunlink_remove(tp, pag, ip);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
+	}
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+	if (error)
+		return error;
+
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	xfs_bumplink(tp, ip);
+
+	/*
+	 * If we have parent pointers, we now need to add the parent record to
+	 * the attribute fork of the inode. If this is the initial parent
+	 * attribute, we need to create it correctly, otherwise we can just add
+	 * the parent to the inode.
+	 */
+	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+}
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index 71a8d8e8a8e..1215d5d1ebe 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -61,7 +61,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
 				const struct xfs_name *name, xfs_ino_t inum,
 				xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				struct xfs_name *name);
+				const struct xfs_name *name);
 
 /*
  * Direct call from the bmap code, bypassing the generic directory layer.
@@ -304,5 +304,7 @@ struct xfs_dir_update {
 
 int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
+int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
 
 #endif	/* __XFS_DIR2_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/28] xfs: hoist inode free function to libxfs
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 23:27   ` [PATCH 20/28] xfs: create libxfs helper to link an existing " Darrick J. Wong
@ 2023-12-31 23:27   ` Darrick J. Wong
  2023-12-31 23:28   ` [PATCH 22/28] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
                     ` (6 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:27 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a libxfs helper function that marks an inode free on disk.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_inode_util.c |   50 +++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_util.h |    5 +++++
 2 files changed, 55 insertions(+)


diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index 8f053e3941c..19d26950db9 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -668,3 +668,53 @@ xfs_bumplink(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
+
+/* Mark an inode free on disk. */
+int
+xfs_dir_ifree(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	struct xfs_icluster	*xic)
+{
+	int			error;
+
+	/*
+	 * Free the inode first so that we guarantee that the AGI lock is going
+	 * to be taken before we remove the inode from the unlinked list. This
+	 * makes the AGI lock -> unlinked list modification order the same as
+	 * used in O_TMPFILE creation.
+	 */
+	error = xfs_difree(tp, pag, ip->i_ino, xic);
+	if (error)
+		return error;
+
+	error = xfs_iunlink_remove(tp, pag, ip);
+	if (error)
+		return error;
+
+	/*
+	 * Free any local-format data sitting around before we reset the
+	 * data fork to extents format.  Note that the attr fork data has
+	 * already been freed by xfs_attr_inactive.
+	 */
+	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		kmem_free(ip->i_df.if_u1.if_data);
+		ip->i_df.if_u1.if_data = NULL;
+		ip->i_df.if_bytes = 0;
+	}
+
+	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */
+	ip->i_diflags = 0;
+	ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
+	ip->i_forkoff = 0;		/* mark the attr fork not in use */
+	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+	/*
+	 * Bump the generation count so no one will be confused
+	 * by reincarnations of this inode.
+	 */
+	VFS_I(ip)->i_generation++;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/libxfs/xfs_inode_util.h b/libxfs/xfs_inode_util.h
index a2a1796a72f..521f044198c 100644
--- a/libxfs/xfs_inode_util.h
+++ b/libxfs/xfs_inode_util.h
@@ -6,6 +6,8 @@
 #ifndef	__XFS_INODE_UTIL_H__
 #define	__XFS_INODE_UTIL_H__
 
+struct xfs_icluster;
+
 uint16_t	xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags);
 uint64_t	xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
 uint32_t	xfs_dic2xflags(struct xfs_inode *ip);
@@ -57,6 +59,9 @@ void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
 void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
 		struct xfs_inode *ip);
 
+int xfs_dir_ifree(struct xfs_trans *tp, struct xfs_perag *pag,
+		struct xfs_inode *ip, struct xfs_icluster *xic);
+
 int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
 int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
 		struct xfs_inode *ip);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/28] xfs: create libxfs helper to remove an existing inode/name from a directory
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 23:27   ` [PATCH 21/28] xfs: hoist inode free function to libxfs Darrick J. Wong
@ 2023-12-31 23:28   ` Darrick J. Wong
  2023-12-31 23:28   ` [PATCH 23/28] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
                     ` (5 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to remove a (name, inode) entry from a
directory.  The upcoming metadata directory feature will need this to
create a metadata directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |   75 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_dir2.h |    2 +
 2 files changed, 77 insertions(+)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index b2f978e657c..43b7bc31166 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -888,3 +888,78 @@ xfs_dir_add_child(
 	 */
 	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip)
+ * entry from the directory.  Both inodes must have the ILOCK held.
+ */
+int
+xfs_dir_remove_child(
+	struct xfs_trans	*tp,
+	unsigned int		resblks,
+	struct xfs_dir_update	*du)
+{
+	struct xfs_inode	*dp = du->dp;
+	const struct xfs_name	*name = du->name;
+	struct xfs_inode	*ip = du->ip;
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_EXCL));
+
+	/*
+	 * If we're removing a directory perform some additional validation.
+	 */
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		ASSERT(VFS_I(ip)->i_nlink >= 2);
+		if (VFS_I(ip)->i_nlink != 2)
+			return -ENOTEMPTY;
+		if (!xfs_dir_isempty(ip))
+			return -ENOTEMPTY;
+
+		/* Drop the link from ip's "..".  */
+		error = xfs_droplink(tp, dp);
+		if (error)
+			return error;
+
+		/* Drop the "." link from ip to self.  */
+		error = xfs_droplink(tp, ip);
+		if (error)
+			return error;
+
+		/*
+		 * Point the unlinked child directory's ".." entry to the root
+		 * directory to eliminate back-references to inodes that may
+		 * get freed before the child directory is closed.  If the fs
+		 * gets shrunk, this can lead to dirent inode validation errors.
+		 */
+		if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+			error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+					tp->t_mountp->m_sb.sb_rootino, 0);
+			if (error)
+				return error;
+		}
+	} else {
+		/*
+		 * When removing a non-directory we need to log the parent
+		 * inode here.  For a directory this is done implicitly
+		 * by the xfs_droplink call for the ".." entry.
+		 */
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/* Drop the link from dp to ip. */
+	error = xfs_droplink(tp, ip);
+	if (error)
+		return error;
+
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
+	if (error) {
+		ASSERT(error != -ENOENT);
+		return error;
+	}
+
+	/* Remove parent pointer. */
+	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+}
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index 1215d5d1ebe..8c8b55b487d 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -306,5 +306,7 @@ int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
 int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
+int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
+		struct xfs_dir_update *du);
 
 #endif	/* __XFS_DIR2_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/28] xfs: create libxfs helper to exchange two directory entries
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 23:28   ` [PATCH 22/28] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
@ 2023-12-31 23:28   ` Darrick J. Wong
  2023-12-31 23:28   ` [PATCH 24/28] xfs: create libxfs helper to rename " Darrick J. Wong
                     ` (4 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to exchange two directory entries.
The upcoming metadata directory feature will need this to replace a
metadata inode directory entry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |  117 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_dir2.h |    3 +
 2 files changed, 120 insertions(+)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index 43b7bc31166..d14e81403ad 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -963,3 +963,120 @@ xfs_dir_remove_child(
 	/* Remove parent pointer. */
 	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
 }
+
+/*
+ * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2,
+ * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed.
+ * @ip1 and @ip2 need not be of the same type.
+ *
+ * All inodes must have the ILOCK held, and both entries must already exist.
+ */
+int
+xfs_dir_exchange_children(
+	struct xfs_trans	*tp,
+	struct xfs_dir_update	*du1,
+	struct xfs_dir_update	*du2,
+	unsigned int		spaceres)
+{
+	struct xfs_inode	*dp1 = du1->dp;
+	const struct xfs_name	*name1 = du1->name;
+	struct xfs_inode	*ip1 = du1->ip;
+	struct xfs_inode	*dp2 = du2->dp;
+	const struct xfs_name	*name2 = du2->name;
+	struct xfs_inode	*ip2 = du2->ip;
+	int			ip1_flags = 0;
+	int			ip2_flags = 0;
+	int			dp2_flags = 0;
+	int			error;
+
+	/* Swap inode number for dirent in first parent */
+	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
+	if (error)
+		return error;
+
+	/* Swap inode number for dirent in second parent */
+	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
+	if (error)
+		return error;
+
+	/*
+	 * If we're renaming one or more directories across different parents,
+	 * update the respective ".." entries (and link counts) to match the new
+	 * parents.
+	 */
+	if (dp1 != dp2) {
+		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+		if (S_ISDIR(VFS_I(ip2)->i_mode)) {
+			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+						dp1->i_ino, spaceres);
+			if (error)
+				return error;
+
+			/* transfer ip2 ".." reference to dp1 */
+			if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
+				error = xfs_droplink(tp, dp2);
+				if (error)
+					return error;
+				xfs_bumplink(tp, dp1);
+			}
+
+			/*
+			 * Although ip1 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		}
+
+		if (S_ISDIR(VFS_I(ip1)->i_mode)) {
+			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+						dp2->i_ino, spaceres);
+			if (error)
+				return error;
+
+			/* transfer ip1 ".." reference to dp2 */
+			if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
+				error = xfs_droplink(tp, dp1);
+				if (error)
+					return error;
+				xfs_bumplink(tp, dp2);
+			}
+
+			/*
+			 * Although ip2 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones), will properly
+			 * notify the change
+			 */
+			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	if (ip1_flags) {
+		xfs_trans_ichgtime(tp, ip1, ip1_flags);
+		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	}
+	if (ip2_flags) {
+		xfs_trans_ichgtime(tp, ip2, ip2_flags);
+		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+	}
+	if (dp2_flags) {
+		xfs_trans_ichgtime(tp, dp2, dp2_flags);
+		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+	/* Schedule parent pointer replacements */
+	error = xfs_parent_replace(tp, du1->ppargs, dp1, name1, dp2, name2,
+			ip1);
+	if (error)
+		return error;
+
+	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
+			ip2);
+}
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index 8c8b55b487d..dbca60ec934 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -309,4 +309,7 @@ int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
 int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
 		struct xfs_dir_update *du);
 
+int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1,
+		struct xfs_dir_update *du2, unsigned int spaceres);
+
 #endif	/* __XFS_DIR2_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/28] xfs: create libxfs helper to rename two directory entries
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 23:28   ` [PATCH 23/28] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
@ 2023-12-31 23:28   ` Darrick J. Wong
  2023-12-31 23:28   ` [PATCH 25/28] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
                     ` (3 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new libxfs function to rename two directory entries.  The
upcoming metadata directory feature will need this to replace a metadata
inode directory entry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |  217 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_dir2.h |    3 +
 2 files changed, 220 insertions(+)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index d14e81403ad..7a1d55dff69 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -23,6 +23,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_parent.h"
 #include "xfs_ag.h"
+#include "xfs_ialloc.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -1080,3 +1081,219 @@ xfs_dir_exchange_children(
 	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
 			ip2);
 }
+
+/*
+ * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry
+ * @target_name in directory @target_dp point to @src_ip and remove the
+ * original entry, cleaning up everything left behind.
+ *
+ * Cleanup involves dropping a link count on @target_ip, and either removing
+ * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry
+ * with (@src_name, @wip) if a whiteout inode @wip is supplied.
+ *
+ * All inodes must have the ILOCK held.  We assume that if @src_ip is a
+ * directory then its '..' doesn't already point to @target_dp, and that @wip
+ * is a freshly allocated whiteout.
+ */
+int
+xfs_dir_rename_children(
+	struct xfs_trans	*tp,
+	struct xfs_dir_update	*du_src,
+	struct xfs_dir_update	*du_tgt,
+	unsigned int		spaceres,
+	struct xfs_dir_update	*du_wip)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*src_dp = du_src->dp;
+	const struct xfs_name	*src_name = du_src->name;
+	struct xfs_inode	*src_ip = du_src->ip;
+	struct xfs_inode	*target_dp = du_tgt->dp;
+	const struct xfs_name	*target_name = du_tgt->name;
+	struct xfs_inode	*target_ip = du_tgt->ip;
+	bool			new_parent = (src_dp != target_dp);
+	bool			src_is_directory;
+	int			error;
+
+	src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
+
+	/*
+	 * Check for expected errors before we dirty the transaction
+	 * so we can return an error without a transaction abort.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If there's no space reservation, check the entry will
+		 * fit before actually inserting it.
+		 */
+		if (!spaceres) {
+			error = xfs_dir_canenter(tp, target_dp, target_name);
+			if (error)
+				return error;
+		}
+	} else {
+		/*
+		 * If target exists and it's a directory, check that whether
+		 * it can be destroyed.
+		 */
+		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+		    (!xfs_dir_isempty(target_ip) ||
+		     (VFS_I(target_ip)->i_nlink > 2)))
+			return -EEXIST;
+	}
+
+	/*
+	 * Directory entry creation below may acquire the AGF. Remove
+	 * the whiteout from the unlinked list first to preserve correct
+	 * AGI/AGF locking order. This dirties the transaction so failures
+	 * after this point will abort and log recovery will clean up the
+	 * mess.
+	 *
+	 * For whiteouts, we need to bump the link count on the whiteout
+	 * inode. After this point, we have a real link, clear the tmpfile
+	 * state flag from the inode so it doesn't accidentally get misused
+	 * in future.
+	 */
+	if (du_wip->ip) {
+		struct xfs_perag	*pag;
+
+		ASSERT(VFS_I(du_wip->ip)->i_nlink == 0);
+
+		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino));
+		error = xfs_iunlink_remove(tp, pag, du_wip->ip);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
+
+		xfs_bumplink(tp, du_wip->ip);
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If target does not exist and the rename crosses
+		 * directories, adjust the target directory link count
+		 * to account for the ".." reference from the new entry.
+		 */
+		error = xfs_dir_createname(tp, target_dp, target_name,
+					   src_ip->i_ino, spaceres);
+		if (error)
+			return error;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		if (new_parent && src_is_directory) {
+			xfs_bumplink(tp, target_dp);
+		}
+	} else { /* target_ip != NULL */
+		/*
+		 * Link the source inode under the target name.
+		 * If the source inode is a directory and we are moving
+		 * it across directories, its ".." entry will be
+		 * inconsistent until we replace that down below.
+		 *
+		 * In case there is already an entry with the same
+		 * name at the destination directory, remove it first.
+		 */
+		error = xfs_dir_replace(tp, target_dp, target_name,
+					src_ip->i_ino, spaceres);
+		if (error)
+			return error;
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		/*
+		 * Decrement the link count on the target since the target
+		 * dir no longer points to it.
+		 */
+		error = xfs_droplink(tp, target_ip);
+		if (error)
+			return error;
+
+		if (src_is_directory) {
+			/*
+			 * Drop the link from the old "." entry.
+			 */
+			error = xfs_droplink(tp, target_ip);
+			if (error)
+				return error;
+		}
+	} /* target_ip != NULL */
+
+	/*
+	 * Remove the source.
+	 */
+	if (new_parent && src_is_directory) {
+		/*
+		 * Rewrite the ".." entry to point to the new
+		 * directory.
+		 */
+		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+					target_dp->i_ino, spaceres);
+		ASSERT(error != -EEXIST);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * We always want to hit the ctime on the source inode.
+	 *
+	 * This isn't strictly required by the standards since the source
+	 * inode isn't really being changed, but old unix file systems did
+	 * it and some incremental backup programs won't work without it.
+	 */
+	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+	/*
+	 * Adjust the link count on src_dp.  This is necessary when
+	 * renaming a directory, either within one parent when
+	 * the target existed, or across two parent directories.
+	 */
+	if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+		/*
+		 * Decrement link count on src_directory since the
+		 * entry that's moved no longer points to it.
+		 */
+		error = xfs_droplink(tp, src_dp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * For whiteouts, we only need to update the source dirent with the
+	 * inode number of the whiteout inode rather than removing it
+	 * altogether.
+	 */
+	if (du_wip->ip)
+		error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino,
+					spaceres);
+	else
+		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+					   spaceres);
+	if (error)
+		return error;
+
+	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+	if (new_parent)
+		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+	/* Schedule parent pointer updates. */
+	error = xfs_parent_add(tp, du_wip->ppargs, src_dp, src_name,
+			du_wip->ip);
+	if (error)
+		return error;
+
+	error = xfs_parent_replace(tp, du_src->ppargs, src_dp, src_name,
+			target_dp, target_name, src_ip);
+	if (error)
+		return error;
+
+	return xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
+			target_ip);
+}
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index dbca60ec934..5e8b18f3f00 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -311,5 +311,8 @@ int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
 
 int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1,
 		struct xfs_dir_update *du2, unsigned int spaceres);
+int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src,
+		struct xfs_dir_update *du_tgt, unsigned int spaceres,
+		struct xfs_dir_update *du_wip);
 
 #endif	/* __XFS_DIR2_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/28] xfs: move dirent update hooks to xfs_dir2.c
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 23:28   ` [PATCH 24/28] xfs: create libxfs helper to rename " Darrick J. Wong
@ 2023-12-31 23:28   ` Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 26/28] xfs_db: port the iunlink command to use the libxfs iunlink function Darrick J. Wong
                     ` (2 subsequent siblings)
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:28 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Move the directory entry update hook code to xfs_dir2 so that it is
mostly consolidated with the higher level directory functions.  Retain
the exports so that online fsck can still send notifications through the
hooks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_dir2.c |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 libxfs/xfs_dir2.h |   24 ++++++++++
 2 files changed, 144 insertions(+), 5 deletions(-)


diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
index 7a1d55dff69..bfdbcc3cd3e 100644
--- a/libxfs/xfs_dir2.c
+++ b/libxfs/xfs_dir2.c
@@ -788,6 +788,72 @@ xfs_dir2_compname(
 	return xfs_da_compname(args, name, len);
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+	struct xfs_inode		*dp,
+	struct xfs_inode		*ip,
+	int				delta,
+	const struct xfs_name		*name)
+{
+	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+		struct xfs_dir_update_params	p = {
+			.dp		= dp,
+			.ip		= ip,
+			.delta		= delta,
+			.name		= name,
+		};
+		struct xfs_mount	*mp = ip->i_mount;
+
+		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+	}
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip
  * into @dp under the given @name.  If @ip is a directory, it will be
@@ -829,7 +895,12 @@ xfs_dir_create_child(
 	 * If we have parent pointers, we need to add the attribute containing
 	 * the parent information now.
 	 */
-	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, 1, name);
+	return 0;
 }
 
 /*
@@ -887,7 +958,12 @@ xfs_dir_add_child(
 	 * attribute, we need to create it correctly, otherwise we can just add
 	 * the parent to the inode.
 	 */
-	return xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_add(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, 1, name);
+	return 0;
 }
 
 /*
@@ -962,7 +1038,12 @@ xfs_dir_remove_child(
 	}
 
 	/* Remove parent pointer. */
-	return xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+	error = xfs_parent_remove(tp, du->ppargs, dp, name, ip);
+	if (error)
+		return error;
+
+	xfs_dir_update_hook(dp, ip, -1, name);
+	return 0;
 }
 
 /*
@@ -1078,8 +1159,24 @@ xfs_dir_exchange_children(
 	if (error)
 		return error;
 
-	return xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
+	error = xfs_parent_replace(tp, du2->ppargs, dp2, name2, dp1, name1,
 			ip2);
+	if (error)
+		return error;
+
+	/*
+	 * Inform our hook clients that we've finished an exchange operation as
+	 * follows: removed the source and target files from their directories;
+	 * added the target to the source directory; and added the source to
+	 * the target directory.  All inodes are locked, so it's ok to model a
+	 * rename this way so long as we say we deleted entries before we add
+	 * new ones.
+	 */
+	xfs_dir_update_hook(dp1, ip1, -1, name1);
+	xfs_dir_update_hook(dp2, ip2, -1, name2);
+	xfs_dir_update_hook(dp1, ip2, 1, name1);
+	xfs_dir_update_hook(dp2, ip1, 1, name2);
+	return 0;
 }
 
 /*
@@ -1294,6 +1391,24 @@ xfs_dir_rename_children(
 	if (error)
 		return error;
 
-	return xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
+	error = xfs_parent_remove(tp, du_tgt->ppargs, target_dp, target_name,
 			target_ip);
+	if (error)
+		return error;
+
+	/*
+	 * Inform our hook clients that we've finished a rename operation as
+	 * follows: removed the source and target files from their directories;
+	 * that we've added the source to the target directory; and finally
+	 * that we've added the whiteout, if there was one.  All inodes are
+	 * locked, so it's ok to model a rename this way so long as we say we
+	 * deleted entries before we add new ones.
+	 */
+	if (target_ip)
+		xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+	xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+	xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+	if (du_wip->ip)
+		xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name);
+	return 0;
 }
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
index 5e8b18f3f00..57b124abb17 100644
--- a/libxfs/xfs_dir2.h
+++ b/libxfs/xfs_dir2.h
@@ -293,6 +293,30 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
 	return c;
 }
 
+struct xfs_dir_update_params {
+	const struct xfs_inode	*dp;
+	const struct xfs_inode	*ip;
+	const struct xfs_name	*name;
+	int			delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+		int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+	struct xfs_hook		dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 struct xfs_parent_args;
 
 struct xfs_dir_update {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/28] xfs_db: port the iunlink command to use the libxfs iunlink function
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 23:28   ` [PATCH 25/28] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
@ 2023-12-31 23:29   ` Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 27/28] xfs_repair: use library functions to reset root/rbm/rsum inodes Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 28/28] xfs_repair: use library functions for orphanage creation Darrick J. Wong
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we've ported the kernel's iunlink code to userspace, adapt the
debugger command to use it instead of duplicating the logic.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/iunlink.c             |  110 ----------------------------------------------
 libxfs/libxfs_api_defs.h |    1 
 2 files changed, 2 insertions(+), 109 deletions(-)


diff --git a/db/iunlink.c b/db/iunlink.c
index af452d028bd..c87b98431e5 100644
--- a/db/iunlink.c
+++ b/db/iunlink.c
@@ -197,114 +197,6 @@ static const cmdinfo_t	dump_iunlinked_cmd =
 	  N_("[-a agno] [-b bucket] [-q] [-v]"),
 	  N_("dump chain of unlinked inode buckets"), NULL };
 
-/*
- * Look up the inode cluster buffer and log the on-disk unlinked inode change
- * we need to make.
- */
-static int
-iunlink_log_dinode(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	xfs_agino_t		next_agino)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_dinode	*dip;
-	struct xfs_buf		*ibp;
-	int			offset;
-	int			error;
-
-	error = -libxfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
-	if (error)
-		return error;
-
-	dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
-
-	dip->di_next_unlinked = cpu_to_be32(next_agino);
-	offset = ip->i_imap.im_boffset +
-			offsetof(struct xfs_dinode, di_next_unlinked);
-
-	libxfs_dinode_calc_crc(mp, dip);
-	libxfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-	return 0;
-}
-
-static int
-iunlink_insert_inode(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_buf		*agibp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = agibp->b_addr;
-	xfs_agino_t		next_agino;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	int			offset;
-	int			error;
-
-	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
-	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino))
-		return EFSCORRUPTED;
-
-	if (next_agino != NULLAGINO) {
-		/*
-		 * There is already another inode in the bucket, so point this
-		 * inode to the current head of the list.
-		 */
-		error = iunlink_log_dinode(tp, ip, pag, next_agino);
-		if (error)
-			return error;
-	}
-
-	/* Update the bucket. */
-	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-	offset = offsetof(struct xfs_agi, agi_unlinked) +
-			(sizeof(xfs_agino_t) * bucket_index);
-	libxfs_trans_log_buf(tp, agibp, offset,
-			offset + sizeof(xfs_agino_t) - 1);
-	return 0;
-}
-
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI.  It will be pulled from this
- * list when the inode is freed.
- */
-static int
-iunlink(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_perag	*pag;
-	struct xfs_buf		*agibp;
-	int			error;
-
-	ASSERT(VFS_I(ip)->i_nlink == 0);
-	ASSERT(VFS_I(ip)->i_mode != 0);
-
-	pag = libxfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = -libxfs_read_agi(pag, tp, &agibp);
-	if (error)
-		goto out;
-
-	error = iunlink_insert_inode(tp, pag, agibp, ip);
-out:
-	libxfs_perag_put(pag);
-	return error;
-}
-
 static int
 create_unlinked(
 	struct xfs_mount	*mp)
@@ -341,7 +233,7 @@ create_unlinked(
 		goto out_cancel;
 	}
 
-	error = iunlink(tp, ip);
+	error = -libxfs_iunlink(tp, ip);
 	if (error) {
 		dbprintf(_("unlink inode: %s\n"), strerror(error));
 		goto out_rele;
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index de59d6cdb5d..58c643d7535 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -185,6 +185,7 @@
 
 #define xfs_iread_extents		libxfs_iread_extents
 #define xfs_irele			libxfs_irele
+#define xfs_iunlink			libxfs_iunlink
 #define xfs_link_space_res		libxfs_link_space_res
 #define xfs_log_calc_minimum_size	libxfs_log_calc_minimum_size
 #define xfs_log_get_max_trans_res	libxfs_log_get_max_trans_res


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/28] xfs_repair: use library functions to reset root/rbm/rsum inodes
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 23:29   ` [PATCH 26/28] xfs_db: port the iunlink command to use the libxfs iunlink function Darrick J. Wong
@ 2023-12-31 23:29   ` Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 28/28] xfs_repair: use library functions for orphanage creation Darrick J. Wong
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use the iroot reset function to reset root inodes instead of open-coding
the reset routine.  While we're at it, fix a longstanding memory leak if
the inode being reset actually had an xattr fork full of mappings.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    2 +
 repair/phase6.c          |  127 ++++++++++------------------------------------
 2 files changed, 29 insertions(+), 100 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 58c643d7535..72c6d65d75f 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -148,6 +148,7 @@
 #define xfs_droplink			libxfs_droplink
 
 #define xfs_finobt_calc_reserves	libxfs_finobt_calc_reserves
+#define xfs_fixed_inode_reset		libxfs_fixed_inode_reset
 #define xfs_free_extent			libxfs_free_extent
 #define xfs_free_extent_later		libxfs_free_extent_later
 #define xfs_free_perag			libxfs_free_perag
@@ -177,6 +178,7 @@
 #define xfs_inode_from_disk		libxfs_inode_from_disk
 #define xfs_inode_from_disk_ts		libxfs_inode_from_disk_ts
 #define xfs_inode_hasattr		libxfs_inode_hasattr
+#define xfs_inode_init			libxfs_inode_init
 #define xfs_inode_to_disk		libxfs_inode_to_disk
 #define xfs_inode_validate_cowextsize	libxfs_inode_validate_cowextsize
 #define xfs_inode_validate_extsize	libxfs_inode_validate_extsize
diff --git a/repair/phase6.c b/repair/phase6.c
index 13f2fb2290b..88f1b8f106e 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -446,20 +446,29 @@ res_failed(
 		do_error(_("xfs_trans_reserve returned %d\n"), err);
 }
 
-static inline void
-reset_inode_fields(struct xfs_inode *ip)
+/*
+ * Forcibly reinitialize a fixed-location inode, such as a filesystem root
+ * directory or the realtime metadata inodes.  The inode must not otherwise be
+ * in use; the data fork must be empty, and the attr fork will be reset.
+ */
+static void
+reset_root_ino(
+	struct xfs_trans	*tp,
+	umode_t			mode,
+	struct xfs_inode	*ip)
 {
-	ip->i_projid = 0;
-	ip->i_disk_size = 0;
-	ip->i_nblocks = 0;
-	ip->i_extsize = 0;
-	ip->i_cowextsize = 0;
-	ip->i_flushiter = 0;
+	struct xfs_icreate_args	args = {
+		.nlink			= S_ISDIR(mode) ? 2 : 1,
+	};
+
+	libxfs_icreate_args_rootfile(&args, ip->i_mount, mode, false);
+
+	/* Erase the attr fork since libxfs_inode_init won't do it for us. */
 	ip->i_forkoff = 0;
-	ip->i_diflags = 0;
-	ip->i_diflags2 = 0;
-	ip->i_crtime.tv_sec = 0;
-	ip->i_crtime.tv_nsec = 0;
+	libxfs_ifork_zap_attr(ip);
+
+	libxfs_trans_ijoin(tp, ip, 0);
+	libxfs_inode_init(tp, &args, ip);
 }
 
 static void
@@ -473,7 +482,6 @@ mk_rbmino(xfs_mount_t *mp)
 	int		error;
 	xfs_fileoff_t	bno;
 	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];
-	int		times;
 	uint		blocks;
 
 	/*
@@ -490,34 +498,9 @@ mk_rbmino(xfs_mount_t *mp)
 			error);
 	}
 
-	reset_inode_fields(ip);
-
-	VFS_I(ip)->i_mode = S_IFREG;
-	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-	libxfs_ifork_zap_attr(ip);
-
-	set_nlink(VFS_I(ip), 1);	/* account for sb ptr */
-
-	times = XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD;
-	if (xfs_has_v3inodes(mp)) {
-		VFS_I(ip)->i_version = 1;
-		ip->i_diflags2 = 0;
-		times |= XFS_ICHGTIME_CREATE;
-	}
-	libxfs_trans_ichgtime(tp, ip, times);
-
-	/*
-	 * now the ifork
-	 */
-	ip->i_df.if_bytes = 0;
-	ip->i_df.if_u1.if_root = NULL;
-
+	/* Reset the realtime bitmap inode. */
+	reset_root_ino(tp, S_IFREG, ip);
 	ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
-
-	/*
-	 * commit changes
-	 */
-	libxfs_trans_ijoin(tp, ip, 0);
 	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = -libxfs_trans_commit(tp);
 	if (error)
@@ -728,7 +711,6 @@ mk_rsumino(xfs_mount_t *mp)
 	int		nsumblocks;
 	xfs_fileoff_t	bno;
 	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];
-	int		times;
 	uint		blocks;
 
 	/*
@@ -745,34 +727,9 @@ mk_rsumino(xfs_mount_t *mp)
 			error);
 	}
 
-	reset_inode_fields(ip);
-
-	VFS_I(ip)->i_mode = S_IFREG;
-	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-	libxfs_ifork_zap_attr(ip);
-
-	set_nlink(VFS_I(ip), 1);	/* account for sb ptr */
-
-	times = XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD;
-	if (xfs_has_v3inodes(mp)) {
-		VFS_I(ip)->i_version = 1;
-		ip->i_diflags2 = 0;
-		times |= XFS_ICHGTIME_CREATE;
-	}
-	libxfs_trans_ichgtime(tp, ip, times);
-
-	/*
-	 * now the ifork
-	 */
-	ip->i_df.if_bytes = 0;
-	ip->i_df.if_u1.if_root = NULL;
-
+	/* Reset the rt summary inode. */
+	reset_root_ino(tp, S_IFREG, ip);
 	ip->i_disk_size = mp->m_rsumsize;
-
-	/*
-	 * commit changes
-	 */
-	libxfs_trans_ijoin(tp, ip, 0);
 	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = -libxfs_trans_commit(tp);
 	if (error)
@@ -828,7 +785,6 @@ mk_root_dir(xfs_mount_t *mp)
 	int		error;
 	const mode_t	mode = 0755;
 	ino_tree_node_t	*irec;
-	int		times;
 
 	ip = NULL;
 	i = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 10, 0, 0, &tp);
@@ -840,37 +796,8 @@ mk_root_dir(xfs_mount_t *mp)
 		do_error(_("could not iget root inode -- error - %d\n"), error);
 	}
 
-	/*
-	 * take care of the core since we didn't call the libxfs ialloc function
-	 * (comment changed to avoid tangling xfs/437)
-	 */
-	reset_inode_fields(ip);
-
-	VFS_I(ip)->i_mode = mode|S_IFDIR;
-	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
-	libxfs_ifork_zap_attr(ip);
-
-	set_nlink(VFS_I(ip), 2);	/* account for . and .. */
-
-	times = XFS_ICHGTIME_CHG | XFS_ICHGTIME_MOD;
-	if (xfs_has_v3inodes(mp)) {
-		VFS_I(ip)->i_version = 1;
-		ip->i_diflags2 = 0;
-		times |= XFS_ICHGTIME_CREATE;
-	}
-	libxfs_trans_ichgtime(tp, ip, times);
-	libxfs_trans_ijoin(tp, ip, 0);
-	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	/*
-	 * now the ifork
-	 */
-	ip->i_df.if_bytes = 0;
-	ip->i_df.if_u1.if_root = NULL;
-
-	/*
-	 * initialize the directory
-	 */
+	/* Reset the root directory. */
+	reset_root_ino(tp, mode | S_IFDIR, ip);
 	libxfs_dir_init(tp, ip, ip);
 
 	error = -libxfs_trans_commit(tp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/28] xfs_repair: use library functions for orphanage creation
  2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 23:29   ` [PATCH 27/28] xfs_repair: use library functions to reset root/rbm/rsum inodes Darrick J. Wong
@ 2023-12-31 23:29   ` Darrick J. Wong
  27 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Use new library functions to create lost+found.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    1 +
 repair/phase6.c          |   60 +++++++++++++++++-----------------------------
 2 files changed, 23 insertions(+), 38 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 72c6d65d75f..3ac0afec41f 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -134,6 +134,7 @@
 #define xfs_dir2_shrink_inode		libxfs_dir2_shrink_inode
 
 #define xfs_dir_createname		libxfs_dir_createname
+#define xfs_dir_create_child		libxfs_dir_create_child
 #define xfs_dir_init			libxfs_dir_init
 #define xfs_dir_ino_validate		libxfs_dir_ino_validate
 #define xfs_dir_lookup			libxfs_dir_lookup
diff --git a/repair/phase6.c b/repair/phase6.c
index 88f1b8f106e..6a3c5e2a37a 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -822,9 +822,15 @@ mk_orphanage(
 	struct xfs_icreate_args	args = {
 		.nlink		= 2,
 	};
+	struct xfs_name		xname = {
+		.name		= ORPHANAGE,
+		.len		= strlen(ORPHANAGE),
+		.type		= XFS_DIR3_FT_DIR,
+	};
+	struct xfs_dir_update	du = {
+		.name		= &xname,
+	};
 	struct xfs_trans	*tp;
-	struct xfs_inode	*ip;
-	struct xfs_inode	*pip;
 	struct ino_tree_node	*irec;
 	xfs_ino_t		ino;
 	int			ino_offset = 0;
@@ -832,10 +838,8 @@ mk_orphanage(
 	int			error;
 	int			nres;
 	const umode_t		mode = S_IFDIR | 0755;
-	struct xfs_name		xname;
-	struct xfs_parent_args	*ppargs;
 
-	i = -libxfs_parent_start(mp, &ppargs);
+	i = -libxfs_parent_start(mp, &du.ppargs);
 	if (i)
 		do_error(_("%d - couldn't allocate parent pointer for %s\n"),
 			i, ORPHANAGE);
@@ -848,17 +852,14 @@ mk_orphanage(
 	 * would have been cleared in phase3 and phase4.
 	 */
 
-	i = -libxfs_iget(mp, NULL, mp->m_sb.sb_rootino, 0, &pip);
+	i = -libxfs_iget(mp, NULL, mp->m_sb.sb_rootino, 0, &du.dp);
 	if (i)
 		do_error(_("%d - couldn't iget root inode to obtain %s\n"),
 			i, ORPHANAGE);
 
-	args.pip = pip;
-	xname.name = (unsigned char *)ORPHANAGE;
-	xname.len = strlen(ORPHANAGE);
-	xname.type = XFS_DIR3_FT_DIR;
+	args.pip = du.dp;
 
-	if (libxfs_dir_lookup(NULL, pip, &xname, &ino, NULL) == 0)
+	if (libxfs_dir_lookup(NULL, du.dp, &xname, &ino, NULL) == 0)
 		return ino;
 
 	/*
@@ -869,21 +870,12 @@ mk_orphanage(
 	if (i)
 		res_failed(i);
 
-	/*
-	 * use iget/ijoin instead of trans_iget because the ialloc
-	 * wrapper can commit the transaction and start a new one
-	 */
-/*	i = -libxfs_iget(mp, NULL, mp->m_sb.sb_rootino, 0, &pip);
-	if (i)
-		do_error(_("%d - couldn't iget root inode to make %s\n"),
-			i, ORPHANAGE);*/
-
 	error = -libxfs_dialloc(&tp, mp->m_sb.sb_rootino, mode, &ino);
 	if (error)
 		do_error(_("%s inode allocation failed %d\n"),
 			ORPHANAGE, error);
 
-	error = -libxfs_icreate(tp, ino, &args, &ip);
+	error = -libxfs_icreate(tp, ino, &args, &du.ip);
 	if (error)
 		do_error(_("%s inode initialization failed %d\n"),
 			ORPHANAGE, error);
@@ -921,46 +913,38 @@ mk_orphanage(
 	 * now that we know the transaction will stay around,
 	 * add the root inode to it
 	 */
-	libxfs_trans_ijoin(tp, pip, 0);
+	libxfs_trans_ijoin(tp, du.dp, 0);
 
 	/*
 	 * create the actual entry
 	 */
-	error = -libxfs_dir_createname(tp, pip, &xname, ip->i_ino, nres);
+	error = -libxfs_dir_create_child(tp, nres, &du);
 	if (error)
 		do_error(
 		_("can't make %s, createname error %d\n"),
 			ORPHANAGE, error);
-	add_parent_ptr(ip->i_ino, ORPHANAGE, pip);
 
-	error = -libxfs_parent_add(tp, ppargs, pip, &xname, ip);
-	if (error)
-		do_error(
- _("committing %s parent pointer failed, error %d.\n"),
-				ORPHANAGE, error);
+	/* Remember this for the pptr scan later */
+	add_parent_ptr(du.ip->i_ino, ORPHANAGE, du.dp);
 
 	/*
-	 * bump up the link count in the root directory to account
-	 * for .. in the new directory, and update the irec copy of the
+	 * We bumped up the link count in the root directory to account
+	 * for .. in the new directory, so now update the irec copy of the
 	 * on-disk nlink so we don't fail the link count check later.
 	 */
-	libxfs_bumplink(tp, pip);
 	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
 				  XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
 	add_inode_ref(irec, 0);
 	set_inode_disk_nlinks(irec, 0, get_inode_disk_nlinks(irec, 0) + 1);
 
-	libxfs_trans_log_inode(tp, pip, XFS_ILOG_CORE);
-	libxfs_dir_init(tp, ip, pip);
-	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = -libxfs_trans_commit(tp);
 	if (error) {
 		do_error(_("%s directory creation failed -- bmapf error %d\n"),
 			ORPHANAGE, error);
 	}
-	libxfs_irele(ip);
-	libxfs_irele(pip);
-	libxfs_parent_finish(mp, ppargs);
+	libxfs_irele(du.ip);
+	libxfs_irele(du.dp);
+	libxfs_parent_finish(mp, du.ppargs);
 
 	return(ino);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/58] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
@ 2023-12-31 23:29   ` Darrick J. Wong
  2023-12-31 23:30   ` [PATCH 02/58] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
                     ` (56 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:29 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the XFS_SB_CRC_OFF macro uses the incore superblock struct
(xfs_sb) to compute the address of sb_crc within the ondisk superblock
struct (xfs_dsb).  This is a landmine if we ever change the layout of
the incore superblock (as we're about to do), so redefine the macro
to use xfs_dsb to compute the layout of xfs_dsb.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/sb.c                   |    4 ++--
 libxfs/xfs_format.h       |    9 ++++-----
 libxfs/xfs_ondisk.h       |    1 +
 mdrestore/xfs_mdrestore.c |    6 ++----
 repair/agheader.c         |   12 ++++++------
 5 files changed, 15 insertions(+), 17 deletions(-)


diff --git a/db/sb.c b/db/sb.c
index 9a5d665dfbd..e738065b5be 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -50,8 +50,8 @@ sb_init(void)
 	add_command(&version_cmd);
 }
 
-#define	OFF(f)	bitize(offsetof(xfs_sb_t, sb_ ## f))
-#define	SZC(f)	szcount(xfs_sb_t, sb_ ## f)
+#define	OFF(f)	bitize(offsetof(struct xfs_dsb, sb_ ## f))
+#define	SZC(f)	szcount(struct xfs_dsb, sb_ ## f)
 const field_t	sb_flds[] = {
 	{ "magicnum", FLDT_UINT32X, OI(OFF(magicnum)), C1, 0, TYP_NONE },
 	{ "blocksize", FLDT_UINT32D, OI(OFF(blocksize)), C1, 0, TYP_NONE },
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index b0aaa825539..c60af436963 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -90,8 +90,7 @@ struct xfs_ifork;
 #define XFSLABEL_MAX			12
 
 /*
- * Superblock - in core version.  Must match the ondisk version below.
- * Must be padded to 64 bit alignment.
+ * Superblock - in core version.  Must be padded to 64 bit alignment.
  */
 typedef struct xfs_sb {
 	uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
@@ -178,10 +177,8 @@ typedef struct xfs_sb {
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
-#define XFS_SB_CRC_OFF		offsetof(struct xfs_sb, sb_crc)
-
 /*
- * Superblock - on disk version.  Must match the in core version above.
+ * Superblock - on disk version.
  * Must be padded to 64 bit alignment.
  */
 struct xfs_dsb {
@@ -265,6 +262,8 @@ struct xfs_dsb {
 	/* must be padded to 64 bit alignment */
 };
 
+#define XFS_SB_CRC_OFF		offsetof(struct xfs_dsb, sb_crc)
+
 /*
  * Misc. Flags - warning - these will be cleared by xfs_repair unless
  * a feature bit is set when the flag is used.
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index bffd39242d4..832d96f0f3c 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -85,6 +85,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t,	12);
 	 */
 
+	XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc,		224);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen,	0);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen,	2);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval,	3);
diff --git a/mdrestore/xfs_mdrestore.c b/mdrestore/xfs_mdrestore.c
index 8e3998db06c..6465a481ce3 100644
--- a/mdrestore/xfs_mdrestore.c
+++ b/mdrestore/xfs_mdrestore.c
@@ -101,10 +101,8 @@ fixup_superblock(
 	memset(block_buffer, 0, sb->sb_sectsize);
 	sb->sb_inprogress = 0;
 	libxfs_sb_to_disk((struct xfs_dsb *)block_buffer, sb);
-	if (xfs_sb_version_hascrc(sb)) {
-		xfs_update_cksum(block_buffer, sb->sb_sectsize,
-				 offsetof(struct xfs_sb, sb_crc));
-	}
+	if (xfs_sb_version_hascrc(sb))
+		xfs_update_cksum(block_buffer, sb->sb_sectsize, XFS_SB_CRC_OFF);
 
 	if (pwrite(ddev_fd, block_buffer, sb->sb_sectsize, 0) < 0)
 		fatal("error writing primary superblock: %s\n", strerror(errno));
diff --git a/repair/agheader.c b/repair/agheader.c
index 762901581e1..3930a0ac091 100644
--- a/repair/agheader.c
+++ b/repair/agheader.c
@@ -358,22 +358,22 @@ secondary_sb_whack(
 	 * size is the size of data which is valid for this sb.
 	 */
 	if (xfs_sb_version_hasmetauuid(sb))
-		size = offsetof(xfs_sb_t, sb_meta_uuid)
+		size = offsetof(struct xfs_dsb, sb_meta_uuid)
 			+ sizeof(sb->sb_meta_uuid);
 	else if (xfs_sb_version_hascrc(sb))
-		size = offsetof(xfs_sb_t, sb_lsn)
+		size = offsetof(struct xfs_dsb, sb_lsn)
 			+ sizeof(sb->sb_lsn);
 	else if (xfs_sb_version_hasmorebits(sb))
-		size = offsetof(xfs_sb_t, sb_bad_features2)
+		size = offsetof(struct xfs_dsb, sb_bad_features2)
 			+ sizeof(sb->sb_bad_features2);
 	else if (xfs_sb_version_haslogv2(sb))
-		size = offsetof(xfs_sb_t, sb_logsunit)
+		size = offsetof(struct xfs_dsb, sb_logsunit)
 			+ sizeof(sb->sb_logsunit);
 	else if (xfs_sb_version_hassector(sb))
-		size = offsetof(xfs_sb_t, sb_logsectsize)
+		size = offsetof(struct xfs_dsb, sb_logsectsize)
 			+ sizeof(sb->sb_logsectsize);
 	else /* only support dirv2 or more recent */
-		size = offsetof(xfs_sb_t, sb_dirblklog)
+		size = offsetof(struct xfs_dsb, sb_dirblklog)
 			+ sizeof(sb->sb_dirblklog);
 
 	/* Check the buffer we read from disk for garbage outside size */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/58] xfs: create imeta abstractions to get and set metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 01/58] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
@ 2023-12-31 23:30   ` Darrick J. Wong
  2023-12-31 23:30   ` [PATCH 03/58] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
                     ` (55 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create some helper routines to get and set metadata inode numbers
instead of open-coding them throughout xfs.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h         |    2 
 include/xfs_trace.h      |   11 +
 libxfs/Makefile          |    4 
 libxfs/imeta_utils.c     |  190 +++++++++++++++++++++
 libxfs/imeta_utils.h     |   26 +++
 libxfs/init.c            |   30 +++
 libxfs/libxfs_api_defs.h |   15 ++
 libxfs/libxfs_priv.h     |    2 
 libxfs/xfs_imeta.c       |  410 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h       |   46 +++++
 libxfs/xfs_types.c       |    5 -
 mkfs/xfs_mkfs.c          |    2 
 12 files changed, 739 insertions(+), 4 deletions(-)
 create mode 100644 libxfs/imeta_utils.c
 create mode 100644 libxfs/imeta_utils.h
 create mode 100644 libxfs/xfs_imeta.c
 create mode 100644 libxfs/xfs_imeta.h


diff --git a/include/libxfs.h b/include/libxfs.h
index 0d60b9f9f51..7fa061d5e1b 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -89,6 +89,8 @@ struct iomap;
 #include "xfs_symlink_remote.h"
 #include "xfs_ag_resv.h"
 #include "xfs_parent.h"
+#include "xfs_imeta.h"
+#include "imeta_utils.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index 7dcf88b7900..dd6011af90e 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -360,6 +360,17 @@
 
 #define trace_xlog_intent_recovery_failed(...)	((void) 0)
 
+#define trace_xfs_imeta_teardown(...)		((void) 0)
+#define trace_xfs_imeta_sb_create(...)		((void) 0)
+#define trace_xfs_imeta_sb_link(...)		((void) 0)
+#define trace_xfs_imeta_sb_lookup(...)		((void) 0)
+#define trace_xfs_imeta_sb_unlink(...)		((void) 0)
+#define trace_xfs_imeta_start_create(...)	((void) 0)
+#define trace_xfs_imeta_start_link(...)		((void) 0)
+#define trace_xfs_imeta_start_unlink(...)	((void) 0)
+#define trace_xfs_imeta_update_cancel(...)	((void) 0)
+#define trace_xfs_imeta_update_commit(...)	((void) 0)
+
 #define trace_xfs_iunlink_update_bucket(...)	((void) 0)
 #define trace_xfs_iunlink_update_dinode(...)	((void) 0)
 #define trace_xfs_iunlink(...)			((void) 0)
diff --git a/libxfs/Makefile b/libxfs/Makefile
index 26781ceb913..258ce473200 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -24,6 +24,7 @@ HFILES = \
 	defer_item.h \
 	libxfs_io.h \
 	libxfs_api_defs.h \
+	imeta_utils.h \
 	init.h \
 	iunlink.h \
 	libxfs_priv.h \
@@ -50,6 +51,7 @@ HFILES = \
 	xfs_errortag.h \
 	xfs_ialloc.h \
 	xfs_ialloc_btree.h \
+	xfs_imeta.h \
 	xfs_inode_buf.h \
 	xfs_inode_fork.h \
 	xfs_inode_util.h \
@@ -69,6 +71,7 @@ HFILES = \
 
 CFILES = cache.c \
 	defer_item.c \
+	imeta_utils.c \
 	init.c \
 	inode.c \
 	iunlink.c \
@@ -104,6 +107,7 @@ CFILES = cache.c \
 	xfs_dquot_buf.c \
 	xfs_ialloc.c \
 	xfs_iext_tree.c \
+	xfs_imeta.c \
 	xfs_inode_buf.c \
 	xfs_inode_fork.c \
 	xfs_inode_util.c \
diff --git a/libxfs/imeta_utils.c b/libxfs/imeta_utils.c
new file mode 100644
index 00000000000..4610c2f49c8
--- /dev/null
+++ b/libxfs/imeta_utils.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_imeta.h"
+#include "xfs_trace.h"
+#include "imeta_utils.h"
+
+/* Initialize a metadata update structure. */
+static inline int
+xfs_imeta_init(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_imeta_update		*upd)
+{
+	memset(upd, 0, sizeof(struct xfs_imeta_update));
+	upd->mp = mp;
+	upd->path = path;
+	return 0;
+}
+
+/*
+ * Unlock and release resources after committing (or cancelling) a metadata
+ * directory tree operation.  The caller retains its reference to @upd->ip
+ * and must release it explicitly.
+ */
+static inline void
+xfs_imeta_teardown(
+	struct xfs_imeta_update		*upd,
+	int				error)
+{
+	trace_xfs_imeta_teardown(upd, error);
+
+	if (upd->ip) {
+		if (upd->ip_locked)
+			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+		upd->ip_locked = false;
+	}
+}
+
+/*
+ * Begin the process of creating a metadata file by allocating transactions
+ * and taking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_create(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_init(mp, path, upd);
+	if (error)
+		return error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+			xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
+	if (error)
+		goto out_teardown;
+
+	trace_xfs_imeta_start_create(upd);
+	return 0;
+out_teardown:
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+static inline int
+xfs_imeta_start_dir_update(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_trans_res		*tr_resv,
+	unsigned int			resblks,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_init(mp, path, upd);
+	if (error)
+		return error;
+
+	upd->ip = ip;
+
+	error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0, false,
+			&upd->tp);
+	if (error)
+		goto out_teardown;
+
+	upd->ip_locked = true;
+	return 0;
+out_teardown:
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_link(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_link,
+			xfs_link_space_res(mp, MAXNAMELEN), upd);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_start_link(upd);
+	return 0;
+}
+
+/*
+ * Begin the process of unlinking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_imeta_start_unlink(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		*ip,
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_remove,
+			xfs_remove_space_res(mp, MAXNAMELEN), upd);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_start_unlink(upd);
+	return 0;
+}
+
+/* Commit a metadir update and unlock/drop all resources. */
+int
+xfs_imeta_commit_update(
+	struct xfs_imeta_update		*upd)
+{
+	int				error;
+
+	trace_xfs_imeta_update_commit(upd);
+
+	error = xfs_trans_commit(upd->tp);
+	upd->tp = NULL;
+
+	xfs_imeta_teardown(upd, error);
+	return error;
+}
+
+/* Cancel a metadir update and unlock/drop all resources. */
+void
+xfs_imeta_cancel_update(
+	struct xfs_imeta_update		*upd,
+	int				error)
+{
+	trace_xfs_imeta_update_cancel(upd);
+
+	xfs_trans_cancel(upd->tp);
+	upd->tp = NULL;
+
+	xfs_imeta_teardown(upd, error);
+}
diff --git a/libxfs/imeta_utils.h b/libxfs/imeta_utils.h
new file mode 100644
index 00000000000..a2afc0f37ac
--- /dev/null
+++ b/libxfs/imeta_utils.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_IMETA_UTILS_H__
+#define __XFS_IMETA_UTILS_H__
+
+int xfs_imeta_start_create(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_imeta_update *upd);
+
+int xfs_imeta_start_link(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_inode *ip, struct xfs_imeta_update *upd);
+
+int xfs_imeta_start_unlink(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path,
+		struct xfs_inode *ip, struct xfs_imeta_update *upd);
+
+int xfs_imeta_commit_update(struct xfs_imeta_update *upd);
+void xfs_imeta_cancel_update(struct xfs_imeta_update *upd, int error);
+
+#endif /* __XFS_IMETA_UTILS_H__ */
+
+
diff --git a/libxfs/init.c b/libxfs/init.c
index 397ce088d3a..c199aeea45e 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -669,6 +669,34 @@ libxfs_compute_all_maxlevels(
 
 }
 
+/* Mount the metadata files under the metadata directory tree. */
+STATIC void
+libxfs_mountfs_imeta(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	/* Ignore filesystems that are under construction. */
+	if (mp->m_sb.sb_inprogress)
+		return;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return;
+
+	error = -xfs_imeta_mount(tp);
+	if (error) {
+		fprintf(stderr,
+ _("%s: Failed to load metadata inodes, error %d\n"),
+			progname, error);
+		goto err_cancel;
+	}
+
+err_cancel:
+	libxfs_trans_cancel(tp);
+}
+
 /*
  * precalculate the low space thresholds for dynamic speculative preallocation.
  */
@@ -843,6 +871,8 @@ libxfs_mount(
 	}
 	xfs_set_perag_data_loaded(mp);
 
+	libxfs_mountfs_imeta(mp);
+
 	return mp;
 out_da:
 	xfs_da_unmount(mp);
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 3ac0afec41f..9fd53415b47 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -162,6 +162,8 @@
 #define xfs_iallocbt_calc_size		libxfs_iallocbt_calc_size
 #define xfs_iallocbt_maxlevels_ondisk	libxfs_iallocbt_maxlevels_ondisk
 #define xfs_ialloc_read_agi		libxfs_ialloc_read_agi
+#define xfs_icreate			libxfs_icreate
+#define xfs_icreate_args_rootfile	libxfs_icreate_args_rootfile
 #define xfs_idata_realloc		libxfs_idata_realloc
 #define xfs_idestroy_fork		libxfs_idestroy_fork
 #define xfs_iext_first			libxfs_iext_first
@@ -170,6 +172,18 @@
 #define xfs_iext_next			libxfs_iext_next
 #define xfs_ifork_zap_attr		libxfs_ifork_zap_attr
 #define xfs_imap_to_bp			libxfs_imap_to_bp
+
+#define xfs_imeta_cancel_update		libxfs_imeta_cancel_update
+#define xfs_imeta_commit_update		libxfs_imeta_commit_update
+#define xfs_imeta_create		libxfs_imeta_create
+#define xfs_imeta_link			libxfs_imeta_link
+#define xfs_imeta_lookup		libxfs_imeta_lookup
+#define xfs_imeta_mount			libxfs_imeta_mount
+#define xfs_imeta_start_create		libxfs_imeta_start_create
+#define xfs_imeta_start_link		libxfs_imeta_start_link
+#define xfs_imeta_start_unlink		libxfs_imeta_start_unlink
+#define xfs_imeta_unlink		libxfs_imeta_unlink
+
 #define xfs_initialize_perag		libxfs_initialize_perag
 #define xfs_initialize_perag_data	libxfs_initialize_perag_data
 #define xfs_init_local_fork		libxfs_init_local_fork
@@ -188,6 +202,7 @@
 
 #define xfs_iread_extents		libxfs_iread_extents
 #define xfs_irele			libxfs_irele
+#define xfs_is_meta_ino			libxfs_is_meta_ino
 #define xfs_iunlink			libxfs_iunlink
 #define xfs_link_space_res		libxfs_link_space_res
 #define xfs_log_calc_minimum_size	libxfs_log_calc_minimum_size
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 17e62c9fbb7..9bca059776d 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -484,6 +484,8 @@ void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
 static inline int retzero(void) { return 0; }
 #define xfs_trans_unreserve_quota_nblks(t,i,b,n,f)	retzero()
 #define xfs_quota_unreserve_blkres(i,b) 		retzero()
+#define xfs_qm_dqattach(i)				(0)
+#define xfs_qm_dqattach_locked(ip, alloc)		(0)
 
 #define xfs_quota_reserve_blkres(i,b)		(0)
 #define xfs_qm_dqattach(i)			(0)
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
new file mode 100644
index 00000000000..b89843926fe
--- /dev/null
+++ b/libxfs/xfs_imeta.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_imeta.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Metadata File Management
+ * ========================
+ *
+ * These functions provide an abstraction layer for looking up, creating, and
+ * deleting metadata inodes.  These pointers live in the in-core superblock,
+ * so the functions moderate access to those fields and take care of logging.
+ *
+ * For the five existing metadata inodes (real time bitmap & summary; and the
+ * user, group, and quotas) we'll continue to maintain the in-core superblock
+ * inodes for reads and only require xfs_imeta_create and xfs_imeta_unlink to
+ * persist changes.  New metadata inode types must only use the xfs_imeta_*
+ * functions.
+ *
+ * Callers wishing to create or unlink a metadata inode must pass in a
+ * xfs_imeta_end structure.  After committing or cancelling the transaction,
+ * this structure must be passed to xfs_imeta_end_update to free resources that
+ * cannot be freed during the transaction.
+ *
+ * Right now we only support callers passing in the predefined metadata inode
+ * paths; the goal is that callers will some day locate metadata inodes based
+ * on path lookups into a metadata directory structure.
+ */
+
+/* Static metadata inode paths */
+
+const struct xfs_imeta_path XFS_IMETA_RTBITMAP = {
+	.bogus = 0,
+};
+
+const struct xfs_imeta_path XFS_IMETA_RTSUMMARY = {
+	.bogus = 1,
+};
+
+const struct xfs_imeta_path XFS_IMETA_USRQUOTA = {
+	.bogus = 2,
+};
+
+const struct xfs_imeta_path XFS_IMETA_GRPQUOTA = {
+	.bogus = 3,
+};
+
+const struct xfs_imeta_path XFS_IMETA_PRJQUOTA = {
+	.bogus = 4,
+};
+
+/* Are these two paths equal? */
+STATIC bool
+xfs_imeta_path_compare(
+	const struct xfs_imeta_path	*a,
+	const struct xfs_imeta_path	*b)
+{
+	return a == b;
+}
+
+/* Is this path ok? */
+static inline bool
+xfs_imeta_path_check(
+	const struct xfs_imeta_path	*path)
+{
+	return true;
+}
+
+/* Functions for storing and retrieving superblock inode values. */
+
+/* Mapping of metadata inode paths to in-core superblock values. */
+static const struct xfs_imeta_sbmap {
+	const struct xfs_imeta_path	*path;
+	unsigned int			offset;
+} xfs_imeta_sbmaps[] = {
+	{
+		.path	= &XFS_IMETA_RTBITMAP,
+		.offset	= offsetof(struct xfs_sb, sb_rbmino),
+	},
+	{
+		.path	= &XFS_IMETA_RTSUMMARY,
+		.offset	= offsetof(struct xfs_sb, sb_rsumino),
+	},
+	{
+		.path	= &XFS_IMETA_USRQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_uquotino),
+	},
+	{
+		.path	= &XFS_IMETA_GRPQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_gquotino),
+	},
+	{
+		.path	= &XFS_IMETA_PRJQUOTA,
+		.offset	= offsetof(struct xfs_sb, sb_pquotino),
+	},
+	{ NULL, 0 },
+};
+
+/* Return a pointer to the in-core superblock inode value. */
+static inline xfs_ino_t *
+xfs_imeta_sbmap_to_inop(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_sbmap	*map)
+{
+	return (xfs_ino_t *)(((char *)&mp->m_sb) + map->offset);
+}
+
+/* Compute location of metadata inode pointer in the in-core superblock */
+static inline xfs_ino_t *
+xfs_imeta_path_to_sb_inop(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	const struct xfs_imeta_sbmap	*p;
+
+	for (p = xfs_imeta_sbmaps; p->path; p++)
+		if (xfs_imeta_path_compare(p->path, path))
+			return xfs_imeta_sbmap_to_inop(mp, p);
+
+	return NULL;
+}
+
+/* Look up a superblock metadata inode by its path. */
+STATIC int
+xfs_imeta_sb_lookup(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	xfs_ino_t			*sb_inop;
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	trace_xfs_imeta_sb_lookup(mp, sb_inop);
+	*inop = *sb_inop;
+	return 0;
+}
+
+/* Update inode pointers in the superblock. */
+static inline void
+xfs_imeta_log_sb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp);
+
+	/*
+	 * Update the inode flags in the ondisk superblock without touching
+	 * the summary counters.  We have not quiesced inode chunk allocation,
+	 * so we cannot coordinate with updates to the icount and ifree percpu
+	 * counters.
+	 */
+	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+}
+
+/*
+ * Create a new metadata inode and set a superblock pointer to this new inode.
+ * The superblock field must not already be pointing to an inode.
+ */
+STATIC int
+xfs_imeta_sb_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode)
+{
+	struct xfs_icreate_args		args = {
+		.nlink			= S_ISDIR(mode) ? 2 : 1,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	int				error;
+
+	/* Files rooted in the superblock do not have parents. */
+	xfs_icreate_args_rootfile(&args, mp, mode, false);
+
+	/* Reject if the sb already points to some inode. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	if (*sb_inop != NULLFSINO)
+		return -EEXIST;
+
+	/* Create a new inode and set the sb pointer. */
+	error = xfs_dialloc(&upd->tp, 0, mode, &ino);
+	if (error)
+		return error;
+	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+	if (error)
+		return error;
+	upd->ip_locked = true;
+
+	/*
+	 * If we ever need the ability to create rt metadata files on a
+	 * pre-metadir filesystem, we'll need to dqattach the child here.
+	 * Currently we assume that mkfs will create the files and quotacheck
+	 * will account for them.
+	 */
+
+	/* Update superblock pointer. */
+	*sb_inop = ino;
+	xfs_imeta_log_sb(upd->tp);
+
+	trace_xfs_imeta_sb_create(upd);
+	return 0;
+}
+
+/*
+ * Clear the given inode pointer from the superblock and drop the link count
+ * of the metadata inode.
+ */
+STATIC int
+xfs_imeta_sb_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+
+	/* Reject if the sb doesn't point to the inode that was passed in. */
+	if (*sb_inop != upd->ip->i_ino)
+		return -ENOENT;
+
+	trace_xfs_imeta_sb_unlink(upd);
+
+	*sb_inop = NULLFSINO;
+	xfs_imeta_log_sb(upd->tp);
+	return xfs_droplink(upd->tp, upd->ip);
+}
+
+/* Set the given inode pointer in the superblock. */
+STATIC int
+xfs_imeta_sb_link(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (!sb_inop)
+		return -EINVAL;
+	if (*sb_inop != NULLFSINO)
+		return -EEXIST;
+
+	trace_xfs_imeta_sb_link(upd);
+
+	xfs_bumplink(upd->tp, upd->ip);
+	xfs_imeta_log_sb(upd->tp);
+
+	*sb_inop = upd->ip->i_ino;
+	return 0;
+}
+
+/* General functions for managing metadata inode pointers */
+
+/*
+ * Is this metadata inode pointer ok?  We allow the fields to be set to
+ * NULLFSINO if the metadata structure isn't present, and we don't allow
+ * obviously incorrect inode pointers.
+ */
+static inline bool
+xfs_imeta_verify(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	if (ino == NULLFSINO)
+		return true;
+	return xfs_verify_ino(mp, ino);
+}
+
+/* Look up a metadata inode by its path. */
+int
+xfs_imeta_lookup(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	xfs_ino_t			ino;
+	int				error;
+
+	ASSERT(xfs_imeta_path_check(path));
+
+	error = xfs_imeta_sb_lookup(mp, path, &ino);
+	if (error)
+		return error;
+
+	if (!xfs_imeta_verify(mp, ino))
+		return -EFSCORRUPTED;
+
+	*inop = ino;
+	return 0;
+}
+
+/*
+ * Create a metadata inode with the given @mode, and insert it into the
+ * metadata directory tree at the given @path.  The path (up to the final
+ * component) must already exist.
+ *
+ * The new metadata inode will be attached to the update structure @upd->ip,
+ * with the ILOCK held until the caller releases it.  @ipp is set to upd->ip
+ * as a convenience for callers.
+ *
+ * Callers must ensure that the root dquots are allocated, if applicable.
+ *
+ * NOTE: This function may return a new inode to the caller even if it returns
+ * a negative error code.  If an inode is passed back, the caller must finish
+ * setting up the inode before releasing it.
+ */
+int
+xfs_imeta_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode,
+	struct xfs_inode		**ipp)
+{
+	int				error;
+
+	ASSERT(xfs_imeta_path_check(upd->path));
+
+	*ipp = NULL;
+
+	error = xfs_imeta_sb_create(upd, mode);
+	*ipp = upd->ip;
+	return error;
+}
+
+/*
+ * Unlink a metadata inode @upd->ip from the metadata directory given by @path.
+ * The path must already exist.
+ */
+int
+xfs_imeta_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	ASSERT(xfs_imeta_path_check(upd->path));
+	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
+
+	return xfs_imeta_sb_unlink(upd);
+}
+
+/*
+ * Link the metadata directory given by @path to the inode @upd->ip.
+ * The path (up to the final component) must already exist, but the final
+ * component must not already exist.
+ */
+int
+xfs_imeta_link(
+	struct xfs_imeta_update		*upd)
+{
+	ASSERT(xfs_imeta_path_check(upd->path));
+
+	return xfs_imeta_sb_link(upd);
+}
+
+/* Does this inode number refer to a static metadata inode? */
+bool
+xfs_is_static_meta_ino(
+	struct xfs_mount		*mp,
+	xfs_ino_t			ino)
+{
+	const struct xfs_imeta_sbmap	*p;
+
+	if (ino == NULLFSINO)
+		return false;
+
+	for (p = xfs_imeta_sbmaps; p->path; p++)
+		if (ino == *xfs_imeta_sbmap_to_inop(mp, p))
+			return true;
+
+	return false;
+}
+
+/*
+ * Ensure that the in-core superblock has all the values that it should.
+ * Caller should pass in an empty transaction to avoid livelocking on btree
+ * cycles.
+ */
+int
+xfs_imeta_mount(
+	struct xfs_trans	*tp)
+{
+	return 0;
+}
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
new file mode 100644
index 00000000000..c1833b8b1c9
--- /dev/null
+++ b/libxfs/xfs_imeta.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_IMETA_H__
+#define __XFS_IMETA_H__
+
+/* Key for looking up metadata inodes. */
+struct xfs_imeta_path {
+	/* Temporary: integer to keep the static imeta definitions unique */
+	int		bogus;
+};
+
+/* Cleanup widget for metadata inode creation and deletion. */
+struct xfs_imeta_update {
+	struct xfs_mount	*mp;
+	struct xfs_trans	*tp;
+
+	const struct xfs_imeta_path *path;
+
+	/* Metadata inode */
+	struct xfs_inode	*ip;
+
+	unsigned int		ip_locked:1;
+};
+
+/* Lookup keys for static metadata inodes. */
+extern const struct xfs_imeta_path XFS_IMETA_RTBITMAP;
+extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;
+extern const struct xfs_imeta_path XFS_IMETA_USRQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_GRPQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_PRJQUOTA;
+
+int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
+		xfs_ino_t *ino);
+
+int xfs_imeta_create(struct xfs_imeta_update *upd, umode_t mode,
+		struct xfs_inode **ipp);
+int xfs_imeta_unlink(struct xfs_imeta_update *upd);
+int xfs_imeta_link(struct xfs_imeta_update *upd);
+
+bool xfs_is_static_meta_ino(struct xfs_mount *mp, xfs_ino_t ino);
+int xfs_imeta_mount(struct xfs_trans *tp);
+
+#endif /* __XFS_IMETA_H__ */
diff --git a/libxfs/xfs_types.c b/libxfs/xfs_types.c
index 74ab1965a8f..88720b297d7 100644
--- a/libxfs/xfs_types.c
+++ b/libxfs/xfs_types.c
@@ -12,6 +12,7 @@
 #include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_ag.h"
+#include "xfs_imeta.h"
 
 
 /*
@@ -115,9 +116,7 @@ xfs_internal_inum(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
 {
-	return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
-		(xfs_has_quota(mp) &&
-		 xfs_is_quota_inode(&mp->m_sb, ino));
+	return xfs_is_static_meta_ino(mp, ino);
 }
 
 /*
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 482275e0e0d..15be7e0fb60 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3878,6 +3878,7 @@ start_superblock_setup(
 	struct xfs_mount	*mp,
 	struct xfs_sb		*sbp)
 {
+	sbp->sb_inprogress = 1;	/* mkfs is in progress */
 	sbp->sb_magicnum = XFS_SB_MAGIC;
 	sbp->sb_sectsize = (uint16_t)cfg->sectorsize;
 	sbp->sb_sectlog = (uint8_t)cfg->sectorlog;
@@ -3960,7 +3961,6 @@ finish_superblock_setup(
 	sbp->sb_rbmblocks = cfg->rtbmblocks;
 	sbp->sb_logblocks = (xfs_extlen_t)cfg->logblocks;
 	sbp->sb_rextslog = libxfs_compute_rextslog(cfg->rtextents);
-	sbp->sb_inprogress = 1;	/* mkfs is in progress */
 	sbp->sb_imax_pct = cfg->imaxpct;
 	sbp->sb_icount = 0;
 	sbp->sb_ifree = 0;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/58] xfs: create transaction reservations for metadata inode operations
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
  2023-12-31 23:29   ` [PATCH 01/58] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
  2023-12-31 23:30   ` [PATCH 02/58] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
@ 2023-12-31 23:30   ` Darrick J. Wong
  2023-12-31 23:30   ` [PATCH 04/58] mkfs: clean up the rtinit() function Darrick J. Wong
                     ` (54 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create transaction reservation types and block reservation helpers to
help us calculate transaction requirements.  Right now the reservations
are the same as always; we're just separating the symbols for a future
patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/imeta_utils.c     |    8 +++++---
 libxfs/libxfs_api_defs.h |    3 +++
 libxfs/xfs_imeta.c       |    4 ++++
 libxfs/xfs_imeta.h       |    4 ++++
 libxfs/xfs_trans_resv.c  |    5 +++++
 libxfs/xfs_trans_resv.h  |    3 +++
 6 files changed, 24 insertions(+), 3 deletions(-)


diff --git a/libxfs/imeta_utils.c b/libxfs/imeta_utils.c
index 4610c2f49c8..f3165b5eed3 100644
--- a/libxfs/imeta_utils.c
+++ b/libxfs/imeta_utils.c
@@ -70,7 +70,7 @@ xfs_imeta_start_create(
 	if (error)
 		return error;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_imeta_create,
 			xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
 	if (error)
 		goto out_teardown;
@@ -128,7 +128,8 @@ xfs_imeta_start_link(
 {
 	int				error;
 
-	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_link,
+	error = xfs_imeta_start_dir_update(mp, path, ip,
+			&M_RES(mp)->tr_imeta_link,
 			xfs_link_space_res(mp, MAXNAMELEN), upd);
 	if (error)
 		return error;
@@ -150,7 +151,8 @@ xfs_imeta_start_unlink(
 {
 	int				error;
 
-	error = xfs_imeta_start_dir_update(mp, path, ip, &M_RES(mp)->tr_remove,
+	error = xfs_imeta_start_dir_update(mp, path, ip,
+			&M_RES(mp)->tr_imeta_unlink,
 			xfs_remove_space_res(mp, MAXNAMELEN), upd);
 	if (error)
 		return error;
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 9fd53415b47..167617771d8 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -176,13 +176,16 @@
 #define xfs_imeta_cancel_update		libxfs_imeta_cancel_update
 #define xfs_imeta_commit_update		libxfs_imeta_commit_update
 #define xfs_imeta_create		libxfs_imeta_create
+#define xfs_imeta_create_space_res	libxfs_imeta_create_space_res
 #define xfs_imeta_link			libxfs_imeta_link
+#define xfs_imeta_link_space_res	libxfs_imeta_link_space_res
 #define xfs_imeta_lookup		libxfs_imeta_lookup
 #define xfs_imeta_mount			libxfs_imeta_mount
 #define xfs_imeta_start_create		libxfs_imeta_start_create
 #define xfs_imeta_start_link		libxfs_imeta_start_link
 #define xfs_imeta_start_unlink		libxfs_imeta_start_unlink
 #define xfs_imeta_unlink		libxfs_imeta_unlink
+#define xfs_imeta_unlink_space_res	libxfs_imeta_unlink_space_res
 
 #define xfs_initialize_perag		libxfs_initialize_perag
 #define xfs_initialize_perag_data	libxfs_initialize_perag_data
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index b89843926fe..086c250a3c2 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -18,6 +18,10 @@
 #include "xfs_trace.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_space.h"
 
 /*
  * Metadata File Management
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index c1833b8b1c9..7b3da865c09 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -43,4 +43,8 @@ int xfs_imeta_link(struct xfs_imeta_update *upd);
 bool xfs_is_static_meta_ino(struct xfs_mount *mp, xfs_ino_t ino);
 int xfs_imeta_mount(struct xfs_trans *tp);
 
+unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
+unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
+unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
+
 #endif /* __XFS_IMETA_H__ */
diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index 78e7a575baa..5a1ad959870 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -1246,4 +1246,9 @@ xfs_trans_resv_calc(
 	resp->tr_itruncate.tr_logcount += logcount_adj;
 	resp->tr_write.tr_logcount += logcount_adj;
 	resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+	/* metadata inode creation and unlink */
+	resp->tr_imeta_create = resp->tr_create;
+	resp->tr_imeta_link = resp->tr_link;
+	resp->tr_imeta_unlink = resp->tr_remove;
 }
diff --git a/libxfs/xfs_trans_resv.h b/libxfs/xfs_trans_resv.h
index 0554b9d775d..6b851dfe1ac 100644
--- a/libxfs/xfs_trans_resv.h
+++ b/libxfs/xfs_trans_resv.h
@@ -48,6 +48,9 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_qm_dqalloc;	/* allocate quota on disk */
 	struct xfs_trans_res	tr_sb;		/* modify superblock */
 	struct xfs_trans_res	tr_fsyncts;	/* update timestamps on fsync */
+	struct xfs_trans_res	tr_imeta_create; /* create metadata inode */
+	struct xfs_trans_res	tr_imeta_link;	/* link metadata inode */
+	struct xfs_trans_res	tr_imeta_unlink; /* unlink metadata inode */
 };
 
 /* shorthand way of accessing reservation structure */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/58] mkfs: clean up the rtinit() function
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 23:30   ` [PATCH 03/58] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
@ 2023-12-31 23:30   ` Darrick J. Wong
  2023-12-31 23:30   ` [PATCH 05/58] libxfs: convert all users to libxfs_imeta_create Darrick J. Wong
                     ` (53 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Clean up some of the warts in this function, like the inconsistent use
of @i for @error, missing comments, and make this more visually pleasing
by adding some whitespace between major sections.  Some things are left
untouched for the next patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c |   70 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 36 deletions(-)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index f9b0f837ed9..a519aaeb72b 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -750,28 +750,26 @@ parse_proto(
  */
 static void
 rtinit(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_fileoff_t	bno;
-	xfs_bmbt_irec_t	*ep;
-	int		error;
-	int		i;
-	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];
-	xfs_extlen_t	nsumblocks;
-	uint		blocks;
-	int		nmap;
-	xfs_inode_t	*rbmip;
-	xfs_inode_t	*rsumip;
-	xfs_trans_t	*tp;
-	struct cred	creds;
-	struct fsxattr	fsxattrs;
+	struct cred		creds;
+	struct fsxattr		fsxattrs;
+	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
+	struct xfs_inode	*rbmip;
+	struct xfs_inode	*rsumip;
+	struct xfs_trans	*tp;
+	struct xfs_bmbt_irec	*ep;
+	xfs_fileoff_t		bno;
+	xfs_extlen_t		nsumblocks;
+	uint			blocks;
+	int			i;
+	int			nmap;
+	int			error;
 
-	/*
-	 * First, allocate the inodes.
-	 */
-	i = -libxfs_trans_alloc_rollable(mp, MKFS_BLOCKRES_INODE, &tp);
-	if (i)
-		res_failed(i);
+	/* Create the realtime bitmap inode. */
+	error = -libxfs_trans_alloc_rollable(mp, MKFS_BLOCKRES_INODE, &tp);
+	if (error)
+		res_failed(error);
 
 	memset(&creds, 0, sizeof(creds));
 	memset(&fsxattrs, 0, sizeof(fsxattrs));
@@ -792,6 +790,8 @@ rtinit(
 	libxfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
 	libxfs_log_sb(tp);
 	mp->m_rbmip = rbmip;
+
+	/* Create the realtime summary inode. */
 	error = creatproto(&tp, NULL, S_IFREG, 1, 0, &creds, &fsxattrs,
 			&rsumip);
 	if (error) {
@@ -806,14 +806,13 @@ rtinit(
 		fail(_("Completion of the realtime summary inode failed"),
 				error);
 	mp->m_rsumip = rsumip;
-	/*
-	 * Next, give the bitmap file some zero-filled blocks.
-	 */
+
+	/* Zero the realtime bitmap. */
 	blocks = mp->m_sb.sb_rbmblocks +
 			XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	i = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
-	if (i)
-		res_failed(i);
+	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
+	if (error)
+		res_failed(error);
 
 	libxfs_trans_ijoin(tp, rbmip, 0);
 	bno = 0;
@@ -822,10 +821,10 @@ rtinit(
 		error = -libxfs_bmapi_write(tp, rbmip, bno,
 				(xfs_extlen_t)(mp->m_sb.sb_rbmblocks - bno),
 				0, mp->m_sb.sb_rbmblocks, map, &nmap);
-		if (error) {
+		if (error)
 			fail(_("Allocation of the realtime bitmap failed"),
 				error);
-		}
+
 		for (i = 0, ep = map; i < nmap; i++, ep++) {
 			libxfs_device_zero(mp->m_ddev_targp,
 				XFS_FSB_TO_DADDR(mp, ep->br_startblock),
@@ -839,25 +838,24 @@ rtinit(
 		fail(_("Block allocation of the realtime bitmap inode failed"),
 				error);
 
-	/*
-	 * Give the summary file some zero-filled blocks.
-	 */
+	/* Zero the summary file. */
 	nsumblocks = mp->m_rsumsize >> mp->m_sb.sb_blocklog;
 	blocks = nsumblocks + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
-	i = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
-	if (i)
-		res_failed(i);
+	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
+	if (error)
+		res_failed(error);
 	libxfs_trans_ijoin(tp, rsumip, 0);
+
 	bno = 0;
 	while (bno < nsumblocks) {
 		nmap = XFS_BMAP_MAX_NMAP;
 		error = -libxfs_bmapi_write(tp, rsumip, bno,
 				(xfs_extlen_t)(nsumblocks - bno),
 				0, nsumblocks, map, &nmap);
-		if (error) {
+		if (error)
 			fail(_("Allocation of the realtime summary failed"),
 				error);
-		}
+
 		for (i = 0, ep = map; i < nmap; i++, ep++) {
 			libxfs_device_zero(mp->m_ddev_targp,
 				XFS_FSB_TO_DADDR(mp, ep->br_startblock),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/58] libxfs: convert all users to libxfs_imeta_create
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 23:30   ` [PATCH 04/58] mkfs: clean up the rtinit() function Darrick J. Wong
@ 2023-12-31 23:30   ` Darrick J. Wong
  2023-12-31 23:31   ` [PATCH 06/58] mkfs: break up the rest of the rtinit() function Darrick J. Wong
                     ` (52 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:30 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert all open-coded sb metadata inode pointer logging to use
libxfs_imeta_create to create metadata inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c |   44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index a519aaeb72b..8ae0aba777c 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -752,8 +752,7 @@ static void
 rtinit(
 	struct xfs_mount	*mp)
 {
-	struct cred		creds;
-	struct fsxattr		fsxattrs;
+	struct xfs_imeta_update	upd;
 	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
 	struct xfs_inode	*rbmip;
 	struct xfs_inode	*rsumip;
@@ -767,41 +766,38 @@ rtinit(
 	int			error;
 
 	/* Create the realtime bitmap inode. */
-	error = -libxfs_trans_alloc_rollable(mp, MKFS_BLOCKRES_INODE, &tp);
+	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTBITMAP, &upd);
 	if (error)
 		res_failed(error);
 
-	memset(&creds, 0, sizeof(creds));
-	memset(&fsxattrs, 0, sizeof(fsxattrs));
-	error = creatproto(&tp, NULL, S_IFREG, 1, 0, &creds, &fsxattrs,
-			&rbmip);
-	if (error) {
+	error = -libxfs_imeta_create(&upd, S_IFREG, &rbmip);
+	if (error)
 		fail(_("Realtime bitmap inode allocation failed"), error);
-	}
-	/*
-	 * Do our thing with rbmip before allocating rsumip,
-	 * because the next call to createproto may
-	 * commit the transaction in which rbmip was allocated.
-	 */
-	mp->m_sb.sb_rbmino = rbmip->i_ino;
+
 	rbmip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
-	rbmip->i_diflags = XFS_DIFLAG_NEWRTBM;
+	rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
 	inode_set_atime(VFS_I(rbmip), 0, 0);
 	libxfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
-	libxfs_log_sb(tp);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		fail(_("Completion of the realtime bitmap inode failed"),
+				error);
 	mp->m_rbmip = rbmip;
 
 	/* Create the realtime summary inode. */
-	error = creatproto(&tp, NULL, S_IFREG, 1, 0, &creds, &fsxattrs,
-			&rsumip);
-	if (error) {
+	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTSUMMARY, &upd);
+	if (error)
+		res_failed(error);
+
+	error = -libxfs_imeta_create(&upd, S_IFREG, &rsumip);
+	if (error)
 		fail(_("Realtime summary inode allocation failed"), error);
-	}
-	mp->m_sb.sb_rsumino = rsumip->i_ino;
+
 	rsumip->i_disk_size = mp->m_rsumsize;
 	libxfs_trans_log_inode(tp, rsumip, XFS_ILOG_CORE);
-	libxfs_log_sb(tp);
-	error = -libxfs_trans_commit(tp);
+
+	error = -libxfs_imeta_commit_update(&upd);
 	if (error)
 		fail(_("Completion of the realtime summary inode failed"),
 				error);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/58] mkfs: break up the rest of the rtinit() function
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 23:30   ` [PATCH 05/58] libxfs: convert all users to libxfs_imeta_create Darrick J. Wong
@ 2023-12-31 23:31   ` Darrick J. Wong
  2023-12-31 23:31   ` [PATCH 07/58] xfs: iget for metadata inodes Darrick J. Wong
                     ` (51 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Break up this really long function into smaller functions that each do
one thing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 mkfs/proto.c |   89 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 25 deletions(-)


diff --git a/mkfs/proto.c b/mkfs/proto.c
index 8ae0aba777c..2eff1f32173 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -745,27 +745,15 @@ parse_proto(
 	parseproto(mp, NULL, fsx, pp, NULL);
 }
 
-/*
- * Allocate the realtime bitmap and summary inodes, and fill in data if any.
- */
+/* Create the realtime bitmap inode. */
 static void
-rtinit(
+rtbitmap_create(
 	struct xfs_mount	*mp)
 {
 	struct xfs_imeta_update	upd;
-	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
 	struct xfs_inode	*rbmip;
-	struct xfs_inode	*rsumip;
-	struct xfs_trans	*tp;
-	struct xfs_bmbt_irec	*ep;
-	xfs_fileoff_t		bno;
-	xfs_extlen_t		nsumblocks;
-	uint			blocks;
-	int			i;
-	int			nmap;
 	int			error;
 
-	/* Create the realtime bitmap inode. */
 	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTBITMAP, &upd);
 	if (error)
 		res_failed(error);
@@ -777,15 +765,24 @@ rtinit(
 	rbmip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
 	rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
 	inode_set_atime(VFS_I(rbmip), 0, 0);
-	libxfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
+	libxfs_trans_log_inode(upd.tp, rbmip, XFS_ILOG_CORE);
 
 	error = -libxfs_imeta_commit_update(&upd);
 	if (error)
 		fail(_("Completion of the realtime bitmap inode failed"),
 				error);
 	mp->m_rbmip = rbmip;
+}
+
+/* Create the realtime summary inode. */
+static void
+rtsummary_create(
+	struct xfs_mount	*mp)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_inode	*rsumip;
+	int			error;
 
-	/* Create the realtime summary inode. */
 	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTSUMMARY, &upd);
 	if (error)
 		res_failed(error);
@@ -795,26 +792,40 @@ rtinit(
 		fail(_("Realtime summary inode allocation failed"), error);
 
 	rsumip->i_disk_size = mp->m_rsumsize;
-	libxfs_trans_log_inode(tp, rsumip, XFS_ILOG_CORE);
+	libxfs_trans_log_inode(upd.tp, rsumip, XFS_ILOG_CORE);
 
 	error = -libxfs_imeta_commit_update(&upd);
 	if (error)
 		fail(_("Completion of the realtime summary inode failed"),
 				error);
 	mp->m_rsumip = rsumip;
+}
+
+/* Zero the realtime bitmap. */
+static void
+rtbitmap_init(
+	struct xfs_mount	*mp)
+{
+	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
+	struct xfs_trans	*tp;
+	struct xfs_bmbt_irec	*ep;
+	xfs_fileoff_t		bno;
+	uint			blocks;
+	int			i;
+	int			nmap;
+	int			error;
 
-	/* Zero the realtime bitmap. */
 	blocks = mp->m_sb.sb_rbmblocks +
 			XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
 	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
 	if (error)
 		res_failed(error);
 
-	libxfs_trans_ijoin(tp, rbmip, 0);
+	libxfs_trans_ijoin(tp, mp->m_rbmip, 0);
 	bno = 0;
 	while (bno < mp->m_sb.sb_rbmblocks) {
 		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, rbmip, bno,
+		error = -libxfs_bmapi_write(tp, mp->m_rbmip, bno,
 				(xfs_extlen_t)(mp->m_sb.sb_rbmblocks - bno),
 				0, mp->m_sb.sb_rbmblocks, map, &nmap);
 		if (error)
@@ -833,19 +844,34 @@ rtinit(
 	if (error)
 		fail(_("Block allocation of the realtime bitmap inode failed"),
 				error);
+}
+
+/* Zero the realtime summary file. */
+static void
+rtsummary_init(
+	struct xfs_mount	*mp)
+{
+	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
+	struct xfs_trans	*tp;
+	struct xfs_bmbt_irec	*ep;
+	xfs_fileoff_t		bno;
+	xfs_extlen_t		nsumblocks;
+	uint			blocks;
+	int			i;
+	int			nmap;
+	int			error;
 
-	/* Zero the summary file. */
 	nsumblocks = mp->m_rsumsize >> mp->m_sb.sb_blocklog;
 	blocks = nsumblocks + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - 1;
 	error = -libxfs_trans_alloc_rollable(mp, blocks, &tp);
 	if (error)
 		res_failed(error);
-	libxfs_trans_ijoin(tp, rsumip, 0);
+	libxfs_trans_ijoin(tp, mp->m_rsumip, 0);
 
 	bno = 0;
 	while (bno < nsumblocks) {
 		nmap = XFS_BMAP_MAX_NMAP;
-		error = -libxfs_bmapi_write(tp, rsumip, bno,
+		error = -libxfs_bmapi_write(tp, mp->m_rsumip, bno,
 				(xfs_extlen_t)(nsumblocks - bno),
 				0, nsumblocks, map, &nmap);
 		if (error)
@@ -863,8 +889,6 @@ rtinit(
 	if (error)
 		fail(_("Block allocation of the realtime summary inode failed"),
 				error);
-
-	rtfreesp_init(mp);
 }
 
 /*
@@ -903,6 +927,21 @@ rtfreesp_init(
 	}
 }
 
+/*
+ * Allocate the realtime bitmap and summary inodes, and fill in data if any.
+ */
+static void
+rtinit(
+	struct xfs_mount	*mp)
+{
+	rtbitmap_create(mp);
+	rtsummary_create(mp);
+
+	rtbitmap_init(mp);
+	rtsummary_init(mp);
+	rtfreesp_init(mp);
+}
+
 static long
 filesize(
 	int		fd)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/58] xfs: iget for metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 23:31   ` [PATCH 06/58] mkfs: break up the rest of the rtinit() function Darrick J. Wong
@ 2023-12-31 23:31   ` Darrick J. Wong
  2023-12-31 23:31   ` [PATCH 08/58] xfs: define the on-disk format for the metadir feature Darrick J. Wong
                     ` (50 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a xfs_iget_meta function for metadata inodes to ensure that we
always check that the inobt thinks a metadata inode is in use.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c            |    4 ++--
 libxfs/inode.c           |   36 ++++++++++++++++++++++++++++++++++++
 libxfs/libxfs_api_defs.h |    2 ++
 libxfs/xfs_imeta.h       |    5 +++++
 4 files changed, 45 insertions(+), 2 deletions(-)


diff --git a/libxfs/init.c b/libxfs/init.c
index c199aeea45e..c0f148e75a2 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -883,9 +883,9 @@ void
 libxfs_rtmount_destroy(xfs_mount_t *mp)
 {
 	if (mp->m_rsumip)
-		libxfs_irele(mp->m_rsumip);
+		libxfs_imeta_irele(mp->m_rsumip);
 	if (mp->m_rbmip)
-		libxfs_irele(mp->m_rbmip);
+		libxfs_imeta_irele(mp->m_rbmip);
 	mp->m_rsumip = mp->m_rbmip = NULL;
 }
 
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 47c9b9d6bd7..560b127ee9d 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -225,6 +225,35 @@ libxfs_iget(
 	return error;
 }
 
+/*
+ * Get a metadata inode.  The ftype must match exactly.  Caller must supply
+ * a transaction (even if empty) to avoid livelocking if the inobt has a cycle.
+ */
+int
+libxfs_imeta_iget(
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	unsigned char		ftype,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip;
+	int			error;
+
+	error = libxfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, &ip);
+	if (error)
+		return error;
+
+	if (ftype == XFS_DIR3_FT_UNKNOWN ||
+	    xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) {
+		libxfs_irele(ip);
+		return -EFSCORRUPTED;
+	}
+
+	*ipp = ip;
+	return 0;
+}
+
 static void
 libxfs_idestroy(
 	struct xfs_inode	*ip)
@@ -258,6 +287,13 @@ libxfs_irele(
 	}
 }
 
+void
+libxfs_imeta_irele(
+	struct xfs_inode	*ip)
+{
+	libxfs_irele(ip);
+}
+
 static inline void inode_fsgid_set(struct inode *inode,
 				   struct mnt_idmap *idmap)
 {
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 167617771d8..873995f265c 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -177,6 +177,8 @@
 #define xfs_imeta_commit_update		libxfs_imeta_commit_update
 #define xfs_imeta_create		libxfs_imeta_create
 #define xfs_imeta_create_space_res	libxfs_imeta_create_space_res
+#define xfs_imeta_iget			libxfs_imeta_iget
+#define xfs_imeta_irele			libxfs_imeta_irele
 #define xfs_imeta_link			libxfs_imeta_link
 #define xfs_imeta_link_space_res	libxfs_imeta_link_space_res
 #define xfs_imeta_lookup		libxfs_imeta_lookup
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index 7b3da865c09..0a4361bda1c 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -47,4 +47,9 @@ unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
 unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
 
+/* Must be implemented by the libxfs client */
+int xfs_imeta_iget(struct xfs_trans *tp, xfs_ino_t ino, unsigned char ftype,
+		struct xfs_inode **ipp);
+void xfs_imeta_irele(struct xfs_inode *ip);
+
 #endif /* __XFS_IMETA_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/58] xfs: define the on-disk format for the metadir feature
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 23:31   ` [PATCH 07/58] xfs: iget for metadata inodes Darrick J. Wong
@ 2023-12-31 23:31   ` Darrick J. Wong
  2023-12-31 23:31   ` [PATCH 09/58] xfs: update imeta transaction reservations for metadir Darrick J. Wong
                     ` (49 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define the on-disk layout and feature flags for the metadata inode
directory feature.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h     |    5 ++++
 include/xfs_mount.h     |    2 ++
 libxfs/xfs_format.h     |   60 +++++++++++++++++++++++++++++++++++++++++------
 libxfs/xfs_inode_util.c |    2 ++
 libxfs/xfs_sb.c         |    2 ++
 5 files changed, 63 insertions(+), 8 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 5d7bc69e3ff..1fdae6c1d3a 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -386,6 +386,11 @@ static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
 	return false;
 }
 
+static inline bool xfs_is_metadir_inode(struct xfs_inode *ip)
+{
+	return ip->i_diflags2 & XFS_DIFLAG2_METADIR;
+}
+
 extern void	libxfs_trans_inode_alloc_buf (struct xfs_trans *,
 				struct xfs_buf *);
 
diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index da621f3b992..7dfa94dfd9f 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -175,6 +175,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_BIGTIME	(1ULL << 24)	/* large timestamps */
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
+#define XFS_FEAT_METADIR	(1ULL << 27)	/* metadata directory tree */
 
 #define __XFS_HAS_FEAT(name, NAME) \
 static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
@@ -219,6 +220,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
 __XFS_HAS_FEAT(bigtime, BIGTIME)
 __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(metadir, METADIR)
 
 /* Kernel mount features that we don't support */
 #define __XFS_UNSUPP_FEAT(name) \
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index c60af436963..7596b928698 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -174,6 +174,16 @@ typedef struct xfs_sb {
 	xfs_lsn_t	sb_lsn;		/* last write sequence */
 	uuid_t		sb_meta_uuid;	/* metadata file system unique id */
 
+	/* Fields beyond here do not match xfs_dsb.  Be very careful! */
+
+	/*
+	 * Metadata Directory Inode.  On disk this lives in the sb_rbmino slot,
+	 * but we continue to use the in-core superblock to cache the classic
+	 * inodes (rt bitmap; rt summary; user, group, and project quotas) so
+	 * we cache the metadir inode value here too.
+	 */
+	xfs_ino_t	sb_metadirino;
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
@@ -190,7 +200,14 @@ struct xfs_dsb {
 	uuid_t		sb_uuid;	/* user-visible file system unique id */
 	__be64		sb_logstart;	/* starting block of log if internal */
 	__be64		sb_rootino;	/* root inode number */
-	__be64		sb_rbmino;	/* bitmap inode for realtime extents */
+	/*
+	 * bitmap inode for realtime extents.
+	 *
+	 * The metadata directory feature uses the sb_rbmino field to point to
+	 * the root of the metadata directory tree.  All other sb inode
+	 * pointers are no longer used.
+	 */
+	__be64		sb_rbmino;
 	__be64		sb_rsumino;	/* summary inode for rt bitmap */
 	__be32		sb_rextsize;	/* realtime extent size, blocks */
 	__be32		sb_agblocks;	/* size of an allocation group */
@@ -373,6 +390,7 @@ xfs_sb_has_ro_compat_feature(
 #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4)	/* needs xfs_repair */
 #define XFS_SB_FEAT_INCOMPAT_NREXT64	(1 << 5)	/* large extent counters */
 #define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 6)	/* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_METADIR	(1U << 31)	/* metadata dir tree */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
 		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
 		 XFS_SB_FEAT_INCOMPAT_SPINODES|	\
@@ -1109,17 +1127,43 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_REFLINK_BIT	1	/* file's blocks may be shared */
 #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
-#define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
+#define XFS_DIFLAG2_NREXT64_BIT	4	/* large extent counters */
+#define XFS_DIFLAG2_METADIR_BIT	63	/* filesystem metadata */
 
-#define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
-#define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
-#define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
-#define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_DAX		(1ULL << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK	(1ULL << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE	(1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME	(1ULL << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64	(1ULL << XFS_DIFLAG2_NREXT64_BIT)
+
+/*
+ * The inode contains filesystem metadata and can be found through the metadata
+ * directory tree.  Metadata inodes must satisfy the following constraints:
+ *
+ * - V5 filesystem (and ftype) are enabled;
+ * - The only valid modes are regular files and directories;
+ * - The access bits must be zero;
+ * - DMAPI event and state masks are zero;
+ * - The user, group, and project IDs must be zero;
+ * - The immutable, sync, noatime, nodump, nodefrag flags must be set.
+ * - The dax flag must not be set.
+ * - Directories must have nosymlinks set.
+ *
+ * These requirements are chosen defensively to minimize the ability of
+ * userspace to read or modify the contents, should a metadata file ever
+ * escape to userspace.
+ *
+ * There are further constraints on the directory tree itself:
+ *
+ * - Metadata inodes must never be resolvable through the root directory;
+ * - They must never be accessed by userspace;
+ * - Metadata directory entries must have correct ftype.
+ */
+#define XFS_DIFLAG2_METADIR	(1ULL << XFS_DIFLAG2_METADIR_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADIR)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index 19d26950db9..b63aacb6971 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -222,6 +222,8 @@ xfs_inode_inherit_flags2(
 	}
 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+	if (pip->i_diflags2 & XFS_DIFLAG2_METADIR)
+		ip->i_diflags2 |= XFS_DIFLAG2_METADIR;
 
 	/* Don't let invalid cowextsize hints propagate. */
 	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index d150170d87b..49d62281995 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -176,6 +176,8 @@ xfs_sb_version_to_features(
 		features |= XFS_FEAT_NREXT64;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
 		features |= XFS_FEAT_PARENT;
+	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+		features |= XFS_FEAT_METADIR;
 
 	return features;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/58] xfs: update imeta transaction reservations for metadir
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 23:31   ` [PATCH 08/58] xfs: define the on-disk format for the metadir feature Darrick J. Wong
@ 2023-12-31 23:31   ` Darrick J. Wong
  2023-12-31 23:32   ` [PATCH 10/58] xfs: load metadata directory root at mount time Darrick J. Wong
                     ` (48 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:31 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the new metadata inode transaction reservations to handle
metadata directories if that feature is enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_trans_resv.c |  101 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index 5a1ad959870..a46360e2bdb 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -1106,6 +1106,81 @@ xfs_calc_sb_reservation(
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
 }
 
+/*
+ * Metadata inode creation needs enough space to create or mkdir a directory,
+ * plus logging the superblock.
+ */
+static unsigned int
+xfs_calc_imeta_create_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_create.tr_logres;
+	return ret;
+}
+
+/* Metadata inode creation needs enough rounds to create or mkdir a directory */
+static int
+xfs_calc_imeta_create_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_create.tr_logcount;
+}
+
+/*
+ * Metadata inode link needs enough space to add a file plus logging the
+ * superblock.
+ */
+static unsigned int
+xfs_calc_imeta_link_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_link.tr_logres;
+	return ret;
+}
+
+/* Metadata inode linking needs enough rounds to remove a file. */
+static int
+xfs_calc_imeta_link_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_link.tr_logcount;
+}
+
+/*
+ * Metadata inode unlink needs enough space to remove a file plus logging the
+ * superblock.
+ */
+static unsigned int
+xfs_calc_imeta_unlink_resv(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	unsigned int		ret;
+
+	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+	ret += resp->tr_remove.tr_logres;
+	return ret;
+}
+
+/* Metadata inode unlinking needs enough rounds to remove a file. */
+static int
+xfs_calc_imeta_unlink_count(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	return resp->tr_remove.tr_logcount;
+}
+
 /*
  * Namespace reservations.
  *
@@ -1248,7 +1323,27 @@ xfs_trans_resv_calc(
 	resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
 
 	/* metadata inode creation and unlink */
-	resp->tr_imeta_create = resp->tr_create;
-	resp->tr_imeta_link = resp->tr_link;
-	resp->tr_imeta_unlink = resp->tr_remove;
+	if (xfs_has_metadir(mp)) {
+		resp->tr_imeta_create.tr_logres =
+				xfs_calc_imeta_create_resv(mp, resp);
+		resp->tr_imeta_create.tr_logcount =
+				xfs_calc_imeta_create_count(mp, resp);
+		resp->tr_imeta_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+		resp->tr_imeta_link.tr_logres =
+				xfs_calc_imeta_link_resv(mp, resp);
+		resp->tr_imeta_link.tr_logcount =
+				xfs_calc_imeta_link_count(mp, resp);
+		resp->tr_imeta_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+		resp->tr_imeta_unlink.tr_logres =
+				xfs_calc_imeta_unlink_resv(mp, resp);
+		resp->tr_imeta_unlink.tr_logcount =
+				xfs_calc_imeta_unlink_count(mp, resp);
+		resp->tr_imeta_unlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+	} else {
+		resp->tr_imeta_create = resp->tr_create;
+		resp->tr_imeta_link = resp->tr_link;
+		resp->tr_imeta_unlink = resp->tr_remove;
+	}
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/58] xfs: load metadata directory root at mount time
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 23:31   ` [PATCH 09/58] xfs: update imeta transaction reservations for metadir Darrick J. Wong
@ 2023-12-31 23:32   ` Darrick J. Wong
  2023-12-31 23:32   ` [PATCH 11/58] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
                     ` (47 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Load the metadata directory root inode into memory at mount time and
release it at unmount time.  We also make sure that the obsolete inode
pointers in the superblock are not logged or read from the superblock.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h |    1 +
 libxfs/init.c       |   13 +++++++++++++
 libxfs/xfs_sb.c     |   31 +++++++++++++++++++++++++++++++
 libxfs/xfs_types.c  |    2 +-
 4 files changed, 46 insertions(+), 1 deletion(-)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 7dfa94dfd9f..8952869d89a 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -66,6 +66,7 @@ typedef struct xfs_mount {
 	uint8_t			*m_rsum_cache;
 	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
 	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
+	struct xfs_inode	*m_metadirip;	/* ptr to metadata directory */
 	struct xfs_buftarg	*m_ddev_targp;
 	struct xfs_buftarg	*m_logdev_targp;
 	struct xfs_buftarg	*m_rtdev_targp;
diff --git a/libxfs/init.c b/libxfs/init.c
index c0f148e75a2..67f9e1fe99b 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -685,6 +685,17 @@ libxfs_mountfs_imeta(
 	if (error)
 		return;
 
+	if (xfs_has_metadir(mp)) {
+		error = -libxfs_imeta_iget(tp, mp->m_sb.sb_metadirino,
+				XFS_DIR3_FT_DIR, &mp->m_metadirip);
+		if (error) {
+			fprintf(stderr,
+ _("%s: Failed to load metadir root directory, error %d\n"),
+					progname, error);
+			goto err_cancel;
+		}
+	}
+
 	error = -xfs_imeta_mount(tp);
 	if (error) {
 		fprintf(stderr,
@@ -991,6 +1002,8 @@ libxfs_umount(
 	int			error;
 
 	libxfs_rtmount_destroy(mp);
+	if (mp->m_metadirip)
+		libxfs_imeta_irele(mp->m_metadirip);
 
 	/*
 	 * Purge the buffer cache to write all dirty buffers to disk and free
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 49d62281995..b74a170605d 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -680,6 +680,25 @@ __xfs_sb_from_disk(
 	/* Convert on-disk flags to in-memory flags? */
 	if (convert_xquota)
 		xfs_sb_quota_from_disk(to);
+
+	if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+		/*
+		 * Set metadirino here and null out the in-core fields for
+		 * the other inodes because metadir initialization will load
+		 * them later.
+		 */
+		to->sb_metadirino = be64_to_cpu(from->sb_rbmino);
+		to->sb_rbmino = NULLFSINO;
+		to->sb_rsumino = NULLFSINO;
+
+		/*
+		 * We don't have to worry about quota inode conversion here
+		 * because metadir requires a v5 filesystem.
+		 */
+		to->sb_uquotino = NULLFSINO;
+		to->sb_gquotino = NULLFSINO;
+		to->sb_pquotino = NULLFSINO;
+	}
 }
 
 void
@@ -827,6 +846,18 @@ xfs_sb_to_disk(
 	to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
 		uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+
+	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+		/*
+		 * Save metadirino here and null out the on-disk fields for
+		 * the other inodes, at least until we reuse the fields.
+		 */
+		to->sb_rbmino = cpu_to_be64(from->sb_metadirino);
+		to->sb_rsumino = cpu_to_be64(NULLFSINO);
+		to->sb_uquotino = cpu_to_be64(NULLFSINO);
+		to->sb_gquotino = cpu_to_be64(NULLFSINO);
+		to->sb_pquotino = cpu_to_be64(NULLFSINO);
+	}
 }
 
 /*
diff --git a/libxfs/xfs_types.c b/libxfs/xfs_types.c
index 88720b297d7..f5eab8839e3 100644
--- a/libxfs/xfs_types.c
+++ b/libxfs/xfs_types.c
@@ -128,7 +128,7 @@ xfs_verify_dir_ino(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
 {
-	if (xfs_internal_inum(mp, ino))
+	if (!xfs_has_metadir(mp) && xfs_internal_inum(mp, ino))
 		return false;
 	return xfs_verify_ino(mp, ino);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/58] xfs: convert metadata inode lookup keys to use paths
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 23:32   ` [PATCH 10/58] xfs: load metadata directory root at mount time Darrick J. Wong
@ 2023-12-31 23:32   ` Darrick J. Wong
  2023-12-31 23:32   ` [PATCH 12/58] xfs: enforce metadata inode flag Darrick J. Wong
                     ` (46 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the magic metadata inode lookup keys to use actual strings
for paths.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_imeta.c |   48 ++++++++++++++++++++++++++----------------------
 libxfs/xfs_imeta.h |   27 +++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 24 deletions(-)


diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index 086c250a3c2..e59b0f414ed 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -48,26 +48,17 @@
  */
 
 /* Static metadata inode paths */
-
-const struct xfs_imeta_path XFS_IMETA_RTBITMAP = {
-	.bogus = 0,
-};
-
-const struct xfs_imeta_path XFS_IMETA_RTSUMMARY = {
-	.bogus = 1,
-};
-
-const struct xfs_imeta_path XFS_IMETA_USRQUOTA = {
-	.bogus = 2,
-};
-
-const struct xfs_imeta_path XFS_IMETA_GRPQUOTA = {
-	.bogus = 3,
-};
-
-const struct xfs_imeta_path XFS_IMETA_PRJQUOTA = {
-	.bogus = 4,
-};
+static const unsigned char *rtbitmap_path[]	= {"realtime", "bitmap"};
+static const unsigned char *rtsummary_path[]	= {"realtime", "summary"};
+static const unsigned char *usrquota_path[]	= {"quota", "user"};
+static const unsigned char *grpquota_path[]	= {"quota", "group"};
+static const unsigned char *prjquota_path[]	= {"quota", "project"};
+
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_RTBITMAP,	rtbitmap_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_RTSUMMARY,	rtsummary_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_USRQUOTA,	usrquota_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_GRPQUOTA,	grpquota_path);
+XFS_IMETA_DEFINE_PATH(XFS_IMETA_PRJQUOTA,	prjquota_path);
 
 /* Are these two paths equal? */
 STATIC bool
@@ -75,7 +66,20 @@ xfs_imeta_path_compare(
 	const struct xfs_imeta_path	*a,
 	const struct xfs_imeta_path	*b)
 {
-	return a == b;
+	unsigned int			i;
+
+	if (a == b)
+		return true;
+
+	if (a->im_depth != b->im_depth)
+		return false;
+
+	for (i = 0; i < a->im_depth; i++)
+		if (a->im_path[i] != b->im_path[i] &&
+		    strcmp(a->im_path[i], b->im_path[i]))
+			return false;
+
+	return true;
 }
 
 /* Is this path ok? */
@@ -83,7 +87,7 @@ static inline bool
 xfs_imeta_path_check(
 	const struct xfs_imeta_path	*path)
 {
-	return true;
+	return path->im_depth <= XFS_IMETA_MAX_DEPTH;
 }
 
 /* Functions for storing and retrieving superblock inode values. */
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index 0a4361bda1c..60e0f6a6c13 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -6,10 +6,23 @@
 #ifndef __XFS_IMETA_H__
 #define __XFS_IMETA_H__
 
+/* How deep can we nest metadata dirs? */
+#define XFS_IMETA_MAX_DEPTH	64
+
+/* Form an imeta path from a simple array of strings. */
+#define XFS_IMETA_DEFINE_PATH(name, path) \
+const struct xfs_imeta_path name = { \
+	.im_path = (path), \
+	.im_depth = ARRAY_SIZE(path), \
+}
+
 /* Key for looking up metadata inodes. */
 struct xfs_imeta_path {
-	/* Temporary: integer to keep the static imeta definitions unique */
-	int		bogus;
+	/* Array of string pointers. */
+	const unsigned char	**im_path;
+
+	/* Number of strings in path. */
+	uint8_t			im_depth;
 };
 
 /* Cleanup widget for metadata inode creation and deletion. */
@@ -25,6 +38,16 @@ struct xfs_imeta_update {
 	unsigned int		ip_locked:1;
 };
 
+/* Grab the last path component, mostly for tracing. */
+static inline const unsigned char *
+xfs_imeta_lastpath(
+	const struct xfs_imeta_update	*upd)
+{
+	if (upd->path && upd->path->im_path && upd->path->im_depth > 0)
+		return upd->path->im_path[upd->path->im_depth - 1];
+	return "?";
+}
+
 /* Lookup keys for static metadata inodes. */
 extern const struct xfs_imeta_path XFS_IMETA_RTBITMAP;
 extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/58] xfs: enforce metadata inode flag
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 23:32   ` [PATCH 11/58] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
@ 2023-12-31 23:32   ` Darrick J. Wong
  2023-12-31 23:32   ` [PATCH 13/58] xfs: allow deletion of metadata directory files Darrick J. Wong
                     ` (45 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add checks for the metadata inode flag so that we don't ever leak
metadata inodes out to userspace, and we don't ever try to read a
regular inode as metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/inode.c         |    5 +++
 libxfs/xfs_inode_buf.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_inode_buf.h |    3 ++
 3 files changed, 80 insertions(+), 1 deletion(-)


diff --git a/libxfs/inode.c b/libxfs/inode.c
index 560b127ee9d..5cb2fd7891a 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -244,7 +244,8 @@ libxfs_imeta_iget(
 	if (error)
 		return error;
 
-	if (ftype == XFS_DIR3_FT_UNKNOWN ||
+	if ((xfs_has_metadir(mp) && !xfs_is_metadir_inode(ip)) ||
+	    ftype == XFS_DIR3_FT_UNKNOWN ||
 	    xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) {
 		libxfs_irele(ip);
 		return -EFSCORRUPTED;
@@ -291,6 +292,8 @@ void
 libxfs_imeta_irele(
 	struct xfs_inode	*ip)
 {
+	ASSERT(!xfs_has_metadir(ip->i_mount) || xfs_is_metadir_inode(ip));
+
 	libxfs_irele(ip);
 }
 
diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index aee581d53c8..8d100595756 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -457,6 +457,73 @@ xfs_dinode_verify_nrext64(
 	return NULL;
 }
 
+/*
+ * Validate all the picky requirements we have for a file that claims to be
+ * filesystem metadata.
+ */
+xfs_failaddr_t
+xfs_dinode_verify_metadir(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip,
+	uint16_t		mode,
+	uint16_t		flags,
+	uint64_t		flags2)
+{
+	if (!xfs_has_metadir(mp))
+		return __this_address;
+
+	/* V5 filesystem only */
+	if (dip->di_version < 3)
+		return __this_address;
+
+	/* V3 inode fields that are always zero */
+	if (dip->di_onlink)
+		return __this_address;
+	if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad)
+		return __this_address;
+	if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter)
+		return __this_address;
+
+	/* Metadata files can only be directories or regular files */
+	if (!S_ISDIR(mode) && !S_ISREG(mode))
+		return __this_address;
+
+	/* They must have zero access permissions */
+	if (mode & 0777)
+		return __this_address;
+
+	/* DMAPI event and state masks are zero */
+	if (dip->di_dmevmask || dip->di_dmstate)
+		return __this_address;
+
+	/* User, group, and project IDs must be zero */
+	if (dip->di_uid || dip->di_gid ||
+	    dip->di_projid_lo || dip->di_projid_hi)
+		return __this_address;
+
+	/* Immutable, sync, noatime, nodump, and nodefrag flags must be set */
+	if (!(flags & XFS_DIFLAG_IMMUTABLE))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_SYNC))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NOATIME))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NODUMP))
+		return __this_address;
+	if (!(flags & XFS_DIFLAG_NODEFRAG))
+		return __this_address;
+
+	/* Directories must have nosymlinks flags set */
+	if (S_ISDIR(mode) && !(flags & XFS_DIFLAG_NOSYMLINKS))
+		return __this_address;
+
+	/* dax flags2 must not be set */
+	if (flags2 & XFS_DIFLAG2_DAX)
+		return __this_address;
+
+	return NULL;
+}
+
 xfs_failaddr_t
 xfs_dinode_verify(
 	struct xfs_mount	*mp,
@@ -621,6 +688,12 @@ xfs_dinode_verify(
 	    !xfs_has_bigtime(mp))
 		return __this_address;
 
+	if (flags2 & XFS_DIFLAG2_METADIR) {
+		fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2);
+		if (fa)
+			return fa;
+	}
+
 	return NULL;
 }
 
diff --git a/libxfs/xfs_inode_buf.h b/libxfs/xfs_inode_buf.h
index 585ed5a110a..8d43d2641c7 100644
--- a/libxfs/xfs_inode_buf.h
+++ b/libxfs/xfs_inode_buf.h
@@ -28,6 +28,9 @@ int	xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
 
 xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
 			   struct xfs_dinode *dip);
+xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp,
+		struct xfs_dinode *dip, uint16_t mode, uint16_t flags,
+		uint64_t flags2);
 xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
 		uint32_t extsize, uint16_t mode, uint16_t flags);
 xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/58] xfs: allow deletion of metadata directory files
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 23:32   ` [PATCH 12/58] xfs: enforce metadata inode flag Darrick J. Wong
@ 2023-12-31 23:32   ` Darrick J. Wong
  2023-12-31 23:33   ` [PATCH 14/58] xfs: read and write metadata inode directory Darrick J. Wong
                     ` (44 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:32 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it possible to free metadata files once we've unlinked them from
the directory structure.  We don't do this in the kernel, at least not
yet, but don't leave a logic bomb for later.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h      |    3 +
 libxfs/inode.c           |   93 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/libxfs_api_defs.h |    1 
 libxfs/xfs_imeta.c       |   49 ++++++++++++++++++++++++
 4 files changed, 145 insertions(+), 1 deletion(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 1fdae6c1d3a..4aacc488fa5 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -418,4 +418,7 @@ extern void	libxfs_irele(struct xfs_inode *ip);
 #define xfs_inherit_nosymlinks	(false)
 #define xfs_inherit_nodefrag	(false)
 
+int libxfs_ifree_cluster(struct xfs_trans *tp, struct xfs_perag *pag,
+		struct xfs_inode *free_ip, struct xfs_icluster *xic);
+
 #endif /* __XFS_INODE_H__ */
diff --git a/libxfs/inode.c b/libxfs/inode.c
index 5cb2fd7891a..87b5df84f2a 100644
--- a/libxfs/inode.c
+++ b/libxfs/inode.c
@@ -317,3 +317,96 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
 		inode_fsgid_set(inode, idmap);
 	inode->i_mode = mode;
 }
+
+/*
+ * This call is used to indicate that the buffer is going to
+ * be staled and was an inode buffer. This means it gets
+ * special processing during unpin - where any inodes
+ * associated with the buffer should be removed from ail.
+ * There is also special processing during recovery,
+ * any replay of the inodes in the buffer needs to be
+ * prevented as the buffer may have been reused.
+ */
+static void
+xfs_trans_stale_inode_buf(
+	xfs_trans_t		*tp,
+	struct xfs_buf		*bp)
+{
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bp->b_flags |= _XBF_INODES;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
+int
+libxfs_ifree_cluster(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_inode	*free_ip,
+	struct xfs_icluster	*xic)
+{
+	struct xfs_mount	*mp = free_ip->i_mount;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	xfs_ino_t		inum = xic->first_ino;
+	int			nbufs;
+	int			j;
+	int			ioffset;
+	int			error;
+
+	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
+
+	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
+		/*
+		 * The allocation bitmap tells us which inodes of the chunk were
+		 * physically allocated. Skip the cluster if an inode falls into
+		 * a sparse region.
+		 */
+		ioffset = inum - xic->first_ino;
+		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
+			continue;
+		}
+
+		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
+					 XFS_INO_TO_AGBNO(mp, inum));
+
+		/*
+		 * We obtain and lock the backing buffer first in the process
+		 * here to ensure dirty inodes attached to the buffer remain in
+		 * the flushing state while we mark them stale.
+		 *
+		 * If we scan the in-memory inodes first, then buffer IO can
+		 * complete before we get a lock on it, and hence we may fail
+		 * to mark all the active inodes on the buffer stale.
+		 */
+		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+				mp->m_bsize * igeo->blocks_per_cluster,
+				XBF_UNMAPPED, &bp);
+		if (error)
+			return error;
+
+		/*
+		 * This buffer may not have been correctly initialised as we
+		 * didn't read it from disk. That's not important because we are
+		 * only using to mark the buffer as stale in the log, and to
+		 * attach stale cached inodes on it. That means it will never be
+		 * dispatched for IO. If it is, we want to know about it, and we
+		 * want it to fail. We can acheive this by adding a write
+		 * verifier to the buffer.
+		 */
+		bp->b_ops = &xfs_inode_buf_ops;
+
+		xfs_trans_stale_inode_buf(tp, bp);
+		xfs_trans_binval(tp, bp);
+	}
+	return 0;
+}
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 873995f265c..a0cdad40ff9 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -171,6 +171,7 @@
 #define xfs_iext_lookup_extent		libxfs_iext_lookup_extent
 #define xfs_iext_next			libxfs_iext_next
 #define xfs_ifork_zap_attr		libxfs_ifork_zap_attr
+#define xfs_ifree_cluster		libxfs_ifree_cluster
 #define xfs_imap_to_bp			libxfs_imap_to_bp
 
 #define xfs_imeta_cancel_update		libxfs_imeta_cancel_update
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index e59b0f414ed..672aba4d0e7 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -22,6 +22,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_trans_space.h"
+#include "xfs_ag.h"
 
 /*
  * Metadata File Management
@@ -359,6 +360,38 @@ xfs_imeta_create(
 	return error;
 }
 
+/* Free a file from the metadata directory tree. */
+STATIC int
+xfs_imeta_ifree(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+	struct xfs_icluster	xic = { 0 };
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(ip->i_df.if_nextents == 0);
+	ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
+	ASSERT(ip->i_nblocks == 0);
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	error = xfs_dir_ifree(tp, pag, ip, &xic);
+	if (error)
+		goto out;
+
+	/* Metadata files do not support ownership changes or DMAPI. */
+
+	if (xic.deleted)
+		error = xfs_ifree_cluster(tp, pag, ip, &xic);
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
 /*
  * Unlink a metadata inode @upd->ip from the metadata directory given by @path.
  * The path must already exist.
@@ -367,10 +400,24 @@ int
 xfs_imeta_unlink(
 	struct xfs_imeta_update		*upd)
 {
+	int				error;
+
 	ASSERT(xfs_imeta_path_check(upd->path));
 	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
 
-	return xfs_imeta_sb_unlink(upd);
+	error = xfs_imeta_sb_unlink(upd);
+	if (error)
+		return error;
+
+	/*
+	 * Metadata files require explicit resource cleanup.  In other words,
+	 * the inactivation system will not touch these files, so we must free
+	 * the ondisk inode by ourselves if warranted.
+	 */
+	if (VFS_I(upd->ip)->i_nlink > 0)
+		return 0;
+
+	return xfs_imeta_ifree(upd->tp, upd->ip);
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/58] xfs: read and write metadata inode directory
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 23:32   ` [PATCH 13/58] xfs: allow deletion of metadata directory files Darrick J. Wong
@ 2023-12-31 23:33   ` Darrick J. Wong
  2023-12-31 23:33   ` [PATCH 15/58] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
                     ` (43 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Plumb in the bits we need to look up metadata inode numbers from the
metadata inode directory and save them back.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h      |    5 
 include/xfs_trace.h      |    5 
 include/xfs_trans.h      |    3 
 libxfs/imeta_utils.c     |   66 +++++
 libxfs/libxfs_api_defs.h |    2 
 libxfs/libxfs_priv.h     |    2 
 libxfs/trans.c           |   33 +++
 libxfs/xfs_imeta.c       |  563 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h       |   17 +
 libxfs/xfs_inode_util.c  |    3 
 libxfs/xfs_trans_resv.c  |    8 +
 11 files changed, 695 insertions(+), 12 deletions(-)


diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index 4aacc488fa5..2675abdffcd 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -32,6 +32,11 @@ static inline kgid_t make_kgid(gid_t gid)
 	return v;
 }
 
+#define KUIDT_INIT(value) (kuid_t){ value }
+#define KGIDT_INIT(value) (kgid_t){ value }
+#define GLOBAL_ROOT_UID KUIDT_INIT(0)
+#define GLOBAL_ROOT_GID KGIDT_INIT(0)
+
 /* These match kernel side includes */
 #include "xfs_inode_buf.h"
 #include "xfs_inode_fork.h"
diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index dd6011af90e..2cca9394b70 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -360,6 +360,11 @@
 
 #define trace_xlog_intent_recovery_failed(...)	((void) 0)
 
+#define trace_xfs_imeta_dir_link(...)		((void) 0)
+#define trace_xfs_imeta_dir_lookup(...)		((void) 0)
+#define trace_xfs_imeta_dir_try_create(...)	((void) 0)
+#define trace_xfs_imeta_dir_create(...)		((void) 0)
+#define trace_xfs_imeta_dir_unlink(...)		((void) 0)
 #define trace_xfs_imeta_teardown(...)		((void) 0)
 #define trace_xfs_imeta_sb_create(...)		((void) 0)
 #define trace_xfs_imeta_sb_link(...)		((void) 0)
diff --git a/include/xfs_trans.h b/include/xfs_trans.h
index b7f01ff073c..183163e81a5 100644
--- a/include/xfs_trans.h
+++ b/include/xfs_trans.h
@@ -93,6 +93,9 @@ int	libxfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
 int	libxfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
 			unsigned int dblocks, unsigned int rblocks, bool force,
 			struct xfs_trans **tpp);
+int	libxfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+			struct xfs_inode *ip, unsigned int dblocks,
+			struct xfs_trans **tpp);
 int	libxfs_trans_alloc_rollable(struct xfs_mount *mp, uint blocks,
 				    struct xfs_trans **tpp);
 int	libxfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp);
diff --git a/libxfs/imeta_utils.c b/libxfs/imeta_utils.c
index f3165b5eed3..ce6000530d3 100644
--- a/libxfs/imeta_utils.c
+++ b/libxfs/imeta_utils.c
@@ -20,6 +20,7 @@
 #include "xfs_sb.h"
 #include "xfs_imeta.h"
 #include "xfs_trace.h"
+#include "xfs_parent.h"
 #include "imeta_utils.h"
 
 /* Initialize a metadata update structure. */
@@ -29,10 +30,33 @@ xfs_imeta_init(
 	const struct xfs_imeta_path	*path,
 	struct xfs_imeta_update		*upd)
 {
+	struct xfs_trans		*tp;
+	int				error;
+
 	memset(upd, 0, sizeof(struct xfs_imeta_update));
 	upd->mp = mp;
 	upd->path = path;
-	return 0;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	/*
+	 * Find the parent of the last path component.  If the parent path does
+	 * not exist, we consider this corruption because paths are supposed
+	 * to exist.  For example, if the path is /quota/user, we require that
+	 * /quota already exists.
+	 */
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+	error = xfs_imeta_dir_parent(tp, upd->path, &upd->dp);
+	xfs_trans_cancel(tp);
+	if (error == -ENOENT)
+		return -EFSCORRUPTED;
+	if (error)
+		return error;
+
+	return xfs_parent_start(mp, &upd->ppargs);
 }
 
 /*
@@ -47,11 +71,25 @@ xfs_imeta_teardown(
 {
 	trace_xfs_imeta_teardown(upd, error);
 
+	if (upd->ppargs) {
+		xfs_parent_finish(upd->mp, upd->ppargs);
+		upd->ppargs = NULL;
+	}
+
 	if (upd->ip) {
 		if (upd->ip_locked)
 			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
 		upd->ip_locked = false;
 	}
+
+	if (upd->dp) {
+		if (upd->dp_locked)
+			xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+		upd->dp_locked = false;
+
+		xfs_imeta_irele(upd->dp);
+		upd->dp = NULL;
+	}
 }
 
 /*
@@ -75,6 +113,15 @@ xfs_imeta_start_create(
 	if (error)
 		goto out_teardown;
 
+	/*
+	 * Lock the parent directory if there is one.  We can't ijoin it to
+	 * the transaction until after the child file has been created.
+	 */
+	if (upd->dp) {
+		xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+		upd->dp_locked = true;
+	}
+
 	trace_xfs_imeta_start_create(upd);
 	return 0;
 out_teardown:
@@ -103,10 +150,19 @@ xfs_imeta_start_dir_update(
 
 	upd->ip = ip;
 
-	error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0, false,
-			&upd->tp);
-	if (error)
-		goto out_teardown;
+	if (upd->dp) {
+		error = xfs_trans_alloc_dir(upd->dp, tr_resv, upd->ip,
+				resblks, &upd->tp);
+		if (error)
+			goto out_teardown;
+
+		upd->dp_locked = true;
+	} else {
+		error = xfs_trans_alloc_inode(upd->ip, tr_resv, resblks, 0,
+				false, &upd->tp);
+		if (error)
+			goto out_teardown;
+	}
 
 	upd->ip_locked = true;
 	return 0;
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index a0cdad40ff9..90304da8983 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -184,6 +184,7 @@
 #define xfs_imeta_link_space_res	libxfs_imeta_link_space_res
 #define xfs_imeta_lookup		libxfs_imeta_lookup
 #define xfs_imeta_mount			libxfs_imeta_mount
+#define xfs_imeta_set_metaflag		libxfs_imeta_set_metaflag
 #define xfs_imeta_start_create		libxfs_imeta_start_create
 #define xfs_imeta_start_link		libxfs_imeta_start_link
 #define xfs_imeta_start_unlink		libxfs_imeta_start_unlink
@@ -289,6 +290,7 @@
 #define xfs_trans_add_item		libxfs_trans_add_item
 #define xfs_trans_alloc_empty		libxfs_trans_alloc_empty
 #define xfs_trans_alloc			libxfs_trans_alloc
+#define xfs_trans_alloc_dir		libxfs_trans_alloc_dir
 #define xfs_trans_alloc_inode		libxfs_trans_alloc_inode
 #define xfs_trans_bhold			libxfs_trans_bhold
 #define xfs_trans_bhold_release		libxfs_trans_bhold_release
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 9bca059776d..f4614ee9631 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -175,6 +175,8 @@ enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC };
 
 #define XFS_ERRLEVEL_LOW		1
 #define XFS_ILOCK_EXCL			0
+#define XFS_IOLOCK_SHARED		0
+#define XFS_IOLOCK_EXCL			0
 #define XFS_STATS_INC(mp, count)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_DEC(mp, count, x)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_ADD(mp, count, x)	do { (mp) = (mp); } while (0)
diff --git a/libxfs/trans.c b/libxfs/trans.c
index 7fec2caff49..8d969400984 100644
--- a/libxfs/trans.c
+++ b/libxfs/trans.c
@@ -1184,6 +1184,39 @@ libxfs_trans_alloc_inode(
 	return 0;
 }
 
+/*
+ * Allocate an transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update.
+ *
+ * The ILOCKs will be dropped when the transaction is committed or cancelled.
+ *
+ * Caller is responsible for unlocking the inodes manually upon return
+ */
+int
+libxfs_trans_alloc_dir(
+	struct xfs_inode	*dp,
+	struct xfs_trans_res	*resv,
+	struct xfs_inode	*ip,
+	unsigned int		resblks,
+	struct xfs_trans	**tpp)
+{
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, dp, 0);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	*tpp = tp;
+	return 0;
+}
+
 /*
  * Try to reserve more blocks for a transaction.  The single use case we
  * support is for offline repair -- use a transaction to gather data without
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index 672aba4d0e7..b1c5c6ec5e6 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -23,6 +23,8 @@
 #include "xfs_da_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
 
 /*
  * Metadata File Management
@@ -43,9 +45,16 @@
  * this structure must be passed to xfs_imeta_end_update to free resources that
  * cannot be freed during the transaction.
  *
- * Right now we only support callers passing in the predefined metadata inode
- * paths; the goal is that callers will some day locate metadata inodes based
- * on path lookups into a metadata directory structure.
+ * When the metadata directory tree (metadir) feature is enabled, we can create
+ * a complex directory tree in which to store metadata inodes.  Inodes within
+ * the metadata directory tree should have the "metadata" inode flag set to
+ * prevent them from being exposed to the outside world.
+ *
+ * Callers are not expected to take the IOLOCK of metadata directories.  They
+ * are expected to take the ILOCK of any inode in the metadata directory tree
+ * (just like the regular to synchronize access to that inode.  It is not
+ * necessary to take the MMAPLOCK since metadata inodes should never be exposed
+ * to user space.
  */
 
 /* Static metadata inode paths */
@@ -61,6 +70,11 @@ XFS_IMETA_DEFINE_PATH(XFS_IMETA_USRQUOTA,	usrquota_path);
 XFS_IMETA_DEFINE_PATH(XFS_IMETA_GRPQUOTA,	grpquota_path);
 XFS_IMETA_DEFINE_PATH(XFS_IMETA_PRJQUOTA,	prjquota_path);
 
+const struct xfs_imeta_path XFS_IMETA_METADIR = {
+	.im_depth = 0,
+	.im_ftype = XFS_DIR3_FT_DIR,
+};
+
 /* Are these two paths equal? */
 STATIC bool
 xfs_imeta_path_compare(
@@ -118,6 +132,10 @@ static const struct xfs_imeta_sbmap {
 		.path	= &XFS_IMETA_PRJQUOTA,
 		.offset	= offsetof(struct xfs_sb, sb_pquotino),
 	},
+	{
+		.path	= &XFS_IMETA_METADIR,
+		.offset	= offsetof(struct xfs_sb, sb_metadirino),
+	},
 	{ NULL, 0 },
 };
 
@@ -287,6 +305,521 @@ xfs_imeta_sb_link(
 	return 0;
 }
 
+/* Functions for storing and retrieving metadata directory inode values. */
+
+static inline void
+xfs_imeta_set_xname(
+	struct xfs_name			*xname,
+	const struct xfs_imeta_path	*path,
+	unsigned int			path_idx,
+	unsigned char			ftype)
+{
+	xname->name = (const unsigned char *)path->im_path[path_idx];
+	xname->len = strlen(path->im_path[path_idx]);
+	xname->type = ftype;
+}
+
+/*
+ * Look up the inode number and filetype for an exact name in a directory.
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_imeta_dir_lookup(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_name		*xname,
+	xfs_ino_t		*ino)
+{
+	struct xfs_da_args	args = {
+		.trans		= tp,
+		.dp		= dp,
+		.geo		= dp->i_mount->m_dir_geo,
+		.name		= xname->name,
+		.namelen	= xname->len,
+		.hashval	= xfs_dir2_hashname(dp->i_mount, xname),
+		.whichfork	= XFS_DATA_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+		.owner		= dp->i_ino,
+	};
+	bool			isblock, isleaf;
+	int			error;
+
+	if (xfs_is_shutdown(dp->i_mount))
+		return -EIO;
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_lookup(&args);
+		goto out_unlock;
+	}
+
+	/* dir2 functions require that the data fork is loaded */
+	error = xfs_iread_extents(tp, dp, XFS_DATA_FORK);
+	if (error)
+		goto out_unlock;
+
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		goto out_unlock;
+
+	if (isblock) {
+		error = xfs_dir2_block_lookup(&args);
+		goto out_unlock;
+	}
+
+	error = xfs_dir2_isleaf(&args, &isleaf);
+	if (error)
+		goto out_unlock;
+
+	if (isleaf) {
+		error = xfs_dir2_leaf_lookup(&args);
+		goto out_unlock;
+	}
+
+	error = xfs_dir2_node_lookup(&args);
+
+out_unlock:
+	if (error == -EEXIST)
+		error = 0;
+	if (error)
+		return error;
+
+	*ino = args.inumber;
+	xname->type = args.filetype;
+	return 0;
+}
+
+/*
+ * Given a parent directory @dp and a metadata inode path component @xname,
+ * Look up the inode number in the directory, returning it in @ino.
+ * @xname.type must match the directory entry's ftype.
+ *
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_imeta_dir_lookup_component(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_name		*xname,
+	xfs_ino_t		*ino)
+{
+	int			type_wanted = xname->type;
+	int			error;
+
+	if (!S_ISDIR(VFS_I(dp)->i_mode))
+		return -EFSCORRUPTED;
+
+	error = xfs_imeta_dir_lookup(tp, dp, xname, ino);
+	if (error)
+		return error;
+	if (!xfs_verify_ino(dp->i_mount, *ino))
+		return -EFSCORRUPTED;
+	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted)
+		return -EFSCORRUPTED;
+
+	trace_xfs_imeta_dir_lookup(dp, xname, *ino);
+	return 0;
+}
+
+/*
+ * Traverse a metadata directory tree path, returning the inode corresponding
+ * to the parent of the last path component.  If any of the path components do
+ * not exist, return -ENOENT.  Caller must supply a transaction to avoid
+ * livelocks on btree cycles.
+ *
+ * @dp is returned without any locks held.
+ */
+int
+xfs_imeta_dir_parent(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	struct xfs_inode		**dpp)
+{
+	struct xfs_name			xname;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_inode		*dp = NULL;
+	xfs_ino_t			ino;
+	unsigned int			i;
+	int				error;
+
+	/* Caller wanted the root, we're done! */
+	if (path->im_depth == 0)
+		goto out;
+
+	/* No metadata directory means no parent. */
+	if (mp->m_metadirip == NULL)
+		return -ENOENT;
+
+	/* Grab a new reference to the metadir root dir. */
+	error = xfs_imeta_iget(tp, mp->m_metadirip->i_ino, XFS_DIR3_FT_DIR,
+			&dp);
+	if (error)
+		return error;
+
+	for (i = 0; i < path->im_depth - 1; i++) {
+		struct xfs_inode	*ip = NULL;
+
+		xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+		/* Look up the name in the current directory. */
+		xfs_imeta_set_xname(&xname, path, i, XFS_DIR3_FT_DIR);
+		error = xfs_imeta_dir_lookup_component(tp, dp, &xname, &ino);
+		if (error)
+			goto out_rele;
+
+		/*
+		 * Grab the child inode while we still have the parent
+		 * directory locked.
+		 */
+		error = xfs_imeta_iget(tp, ino, XFS_DIR3_FT_DIR, &ip);
+		if (error)
+			goto out_rele;
+
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		xfs_imeta_irele(dp);
+		dp = ip;
+	}
+
+out:
+	*dpp = dp;
+	return 0;
+
+out_rele:
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	xfs_imeta_irele(dp);
+	return error;
+}
+
+/*
+ * Look up a metadata inode from the metadata directory.  If the last path
+ * component doesn't exist, return NULLFSINO.  If any other part of the path
+ * does not exist, return -ENOENT so we can distinguish the two.
+ */
+STATIC int
+xfs_imeta_dir_lookup_int(
+	struct xfs_trans		*tp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop)
+{
+	struct xfs_name			xname;
+	struct xfs_inode		*dp = NULL;
+	xfs_ino_t			ino;
+	int				error;
+
+	/* metadir ino is recorded in superblock */
+	if (xfs_imeta_path_compare(path, &XFS_IMETA_METADIR))
+		return xfs_imeta_sb_lookup(tp->t_mountp, path, inop);
+
+	ASSERT(path->im_depth > 0);
+
+	/* Find the parent of the last path component. */
+	error = xfs_imeta_dir_parent(tp, path, &dp);
+	if (error)
+		return error;
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, path, path->im_depth - 1, path->im_ftype);
+	error = xfs_imeta_dir_lookup_component(tp, dp, &xname, &ino);
+	switch (error) {
+	case 0:
+		*inop = ino;
+		break;
+	case -ENOENT:
+		*inop = NULLFSINO;
+		error = 0;
+		break;
+	}
+
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	xfs_imeta_irele(dp);
+	return error;
+}
+
+/*
+ * Load all the metadata inode pointers that are cached in the in-core
+ * superblock but live somewhere in the metadata directory tree.
+ */
+STATIC int
+xfs_imeta_dir_mount(
+	struct xfs_trans		*tp)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	const struct xfs_imeta_sbmap	*p;
+	xfs_ino_t			*sb_inop;
+	int				err2;
+	int				error = 0;
+
+	for (p = xfs_imeta_sbmaps; p->path && p->path->im_depth > 0; p++) {
+		if (p->path == &XFS_IMETA_METADIR)
+			continue;
+		sb_inop = xfs_imeta_sbmap_to_inop(mp, p);
+		err2 = xfs_imeta_dir_lookup_int(tp, p->path, sb_inop);
+		if (err2 == -ENOENT) {
+			*sb_inop = NULLFSINO;
+			continue;
+		}
+		if (!error && err2)
+			error = err2;
+	}
+
+	return error;
+}
+
+/* Set up an inode to be recognized as a metadata directory inode. */
+void
+xfs_imeta_set_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	VFS_I(ip)->i_mode &= ~0777;
+	VFS_I(ip)->i_uid = GLOBAL_ROOT_UID;
+	VFS_I(ip)->i_gid = GLOBAL_ROOT_GID;
+	ip->i_projid = 0;
+	ip->i_diflags |= (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_SYNC |
+			  XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP |
+			  XFS_DIFLAG_NODEFRAG);
+	if (S_ISDIR(VFS_I(ip)->i_mode))
+		ip->i_diflags |= XFS_DIFLAG_NOSYMLINKS;
+	ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+	ip->i_diflags2 |= XFS_DIFLAG2_METADIR;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+
+/* Clear the metadata directory inode flag. */
+void
+xfs_imeta_clear_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+
+	ip->i_diflags2 &= ~XFS_DIFLAG2_METADIR;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+/*
+ * Create a new metadata inode accessible via the given metadata directory path.
+ * Callers must ensure that the directory entry does not already exist; a new
+ * one will be created.
+ */
+STATIC int
+xfs_imeta_dir_create(
+	struct xfs_imeta_update		*upd,
+	umode_t				mode)
+{
+	struct xfs_icreate_args		args = {
+		.pip			= upd->dp,
+		.nlink			= S_ISDIR(mode) ? 2 : 1,
+	};
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+
+	/* metadir ino is recorded in superblock; only mkfs gets to do this */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		error = xfs_imeta_sb_create(upd, mode);
+		if (error)
+			return error;
+
+		/* Set the metadata iflag, initialize directory. */
+		xfs_imeta_set_iflag(upd->tp, upd->ip);
+		return xfs_dir_init(upd->tp, upd->ip, upd->ip);
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	xfs_icreate_args_rootfile(&args, mp, mode, xfs_has_parent(mp));
+
+	/* Check that the name does not already exist in the directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			XFS_DIR3_FT_UNKNOWN);
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dialloc(&upd->tp, upd->dp->i_ino, mode, &ino);
+	if (error)
+		return error;
+	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+	if (error)
+		return error;
+	du.ip = upd->ip;
+	xfs_imeta_set_iflag(upd->tp, upd->ip);
+	upd->ip_locked = true;
+
+	/*
+	 * Join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dialloc rolls the transaction.
+	 */
+	xfs_trans_ijoin(upd->tp, upd->dp, 0);
+
+	/* Create the entry. */
+	if (S_ISDIR(args.mode))
+		resblks = xfs_mkdir_space_res(mp, xname.len);
+	else
+		resblks = xfs_create_space_res(mp, xname.len);
+	xname.type = xfs_mode_to_ftype(args.mode);
+
+	trace_xfs_imeta_dir_try_create(upd);
+
+	error = xfs_dir_create_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	/* Metadir files are not accounted to quota. */
+
+	trace_xfs_imeta_dir_create(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = ino;
+	return 0;
+}
+
+/*
+ * Remove the given entry from the metadata directory and drop the link count
+ * of the metadata inode.
+ */
+STATIC int
+xfs_imeta_dir_unlink(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ip			= upd->ip,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	/* Metadata directory root cannot be unlinked. */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case 0:
+		if (ino != upd->ip->i_ino)
+			error = -ENOENT;
+		break;
+	case -ENOENT:
+		error = -EFSCORRUPTED;
+		break;
+	}
+	if (error)
+		return error;
+
+	resblks = xfs_remove_space_res(mp, xname.len);
+	error = xfs_dir_remove_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_dir_unlink(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = NULLFSINO;
+	return 0;
+}
+
+/* Set the given path in the metadata directory to point to an inode. */
+STATIC int
+xfs_imeta_dir_link(
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_name			xname;
+	struct xfs_dir_update		du = {
+		.dp			= upd->dp,
+		.name			= &xname,
+		.ip			= upd->ip,
+		.ppargs			= upd->ppargs,
+	};
+	struct xfs_mount		*mp = upd->mp;
+	xfs_ino_t			*sb_inop;
+	xfs_ino_t			ino;
+	unsigned int			resblks;
+	int				error;
+
+	ASSERT(xfs_isilocked(upd->dp, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(upd->ip, XFS_ILOCK_EXCL));
+
+	/* Metadata directory root cannot be linked. */
+	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	ASSERT(upd->path->im_depth > 0);
+
+	/* Look up the name in the current directory. */
+	xfs_imeta_set_xname(&xname, upd->path, upd->path->im_depth - 1,
+			xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+	error = xfs_imeta_dir_lookup_component(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	resblks = xfs_link_space_res(mp, xname.len);
+	error = xfs_dir_add_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	trace_xfs_imeta_dir_link(upd);
+
+	/* Update the in-core superblock value if there is one. */
+	sb_inop = xfs_imeta_path_to_sb_inop(mp, upd->path);
+	if (sb_inop)
+		*sb_inop = upd->ip->i_ino;
+	return 0;
+}
+
 /* General functions for managing metadata inode pointers */
 
 /*
@@ -317,7 +850,13 @@ xfs_imeta_lookup(
 
 	ASSERT(xfs_imeta_path_check(path));
 
-	error = xfs_imeta_sb_lookup(mp, path, &ino);
+	if (xfs_has_metadir(mp)) {
+		error = xfs_imeta_dir_lookup_int(tp, path, &ino);
+		if (error == -ENOENT)
+			return -EFSCORRUPTED;
+	} else {
+		error = xfs_imeta_sb_lookup(mp, path, &ino);
+	}
 	if (error)
 		return error;
 
@@ -349,13 +888,17 @@ xfs_imeta_create(
 	umode_t				mode,
 	struct xfs_inode		**ipp)
 {
+	struct xfs_mount		*mp = upd->mp;
 	int				error;
 
 	ASSERT(xfs_imeta_path_check(upd->path));
 
 	*ipp = NULL;
 
-	error = xfs_imeta_sb_create(upd, mode);
+	if (xfs_has_metadir(mp))
+		error = xfs_imeta_dir_create(upd, mode);
+	else
+		error = xfs_imeta_sb_create(upd, mode);
 	*ipp = upd->ip;
 	return error;
 }
@@ -405,7 +948,10 @@ xfs_imeta_unlink(
 	ASSERT(xfs_imeta_path_check(upd->path));
 	ASSERT(xfs_imeta_verify(upd->mp, upd->ip->i_ino));
 
-	error = xfs_imeta_sb_unlink(upd);
+	if (xfs_has_metadir(upd->mp))
+		error = xfs_imeta_dir_unlink(upd);
+	else
+		error = xfs_imeta_sb_unlink(upd);
 	if (error)
 		return error;
 
@@ -431,6 +977,8 @@ xfs_imeta_link(
 {
 	ASSERT(xfs_imeta_path_check(upd->path));
 
+	if (xfs_has_metadir(upd->mp))
+		return xfs_imeta_dir_link(upd);
 	return xfs_imeta_sb_link(upd);
 }
 
@@ -461,5 +1009,8 @@ int
 xfs_imeta_mount(
 	struct xfs_trans	*tp)
 {
+	if (xfs_has_metadir(tp->t_mountp))
+		return xfs_imeta_dir_mount(tp);
+
 	return 0;
 }
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index 60e0f6a6c13..b8e360bbdfb 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -13,6 +13,7 @@
 #define XFS_IMETA_DEFINE_PATH(name, path) \
 const struct xfs_imeta_path name = { \
 	.im_path = (path), \
+	.im_ftype = XFS_DIR3_FT_REG_FILE, \
 	.im_depth = ARRAY_SIZE(path), \
 }
 
@@ -23,6 +24,9 @@ struct xfs_imeta_path {
 
 	/* Number of strings in path. */
 	uint8_t			im_depth;
+
+	/* Expected file type. */
+	uint8_t			im_ftype;
 };
 
 /* Cleanup widget for metadata inode creation and deletion. */
@@ -32,9 +36,16 @@ struct xfs_imeta_update {
 
 	const struct xfs_imeta_path *path;
 
+	/* Parent pointer update context */
+	struct xfs_parent_args	*ppargs;
+
+	/* Parent directory */
+	struct xfs_inode	*dp;
+
 	/* Metadata inode */
 	struct xfs_inode	*ip;
 
+	unsigned int		dp_locked:1;
 	unsigned int		ip_locked:1;
 };
 
@@ -54,9 +65,15 @@ extern const struct xfs_imeta_path XFS_IMETA_RTSUMMARY;
 extern const struct xfs_imeta_path XFS_IMETA_USRQUOTA;
 extern const struct xfs_imeta_path XFS_IMETA_GRPQUOTA;
 extern const struct xfs_imeta_path XFS_IMETA_PRJQUOTA;
+extern const struct xfs_imeta_path XFS_IMETA_METADIR;
 
 int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
 		xfs_ino_t *ino);
+int xfs_imeta_dir_parent(struct xfs_trans *tp,
+		const struct xfs_imeta_path *path, struct xfs_inode **dpp);
+
+void xfs_imeta_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_imeta_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 
 int xfs_imeta_create(struct xfs_imeta_update *upd, umode_t mode,
 		struct xfs_inode **ipp);
diff --git a/libxfs/xfs_inode_util.c b/libxfs/xfs_inode_util.c
index b63aacb6971..c2e51a6ee94 100644
--- a/libxfs/xfs_inode_util.c
+++ b/libxfs/xfs_inode_util.c
@@ -19,6 +19,7 @@
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
 #include "xfs_ag.h"
+#include "xfs_imeta.h"
 #include "iunlink.h"
 
 uint16_t
@@ -646,6 +647,8 @@ xfs_droplink(
 	if (inode->i_nlink)
 		return 0;
 
+	if (xfs_is_metadir_inode(ip))
+		xfs_imeta_clear_iflag(tp, ip);
 	return xfs_iunlink(tp, ip);
 }
 
diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c
index a46360e2bdb..74f46539179 100644
--- a/libxfs/xfs_trans_resv.c
+++ b/libxfs/xfs_trans_resv.c
@@ -1118,7 +1118,10 @@ xfs_calc_imeta_create_resv(
 	unsigned int		ret;
 
 	ret = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-	ret += resp->tr_create.tr_logres;
+	if (xfs_has_metadir(mp))
+		ret += max(resp->tr_create.tr_logres, resp->tr_mkdir.tr_logres);
+	else
+		ret += resp->tr_create.tr_logres;
 	return ret;
 }
 
@@ -1128,6 +1131,9 @@ xfs_calc_imeta_create_count(
 	struct xfs_mount	*mp,
 	struct xfs_trans_resv	*resp)
 {
+	if (xfs_has_metadir(mp))
+		return max(resp->tr_create.tr_logcount,
+			   resp->tr_mkdir.tr_logcount);
 	return resp->tr_create.tr_logcount;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/58] xfs: ensure metadata directory paths exist before creating files
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 23:33   ` [PATCH 14/58] xfs: read and write metadata inode directory Darrick J. Wong
@ 2023-12-31 23:33   ` Darrick J. Wong
  2023-12-31 23:33   ` [PATCH 16/58] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
                     ` (42 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Since xfs_imeta_create can create new metadata files arbitrarily deep in
the metadata directory tree, we must supply a function that can ensure
that all directories in a path exist, and call it before the quota
functions create the quota inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/imeta_utils.c     |   69 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/imeta_utils.h     |    3 ++
 libxfs/libxfs_api_defs.h |    1 +
 mkfs/proto.c             |    8 +++++
 4 files changed, 81 insertions(+)


diff --git a/libxfs/imeta_utils.c b/libxfs/imeta_utils.c
index ce6000530d3..0186968ed3e 100644
--- a/libxfs/imeta_utils.c
+++ b/libxfs/imeta_utils.c
@@ -246,3 +246,72 @@ xfs_imeta_cancel_update(
 
 	xfs_imeta_teardown(upd, error);
 }
+
+/* Create a metadata for the last component of the path. */
+STATIC int
+xfs_imeta_mkdir(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_update		upd;
+	struct xfs_inode		*ip = NULL;
+	int				error;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	/* Allocate a transaction to create the last directory. */
+	error = xfs_imeta_start_create(mp, path, &upd);
+	if (error)
+		return error;
+
+	/* Create the subdirectory and take our reference. */
+	error = xfs_imeta_create(&upd, S_IFDIR, &ip);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_imeta_commit_update(&upd);
+
+	/*
+	 * We don't pass the directory we just created to the caller, so finish
+	 * setting up the inode, then release the dir and the dquots.
+	 */
+	goto out_irele;
+
+out_cancel:
+	xfs_imeta_cancel_update(&upd, error);
+out_irele:
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (ip)
+		xfs_irele(ip);
+	return error;
+}
+
+/*
+ * Make sure that every metadata directory path component exists and is a
+ * directory.
+ */
+int
+xfs_imeta_ensure_dirpath(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_path		temp_path = {
+		.im_path		= path->im_path,
+		.im_depth		= 1,
+		.im_ftype		= XFS_DIR3_FT_DIR,
+	};
+	unsigned int			i;
+	int				error = 0;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	for (i = 0; i < path->im_depth - 1; i++, temp_path.im_depth++) {
+		error = xfs_imeta_mkdir(mp, &temp_path);
+		if (error && error != -EEXIST)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/libxfs/imeta_utils.h b/libxfs/imeta_utils.h
index a2afc0f37ac..8ab81149b05 100644
--- a/libxfs/imeta_utils.h
+++ b/libxfs/imeta_utils.h
@@ -18,6 +18,9 @@ int xfs_imeta_start_unlink(struct xfs_mount *mp,
 		const struct xfs_imeta_path *path,
 		struct xfs_inode *ip, struct xfs_imeta_update *upd);
 
+int xfs_imeta_ensure_dirpath(struct xfs_mount *mp,
+		const struct xfs_imeta_path *path);
+
 int xfs_imeta_commit_update(struct xfs_imeta_update *upd);
 void xfs_imeta_cancel_update(struct xfs_imeta_update *upd, int error);
 
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 90304da8983..e08be764a49 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -178,6 +178,7 @@
 #define xfs_imeta_commit_update		libxfs_imeta_commit_update
 #define xfs_imeta_create		libxfs_imeta_create
 #define xfs_imeta_create_space_res	libxfs_imeta_create_space_res
+#define xfs_imeta_ensure_dirpath	libxfs_imeta_ensure_dirpath
 #define xfs_imeta_iget			libxfs_imeta_iget
 #define xfs_imeta_irele			libxfs_imeta_irele
 #define xfs_imeta_link			libxfs_imeta_link
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 2eff1f32173..5e17ea420f4 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -754,6 +754,10 @@ rtbitmap_create(
 	struct xfs_inode	*rbmip;
 	int			error;
 
+	error = -libxfs_imeta_ensure_dirpath(mp, &XFS_IMETA_RTBITMAP);
+	if (error)
+		fail(_("Realtime bitmap directory allocation failed"), error);
+
 	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTBITMAP, &upd);
 	if (error)
 		res_failed(error);
@@ -783,6 +787,10 @@ rtsummary_create(
 	struct xfs_inode	*rsumip;
 	int			error;
 
+	error = -libxfs_imeta_ensure_dirpath(mp, &XFS_IMETA_RTSUMMARY);
+	if (error)
+		fail(_("Realtime summary directory allocation failed"), error);
+
 	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_RTSUMMARY, &upd);
 	if (error)
 		res_failed(error);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/58] xfs: disable the agi rotor for metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 23:33   ` [PATCH 15/58] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
@ 2023-12-31 23:33   ` Darrick J. Wong
  2023-12-31 23:34   ` [PATCH 17/58] xfs: advertise metadata directory feature Darrick J. Wong
                     ` (41 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:33 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Ideally, we'd put all the metadata inodes in one place if we could, so
that the metadata all stay reasonably close together instead of
spreading out over the disk.  Furthermore, if the log is internal we'd
probably prefer to keep the metadata near the log.  Therefore, disable
AGI rotoring for metadata inode allocations.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/iunlink.c        |    2 +-
 libxfs/xfs_ialloc.c |   56 +++++++++++++++++++++++++++++++++++----------------
 libxfs/xfs_ialloc.h |    2 +-
 libxfs/xfs_imeta.c  |    4 ++--
 mkfs/proto.c        |    3 +--
 repair/phase6.c     |    2 +-
 6 files changed, 44 insertions(+), 25 deletions(-)


diff --git a/db/iunlink.c b/db/iunlink.c
index c87b98431e5..fd5ed64c9e2 100644
--- a/db/iunlink.c
+++ b/db/iunlink.c
@@ -221,7 +221,7 @@ create_unlinked(
 		return error;
 	}
 
-	error = -libxfs_dialloc(&tp, 0, args.mode, &ino);
+	error = -libxfs_dialloc(&tp, args.pip, args.mode, &ino);
 	if (error) {
 		dbprintf(_("alloc inode: %s\n"), strerror(error));
 		goto out_cancel;
diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c
index 2c941603986..19543f76994 100644
--- a/libxfs/xfs_ialloc.c
+++ b/libxfs/xfs_ialloc.c
@@ -1794,6 +1794,37 @@ xfs_dialloc_try_ag(
 	return error;
 }
 
+/*
+ * Pick an AG for the new inode.
+ *
+ * Directories, symlinks, and regular files frequently allocate at least one
+ * block, so factor that potential expansion when we examine whether an AG has
+ * enough space for file creation.  Try to keep metadata files all in the same
+ * AG.
+ */
+static inline xfs_agnumber_t
+xfs_dialloc_pick_ag(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
+	umode_t			mode)
+{
+	xfs_agnumber_t		start_agno;
+
+	if (!dp)
+		return 0;
+	if (xfs_is_metadir_inode(dp))
+		return 0;
+
+	if (S_ISDIR(mode))
+		return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
+
+	start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
+	if (start_agno >= mp->m_maxagi)
+		start_agno = 0;
+
+	return start_agno;
+}
+
 /*
  * Allocate an on-disk inode.
  *
@@ -1805,34 +1836,23 @@ xfs_dialloc_try_ag(
 int
 xfs_dialloc(
 	struct xfs_trans	**tpp,
-	xfs_ino_t		parent,
+	struct xfs_inode	*dp,
 	umode_t			mode,
 	xfs_ino_t		*new_ino)
 {
 	struct xfs_mount	*mp = (*tpp)->t_mountp;
-	xfs_agnumber_t		agno;
-	int			error = 0;
-	xfs_agnumber_t		start_agno;
 	struct xfs_perag	*pag;
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	xfs_ino_t		ino = NULLFSINO;
+	xfs_ino_t		parent = dp ? dp->i_ino : 0;
+	xfs_agnumber_t		agno;
+	xfs_agnumber_t		start_agno;
 	bool			ok_alloc = true;
 	bool			low_space = false;
 	int			flags;
-	xfs_ino_t		ino = NULLFSINO;
+	int			error = 0;
 
-	/*
-	 * Directories, symlinks, and regular files frequently allocate at least
-	 * one block, so factor that potential expansion when we examine whether
-	 * an AG has enough space for file creation.
-	 */
-	if (S_ISDIR(mode))
-		start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
-				mp->m_maxagi;
-	else {
-		start_agno = XFS_INO_TO_AGNO(mp, parent);
-		if (start_agno >= mp->m_maxagi)
-			start_agno = 0;
-	}
+	start_agno = xfs_dialloc_pick_ag(mp, dp, mode);
 
 	/*
 	 * If we have already hit the ceiling of inode blocks then clear
diff --git a/libxfs/xfs_ialloc.h b/libxfs/xfs_ialloc.h
index f1412183bb4..9bfe2d8d84b 100644
--- a/libxfs/xfs_ialloc.h
+++ b/libxfs/xfs_ialloc.h
@@ -37,7 +37,7 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
  * Allocate an inode on disk.  Mode is used to tell whether the new inode will
  * need space, and whether it is a directory.
  */
-int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+int xfs_dialloc(struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode,
 		xfs_ino_t *new_ino);
 
 int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index b1c5c6ec5e6..2defee9562b 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -229,7 +229,7 @@ xfs_imeta_sb_create(
 		return -EEXIST;
 
 	/* Create a new inode and set the sb pointer. */
-	error = xfs_dialloc(&upd->tp, 0, mode, &ino);
+	error = xfs_dialloc(&upd->tp, NULL, mode, &ino);
 	if (error)
 		return error;
 	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
@@ -661,7 +661,7 @@ xfs_imeta_dir_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dialloc(&upd->tp, upd->dp->i_ino, mode, &ino);
+	error = xfs_dialloc(&upd->tp, upd->dp, mode, &ino);
 	if (error)
 		return error;
 	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 5e17ea420f4..0103fe54a5d 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -431,7 +431,6 @@ creatproto(
 				  XFS_ICREATE_ARGS_FORCE_MODE,
 	};
 	struct xfs_inode	*ip;
-	xfs_ino_t		parent_ino = dp ? dp->i_ino : 0;
 	xfs_ino_t		ino;
 	int			error;
 
@@ -442,7 +441,7 @@ creatproto(
 	 * Call the space management code to pick the on-disk inode to be
 	 * allocated.
 	 */
-	error = -libxfs_dialloc(tpp, parent_ino, mode, &ino);
+	error = -libxfs_dialloc(tpp, dp, mode, &ino);
 	if (error)
 		return error;
 
diff --git a/repair/phase6.c b/repair/phase6.c
index 6a3c5e2a37a..fe9a4da62dc 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -870,7 +870,7 @@ mk_orphanage(
 	if (i)
 		res_failed(i);
 
-	error = -libxfs_dialloc(&tp, mp->m_sb.sb_rootino, mode, &ino);
+	error = -libxfs_dialloc(&tp, du.dp, mode, &ino);
 	if (error)
 		do_error(_("%s inode allocation failed %d\n"),
 			ORPHANAGE, error);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/58] xfs: advertise metadata directory feature
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 23:33   ` [PATCH 16/58] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
@ 2023-12-31 23:34   ` Darrick J. Wong
  2023-12-31 23:34   ` [PATCH 18/58] xfs: allow bulkstat to return metadata directories Darrick J. Wong
                     ` (40 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Advertise the existence of the metadata directory feature; this will be
used by scrub to decide if it needs to scan the metadir too.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h                 |    1 +
 libxfs/xfs_sb.c                 |    2 ++
 man/man2/ioctl_xfs_fsgeometry.2 |    3 +++
 3 files changed, 6 insertions(+)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 77fbca573e1..2e31fc2240e 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_BIGTIME	(1 << 21) /* 64-bit nsec timestamps */
 #define XFS_FSOP_GEOM_FLAGS_INOBTCNT	(1 << 22) /* inobt btree counter */
 #define XFS_FSOP_GEOM_FLAGS_NREXT64	(1 << 23) /* large extent counters */
+#define XFS_FSOP_GEOM_FLAGS_METADIR	(1U << 29) /* metadata directories */
 #define XFS_FSOP_GEOM_FLAGS_PARENT	(1U << 30) /* parent pointers */
 
 /* atomic file extent swap available to userspace */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index b74a170605d..719da346d36 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -1296,6 +1296,8 @@ xfs_fs_geometry(
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
 	if (xfs_atomic_swap_supported(mp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_ATOMIC_SWAP;
+	if (xfs_has_metadir(mp))
+		geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
 	geo->rtsectsize = sbp->sb_blocksize;
 	geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
 
diff --git a/man/man2/ioctl_xfs_fsgeometry.2 b/man/man2/ioctl_xfs_fsgeometry.2
index 4c7ff9a270b..9c0ad165cbe 100644
--- a/man/man2/ioctl_xfs_fsgeometry.2
+++ b/man/man2/ioctl_xfs_fsgeometry.2
@@ -214,6 +214,9 @@ Filesystem supports sharing blocks between files.
 .TP
 .B XFS_FSOP_GEOM_FLAGS_ATOMICSWAP
 Filesystem can exchange file contents atomically via XFS_IOC_EXCHANGE_RANGE.
+.TP
+.B XFS_FSOP_GEOM_FLAGS_METADIR
+Filesystem contains a metadata directory tree.
 .RE
 .SH XFS METADATA HEALTH REPORTING
 .PP


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/58] xfs: allow bulkstat to return metadata directories
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 23:34   ` [PATCH 17/58] xfs: advertise metadata directory feature Darrick J. Wong
@ 2023-12-31 23:34   ` Darrick J. Wong
  2023-12-31 23:34   ` [PATCH 19/58] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
                     ` (39 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the V5 bulkstat ioctl to return information about metadata
directory files so that xfs_scrub can find and scrub them, since they
are otherwise ordinary directories.

(Metadata files of course require per-file scrub code and hence do not
need exposure.)

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h |   10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 2e31fc2240e..ab961a01181 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -491,9 +491,17 @@ struct xfs_bulk_ireq {
  */
 #define XFS_BULK_IREQ_NREXT64	(1U << 2)
 
+/*
+ * Allow bulkstat to return information about metadata directories.  This
+ * enables xfs_scrub to find them for scanning, as they are otherwise ordinary
+ * directories.
+ */
+#define XFS_BULK_IREQ_METADIR	(1U << 31)
+
 #define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
 				 XFS_BULK_IREQ_SPECIAL | \
-				 XFS_BULK_IREQ_NREXT64)
+				 XFS_BULK_IREQ_NREXT64 | \
+				 XFS_BULK_IREQ_METADIR)
 
 /* Operate on the root directory inode. */
 #define XFS_BULK_IREQ_SPECIAL_ROOT	(1)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/58] xfs: enable creation of dynamically allocated metadir path structures
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 23:34   ` [PATCH 18/58] xfs: allow bulkstat to return metadata directories Darrick J. Wong
@ 2023-12-31 23:34   ` Darrick J. Wong
  2023-12-31 23:34   ` [PATCH 20/58] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
                     ` (38 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a few helper functions so that it's possible to allocate
xfs_imeta_path objects dynamically, along with dynamically allocated
path components.  Eventually we're going to want to support paths of the
form "/realtime/$rtgroup.rmap", and this is necessary for that.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/kmem.h           |    5 ++++-
 libxfs/libxfs_api_defs.h |    2 ++
 libxfs/xfs_imeta.c       |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_imeta.h       |   15 +++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)


diff --git a/include/kmem.h b/include/kmem.h
index 8ae919c7066..c71e311cabc 100644
--- a/include/kmem.h
+++ b/include/kmem.h
@@ -26,7 +26,7 @@ typedef unsigned int __bitwise gfp_t;
 #define __GFP_NOFAIL	((__force gfp_t)0)
 #define __GFP_NOLOCKDEP	((__force gfp_t)0)
 
-#define __GFP_ZERO	(__force gfp_t)1
+#define __GFP_ZERO	((__force gfp_t)1)
 
 struct kmem_cache * kmem_cache_create(const char *name, unsigned int size,
 		unsigned int align, unsigned int slab_flags,
@@ -65,6 +65,9 @@ static inline void *kmalloc(size_t size, gfp_t flags)
 	return kvmalloc(size, flags);
 }
 
+#define kvcalloc(nr, size, gfp)	kvmalloc((nr) * (size), (gfp) | __GFP_ZERO)
+#define kzalloc(size, gfp)	kvmalloc((size), (gfp) | __GFP_ZERO)
+
 static inline void kfree(const void *ptr)
 {
 	return kmem_free(ptr);
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index e08be764a49..ca8e231c0fd 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -177,8 +177,10 @@
 #define xfs_imeta_cancel_update		libxfs_imeta_cancel_update
 #define xfs_imeta_commit_update		libxfs_imeta_commit_update
 #define xfs_imeta_create		libxfs_imeta_create
+#define xfs_imeta_create_file_path	libxfs_imeta_create_file_path
 #define xfs_imeta_create_space_res	libxfs_imeta_create_space_res
 #define xfs_imeta_ensure_dirpath	libxfs_imeta_ensure_dirpath
+#define xfs_imeta_free_path		libxfs_imeta_free_path
 #define xfs_imeta_iget			libxfs_imeta_iget
 #define xfs_imeta_irele			libxfs_imeta_irele
 #define xfs_imeta_link			libxfs_imeta_link
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index 2defee9562b..ad429c82b47 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -1014,3 +1014,49 @@ xfs_imeta_mount(
 
 	return 0;
 }
+
+/* Create a path to a file within the metadata directory tree. */
+int
+xfs_imeta_create_file_path(
+	struct xfs_mount	*mp,
+	unsigned int		nr_components,
+	struct xfs_imeta_path	**pathp)
+{
+	struct xfs_imeta_path	*p;
+	unsigned char		**components;
+
+	p = kzalloc(sizeof(struct xfs_imeta_path), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	components = kvcalloc(nr_components, sizeof(unsigned char *),
+			GFP_KERNEL);
+	if (!components) {
+		kfree(p);
+		return -ENOMEM;
+	}
+
+	p->im_depth = nr_components;
+	p->im_path = (const unsigned char **)components;
+	p->im_ftype = XFS_DIR3_FT_REG_FILE;
+	*pathp = p;
+	return 0;
+}
+
+/* Free a metadata directory tree path. */
+void
+xfs_imeta_free_path(
+	const struct xfs_imeta_path	*path)
+{
+	unsigned int			i;
+
+	if (path->im_flags & XFS_IMETA_PATH_STATIC)
+		return;
+
+	for (i = 0; i < path->im_depth; i++) {
+		if ((path->im_dynamicmask & (1ULL << i)) && path->im_path[i])
+			kfree(path->im_path[i]);
+	}
+	kfree(path->im_path);
+	kfree(path);
+}
diff --git a/libxfs/xfs_imeta.h b/libxfs/xfs_imeta.h
index b8e360bbdfb..3b5953efc01 100644
--- a/libxfs/xfs_imeta.h
+++ b/libxfs/xfs_imeta.h
@@ -15,6 +15,8 @@ const struct xfs_imeta_path name = { \
 	.im_path = (path), \
 	.im_ftype = XFS_DIR3_FT_REG_FILE, \
 	.im_depth = ARRAY_SIZE(path), \
+	.im_flags = XFS_IMETA_PATH_STATIC, \
+	.im_dynamicmask = 0, \
 }
 
 /* Key for looking up metadata inodes. */
@@ -22,6 +24,12 @@ struct xfs_imeta_path {
 	/* Array of string pointers. */
 	const unsigned char	**im_path;
 
+	/* Each bit corresponds to an element of im_path needing to be freed */
+	unsigned long long	im_dynamicmask;
+
+	/* XFS_IMETA_* path flags */
+	uint16_t		im_flags;
+
 	/* Number of strings in path. */
 	uint8_t			im_depth;
 
@@ -29,6 +37,9 @@ struct xfs_imeta_path {
 	uint8_t			im_ftype;
 };
 
+/* Path is statically allocated. */
+#define XFS_IMETA_PATH_STATIC	(1U << 0)
+
 /* Cleanup widget for metadata inode creation and deletion. */
 struct xfs_imeta_update {
 	struct xfs_mount	*mp;
@@ -72,6 +83,10 @@ int xfs_imeta_lookup(struct xfs_trans *tp, const struct xfs_imeta_path *path,
 int xfs_imeta_dir_parent(struct xfs_trans *tp,
 		const struct xfs_imeta_path *path, struct xfs_inode **dpp);
 
+int xfs_imeta_create_file_path(struct xfs_mount *mp,
+		unsigned int nr_components, struct xfs_imeta_path **pathp);
+void xfs_imeta_free_path(const struct xfs_imeta_path *path);
+
 void xfs_imeta_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 void xfs_imeta_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/58] xfs: adjust xfs_bmap_add_attrfork for metadir
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 23:34   ` [PATCH 19/58] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
@ 2023-12-31 23:34   ` Darrick J. Wong
  2023-12-31 23:35   ` [PATCH 21/58] xfs: record health problems with the metadata directory Darrick J. Wong
                     ` (37 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:34 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Online repair might use the xfs_bmap_add_attrfork to repair a file in
the metadata directory tree if (say) the metadata file lacks the correct
parent pointers.  In that case, it is not correct to check that the file
is dqattached -- metadata files must be not have /any/ dquot attached at
all.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_attr.c |    5 ++++-
 libxfs/xfs_bmap.c |    5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_attr.c b/libxfs/xfs_attr.c
index 5da0ac9f706..aa391f4bae5 100644
--- a/libxfs/xfs_attr.c
+++ b/libxfs/xfs_attr.c
@@ -944,7 +944,10 @@ xfs_attr_add_fork(
 	unsigned int		blks;		/* space reservation */
 	int			error;		/* error return value */
 
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	if (xfs_is_metadir_inode(ip))
+		ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	else
+		ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
 	blks = XFS_ADDAFORK_SPACE_RES(mp);
 
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index ed50bb90e6a..0dfb37073a7 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -1018,7 +1018,10 @@ xfs_bmap_add_attrfork(
 	int			error;		/* error return value */
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	if (xfs_is_metadir_inode(ip))
+		ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+	else
+		ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 	ASSERT(!xfs_inode_has_attr_fork(ip));
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/58] xfs: record health problems with the metadata directory
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 23:34   ` [PATCH 20/58] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
@ 2023-12-31 23:35   ` Darrick J. Wong
  2023-12-31 23:35   ` [PATCH 22/58] xfs: check metadata directory file path connectivity Darrick J. Wong
                     ` (36 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make a report to the health monitoring subsystem any time we encounter
something in the metadata directory tree that looks like corruption.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/imeta_utils.c |    5 ++++-
 libxfs/xfs_fs.h      |    1 +
 libxfs/xfs_health.h  |    4 +++-
 libxfs/xfs_imeta.c   |   24 +++++++++++++++++++-----
 4 files changed, 27 insertions(+), 7 deletions(-)


diff --git a/libxfs/imeta_utils.c b/libxfs/imeta_utils.c
index 0186968ed3e..3b6e891d2a6 100644
--- a/libxfs/imeta_utils.c
+++ b/libxfs/imeta_utils.c
@@ -21,6 +21,7 @@
 #include "xfs_imeta.h"
 #include "xfs_trace.h"
 #include "xfs_parent.h"
+#include "xfs_health.h"
 #include "imeta_utils.h"
 
 /* Initialize a metadata update structure. */
@@ -51,8 +52,10 @@ xfs_imeta_init(
 		return error;
 	error = xfs_imeta_dir_parent(tp, upd->path, &upd->dp);
 	xfs_trans_cancel(tp);
-	if (error == -ENOENT)
+	if (error == -ENOENT) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 	if (error)
 		return error;
 
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index ab961a01181..952e4fc93c4 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -197,6 +197,7 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_RT_SUMMARY	(1 << 5)  /* realtime summary */
 #define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
 #define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
+#define XFS_FSOP_GEOM_SICK_METADIR	(1 << 8)  /* metadata directory */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index bca1990f71d..d9b9968607f 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -60,6 +60,7 @@ struct xfs_da_args;
 #define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
 #define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
 #define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
+#define XFS_SICK_FS_METADIR	(1 << 6)  /* metadata directory tree */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -103,7 +104,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_GQUOTA | \
 				 XFS_SICK_FS_PQUOTA | \
 				 XFS_SICK_FS_QUOTACHECK | \
-				 XFS_SICK_FS_NLINKS)
+				 XFS_SICK_FS_NLINKS | \
+				 XFS_SICK_FS_METADIR)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
diff --git a/libxfs/xfs_imeta.c b/libxfs/xfs_imeta.c
index ad429c82b47..6ada36d5559 100644
--- a/libxfs/xfs_imeta.c
+++ b/libxfs/xfs_imeta.c
@@ -25,6 +25,7 @@
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
+#include "xfs_health.h"
 
 /*
  * Metadata File Management
@@ -405,16 +406,22 @@ xfs_imeta_dir_lookup_component(
 	int			type_wanted = xname->type;
 	int			error;
 
-	if (!S_ISDIR(VFS_I(dp)->i_mode))
+	if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_imeta_dir_lookup(tp, dp, xname, ino);
 	if (error)
 		return error;
-	if (!xfs_verify_ino(dp->i_mount, *ino))
+	if (!xfs_verify_ino(dp->i_mount, *ino)) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
-	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted)
+	}
+	if (type_wanted != XFS_DIR3_FT_UNKNOWN && xname->type != type_wanted) {
+		xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	trace_xfs_imeta_dir_lookup(dp, xname, *ino);
 	return 0;
@@ -728,6 +735,7 @@ xfs_imeta_dir_unlink(
 	/* Metadata directory root cannot be unlinked. */
 	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
 		ASSERT(0);
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
 	}
 
@@ -743,6 +751,7 @@ xfs_imeta_dir_unlink(
 			error = -ENOENT;
 		break;
 	case -ENOENT:
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		error = -EFSCORRUPTED;
 		break;
 	}
@@ -787,6 +796,7 @@ xfs_imeta_dir_link(
 	/* Metadata directory root cannot be linked. */
 	if (xfs_imeta_path_compare(upd->path, &XFS_IMETA_METADIR)) {
 		ASSERT(0);
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
 	}
 
@@ -852,16 +862,20 @@ xfs_imeta_lookup(
 
 	if (xfs_has_metadir(mp)) {
 		error = xfs_imeta_dir_lookup_int(tp, path, &ino);
-		if (error == -ENOENT)
+		if (error == -ENOENT) {
+			xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 			return -EFSCORRUPTED;
+		}
 	} else {
 		error = xfs_imeta_sb_lookup(mp, path, &ino);
 	}
 	if (error)
 		return error;
 
-	if (!xfs_imeta_verify(mp, ino))
+	if (!xfs_imeta_verify(mp, ino)) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
 		return -EFSCORRUPTED;
+	}
 
 	*inop = ino;
 	return 0;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/58] xfs: check metadata directory file path connectivity
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 23:35   ` [PATCH 21/58] xfs: record health problems with the metadata directory Darrick J. Wong
@ 2023-12-31 23:35   ` Darrick J. Wong
  2023-12-31 23:35   ` [PATCH 23/58] libfrog: report metadata directories in the geometry report Darrick J. Wong
                     ` (35 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new scrubber type that checks that well known metadata
directory paths are connected to the metadata inode that the incore
structures think is in use.  IOWs, check that "/quota/user" in the
metadata directory tree actually points to
mp->m_quotainfo->qi_uquotaip->i_ino.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h                     |   17 +++++++++++++++-
 libxfs/xfs_health.h                 |    4 +++-
 man/man2/ioctl_xfs_scrub_metadata.2 |   38 +++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 952e4fc93c4..a0efcbde5ae 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -198,6 +198,7 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
 #define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
 #define XFS_FSOP_GEOM_SICK_METADIR	(1 << 8)  /* metadata directory */
+#define XFS_FSOP_GEOM_SICK_METAPATH	(1 << 9)  /* metadir tree path */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
@@ -731,9 +732,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_NLINKS	26	/* inode link counts */
 #define XFS_SCRUB_TYPE_HEALTHY	27	/* everything checked out ok */
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
+#define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	29
+#define XFS_SCRUB_TYPE_NR	30
 
 /*
  * This special type code only applies to the vectored scrub implementation.
@@ -788,6 +790,19 @@ struct xfs_scrub_metadata {
 				 XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
 #define XFS_SCRUB_FLAGS_ALL	(XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
 
+/*
+ * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for
+ * path checking.
+ */
+#define XFS_SCRUB_METAPATH_RTBITMAP	0
+#define XFS_SCRUB_METAPATH_RTSUMMARY	1
+#define XFS_SCRUB_METAPATH_USRQUOTA	2
+#define XFS_SCRUB_METAPATH_GRPQUOTA	3
+#define XFS_SCRUB_METAPATH_PRJQUOTA	4
+
+/* Number of metapath sm_ino values */
+#define XFS_SCRUB_METAPATH_NR		5
+
 /*
  * ioctl limits
  */
diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index d9b9968607f..1816c67351a 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -61,6 +61,7 @@ struct xfs_da_args;
 #define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
 #define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
 #define XFS_SICK_FS_METADIR	(1 << 6)  /* metadata directory tree */
+#define XFS_SICK_FS_METAPATH	(1 << 7)  /* metadata directory tree path */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -105,7 +106,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_PQUOTA | \
 				 XFS_SICK_FS_QUOTACHECK | \
 				 XFS_SICK_FS_NLINKS | \
-				 XFS_SICK_FS_METADIR)
+				 XFS_SICK_FS_METADIR | \
+				 XFS_SICK_FS_METAPATH)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
diff --git a/man/man2/ioctl_xfs_scrub_metadata.2 b/man/man2/ioctl_xfs_scrub_metadata.2
index 44aa139b297..b1db740560d 100644
--- a/man/man2/ioctl_xfs_scrub_metadata.2
+++ b/man/man2/ioctl_xfs_scrub_metadata.2
@@ -200,6 +200,44 @@ Scan all inodes in the filesystem to verify each file's link count.
 Mark everything healthy after a clean scrub run.
 This clears out all the indirect health problem markers that might remain
 in the system.
+
+.TP
+.B XFS_SCRUB_TYPE_METAPATH
+Check that a metadata directory path actually points to the active metadata
+inode.
+Metadata inodes are usually cached for the duration of the mount, so this
+scrubber ensures that the same inode will still be reachable after an unmount
+and mount cycle.
+Discrepancies can happen if the directory or parent pointer scrubbers rebuild
+a metadata directory but lose a link in the process.
+The
+.B sm_ino
+field should be passed one of the following special values to communicate which
+path to check:
+
+.RS 7
+.TP
+.B XFS_SCRUB_METAPATH_RTBITMAP
+Realtime bitmap file.
+.TP
+.B XFS_SCRUB_METAPATH_RTSUMMARY
+Realtime summary file.
+.TP
+.B XFS_SCRUB_METAPATH_USRQUOTA
+User quota file.
+.TP
+.B XFS_SCRUB_METAPATH_GRPQUOTA
+Group quota file.
+.TP
+.B XFS_SCRUB_METAPATH_PRJQUOTA
+Project quota file.
+.RE
+
+The values of
+.I sm_agno
+and
+.I sm_gen
+must be zero.
 .RE
 
 .PD 1


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/58] libfrog: report metadata directories in the geometry report
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 23:35   ` [PATCH 22/58] xfs: check metadata directory file path connectivity Darrick J. Wong
@ 2023-12-31 23:35   ` Darrick J. Wong
  2023-12-31 23:35   ` [PATCH 24/58] libfrog: allow METADIR in xfrog_bulkstat_single5 Darrick J. Wong
                     ` (34 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report the presence of a metadata directory tree in the geometry report.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/fsgeom.c |    4 ++++
 1 file changed, 4 insertions(+)


diff --git a/libfrog/fsgeom.c b/libfrog/fsgeom.c
index 061995fa2c7..e2732fb7a09 100644
--- a/libfrog/fsgeom.c
+++ b/libfrog/fsgeom.c
@@ -32,6 +32,7 @@ xfs_report_geom(
 	int			inobtcount;
 	int			nrext64;
 	int			parent;
+	int			metadir;
 
 	isint = geo->logstart > 0;
 	lazycount = geo->flags & XFS_FSOP_GEOM_FLAGS_LAZYSB ? 1 : 0;
@@ -51,12 +52,14 @@ xfs_report_geom(
 	inobtcount = geo->flags & XFS_FSOP_GEOM_FLAGS_INOBTCNT ? 1 : 0;
 	nrext64 = geo->flags & XFS_FSOP_GEOM_FLAGS_NREXT64 ? 1 : 0;
 	parent = geo->flags & XFS_FSOP_GEOM_FLAGS_PARENT ? 1 : 0;
+	metadir = geo->flags & XFS_FSOP_GEOM_FLAGS_METADIR ? 1 : 0;
 
 	printf(_(
 "meta-data=%-22s isize=%-6d agcount=%u, agsize=%u blks\n"
 "         =%-22s sectsz=%-5u attr=%u, projid32bit=%u\n"
 "         =%-22s crc=%-8u finobt=%u, sparse=%u, rmapbt=%u\n"
 "         =%-22s reflink=%-4u bigtime=%u inobtcount=%u nrext64=%u\n"
+"         =%-22s metadir=%u\n"
 "data     =%-22s bsize=%-6u blocks=%llu, imaxpct=%u\n"
 "         =%-22s sunit=%-6u swidth=%u blks\n"
 "naming   =version %-14u bsize=%-6u ascii-ci=%d, ftype=%d, parent=%d\n"
@@ -67,6 +70,7 @@ xfs_report_geom(
 		"", geo->sectsize, attrversion, projid32bit,
 		"", crcs_enabled, finobt_enabled, spinodes, rmapbt_enabled,
 		"", reflink_enabled, bigtime_enabled, inobtcount, nrext64,
+		"", metadir,
 		"", geo->blocksize, (unsigned long long)geo->datablocks,
 			geo->imaxpct,
 		"", geo->sunit, geo->swidth,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/58] libfrog: allow METADIR in xfrog_bulkstat_single5
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 23:35   ` [PATCH 23/58] libfrog: report metadata directories in the geometry report Darrick J. Wong
@ 2023-12-31 23:35   ` Darrick J. Wong
  2023-12-31 23:36   ` [PATCH 25/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
                     ` (33 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:35 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

This is a valid flag for a single-file bulkstat, so add that to the
filter.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/bulkstat.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/libfrog/bulkstat.c b/libfrog/bulkstat.c
index c863bcb6bf8..6eceef2a8fa 100644
--- a/libfrog/bulkstat.c
+++ b/libfrog/bulkstat.c
@@ -53,7 +53,8 @@ xfrog_bulkstat_single5(
 	struct xfs_bulkstat_req		*req;
 	int				ret;
 
-	if (flags & ~(XFS_BULK_IREQ_SPECIAL | XFS_BULK_IREQ_NREXT64))
+	if (flags & ~(XFS_BULK_IREQ_SPECIAL | XFS_BULK_IREQ_NREXT64 |
+		      XFS_BULK_IREQ_METADIR))
 		return -EINVAL;
 
 	if (xfd->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_NREXT64)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/58] xfs_io: support scrubbing metadata directory paths
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 23:35   ` [PATCH 24/58] libfrog: allow METADIR in xfrog_bulkstat_single5 Darrick J. Wong
@ 2023-12-31 23:36   ` Darrick J. Wong
  2023-12-31 23:36   ` [PATCH 26/58] xfs_db: basic xfs_check support for metadir Darrick J. Wong
                     ` (32 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support invoking the metadata directory path scrubber from xfs_io for
testing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c |   34 +++++++++++++++++++++++++++++++++-
 libfrog/scrub.h |    2 ++
 scrub/scrub.c   |   18 ++++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index 8264aab00ef..f48bb1a0d2b 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -154,8 +154,40 @@ const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR] = {
 		.descr	= "directory tree structure",
 		.group	= XFROG_SCRUB_GROUP_INODE,
 	},
+	[XFS_SCRUB_TYPE_METAPATH] = {
+		.name	= "metapath",
+		.descr	= "metadata directory paths",
+		.group	= XFROG_SCRUB_GROUP_METAPATH,
+	},
+};
+
+const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
+	[XFS_SCRUB_METAPATH_RTBITMAP]	= {
+		.name	= "rtbitmap",
+		.descr	= "realtime bitmap metadir path",
+		.group	= XFROG_SCRUB_GROUP_FS,
+	},
+	[XFS_SCRUB_METAPATH_RTSUMMARY]	= {
+		.name	= "rtsummary",
+		.descr	= "realtime summary metadir path",
+		.group	= XFROG_SCRUB_GROUP_FS,
+	},
+	[XFS_SCRUB_METAPATH_USRQUOTA]	= {
+		.name	= "usrquota",
+		.descr	= "user quota metadir path",
+		.group	= XFROG_SCRUB_GROUP_FS,
+	},
+	[XFS_SCRUB_METAPATH_GRPQUOTA]	= {
+		.name	= "grpquota",
+		.descr	= "group quota metadir path",
+		.group	= XFROG_SCRUB_GROUP_FS,
+	},
+	[XFS_SCRUB_METAPATH_PRJQUOTA]	= {
+		.name	= "prjquota",
+		.descr	= "project quota metadir path",
+		.group	= XFROG_SCRUB_GROUP_FS,
+	},
 };
-#undef DEP
 
 /* Invoke the scrub ioctl.  Returns zero or negative error code. */
 int
diff --git a/libfrog/scrub.h b/libfrog/scrub.h
index 43456230479..5fa0fafef56 100644
--- a/libfrog/scrub.h
+++ b/libfrog/scrub.h
@@ -15,6 +15,7 @@ enum xfrog_scrub_group {
 	XFROG_SCRUB_GROUP_INODE,	/* per-inode metadata */
 	XFROG_SCRUB_GROUP_ISCAN,	/* metadata requiring full inode scan */
 	XFROG_SCRUB_GROUP_SUMMARY,	/* summary metadata */
+	XFROG_SCRUB_GROUP_METAPATH,	/* metadata directory path */
 };
 
 /* Catalog of scrub types and names, indexed by XFS_SCRUB_TYPE_* */
@@ -25,6 +26,7 @@ struct xfrog_scrub_descr {
 };
 
 extern const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR];
+extern const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR];
 
 int xfrog_scrub_metadata(struct xfs_fd *xfd, struct xfs_scrub_metadata *meta);
 
diff --git a/scrub/scrub.c b/scrub/scrub.c
index 2ec3cbc9aac..bad1384dcfb 100644
--- a/scrub/scrub.c
+++ b/scrub/scrub.c
@@ -53,6 +53,22 @@ static const unsigned int scrub_deps[XFS_SCRUB_TYPE_NR] = {
 };
 #undef DEP
 
+static int
+format_metapath_descr(
+	char				*buf,
+	size_t				buflen,
+	struct xfs_scrub_vec_head	*vhead)
+{
+	const struct xfrog_scrub_descr	*sc;
+
+	if (vhead->svh_ino >= XFS_SCRUB_METAPATH_NR)
+		return snprintf(buf, buflen, _("unknown metadir path %llu"),
+				(unsigned long long)vhead->svh_ino);
+
+	sc = &xfrog_metapaths[vhead->svh_ino];
+	return snprintf(buf, buflen, "%s", _(sc->descr));
+}
+
 /* Describe the current state of a vectored scrub. */
 int
 format_scrubv_descr(
@@ -80,6 +96,8 @@ format_scrubv_descr(
 	case XFROG_SCRUB_GROUP_ISCAN:
 	case XFROG_SCRUB_GROUP_NONE:
 		return snprintf(buf, buflen, _("%s"), _(sc->descr));
+	case XFROG_SCRUB_GROUP_METAPATH:
+		return format_metapath_descr(buf, buflen, vhead);
 	}
 	return -1;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/58] xfs_db: basic xfs_check support for metadir
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 23:36   ` [PATCH 25/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
@ 2023-12-31 23:36   ` Darrick J. Wong
  2023-12-31 23:36   ` [PATCH 27/58] xfs_db: report metadir support for version command Darrick J. Wong
                     ` (31 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support metadata directories in xfs_check.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |   21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)


diff --git a/db/check.c b/db/check.c
index 2f2fbc7cbd8..ae527471161 100644
--- a/db/check.c
+++ b/db/check.c
@@ -2663,7 +2663,9 @@ process_dir(
 		if (!sflag || id->ilist || CHECK_BLIST(bno))
 			dbprintf(_("no .. entry for directory %lld\n"), id->ino);
 		error++;
-	} else if (parent == id->ino && id->ino != mp->m_sb.sb_rootino) {
+	} else if (parent == id->ino &&
+		   id->ino != mp->m_sb.sb_rootino &&
+		   id->ino != mp->m_sb.sb_metadirino) {
 		if (!sflag || id->ilist || CHECK_BLIST(bno))
 			dbprintf(_(". and .. same for non-root directory %lld\n"),
 				id->ino);
@@ -2673,6 +2675,11 @@ process_dir(
 			dbprintf(_("root directory %lld has .. %lld\n"), id->ino,
 				parent);
 		error++;
+	} else if (id->ino == mp->m_sb.sb_metadirino && id->ino != parent) {
+		if (!sflag || id->ilist || CHECK_BLIST(bno))
+			dbprintf(_("metadata directory %lld has .. %lld\n"),
+				id->ino, parent);
+		error++;
 	} else if (parent != NULLFSINO && id->ino != parent)
 		addparent_inode(id, parent);
 }
@@ -2916,6 +2923,9 @@ process_inode(
 		type = DBM_DIR;
 		if (dip->di_format == XFS_DINODE_FMT_LOCAL)
 			break;
+		if (xfs_has_metadir(mp) &&
+		    id->ino == mp->m_sb.sb_metadirino)
+			addlink_inode(id);
 		blkmap = blkmap_alloc(dnextents);
 		break;
 	case S_IFREG:
@@ -2924,18 +2934,21 @@ process_inode(
 		else if (id->ino == mp->m_sb.sb_rbmino) {
 			type = DBM_RTBITMAP;
 			blkmap = blkmap_alloc(dnextents);
-			addlink_inode(id);
+			if (!xfs_has_metadir(mp))
+				addlink_inode(id);
 		} else if (id->ino == mp->m_sb.sb_rsumino) {
 			type = DBM_RTSUM;
 			blkmap = blkmap_alloc(dnextents);
-			addlink_inode(id);
+			if (!xfs_has_metadir(mp))
+				addlink_inode(id);
 		}
 		else if (id->ino == mp->m_sb.sb_uquotino ||
 			 id->ino == mp->m_sb.sb_gquotino ||
 			 id->ino == mp->m_sb.sb_pquotino) {
 			type = DBM_QUOTA;
 			blkmap = blkmap_alloc(dnextents);
-			addlink_inode(id);
+			if (!xfs_has_metadir(mp))
+				addlink_inode(id);
 		}
 		else
 			type = DBM_DATA;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/58] xfs_db: report metadir support for version command
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 23:36   ` [PATCH 26/58] xfs_db: basic xfs_check support for metadir Darrick J. Wong
@ 2023-12-31 23:36   ` Darrick J. Wong
  2023-12-31 23:36   ` [PATCH 28/58] xfs_db: don't obfuscate metadata directories and attributes Darrick J. Wong
                     ` (30 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report metadir support if we have it enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/inode.c |    3 +++
 db/sb.c    |    2 ++
 2 files changed, 5 insertions(+)


diff --git a/db/inode.c b/db/inode.c
index c9b506b905d..4e2be6a1156 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -207,6 +207,9 @@ const field_t	inode_v3_flds[] = {
 	{ "nrext64", FLDT_UINT1,
 	  OI(COFF(flags2) + bitsz(uint64_t) - XFS_DIFLAG2_NREXT64_BIT - 1), C1,
 	  0, TYP_NONE },
+	{ "metadir", FLDT_UINT1,
+	  OI(COFF(flags2) + bitsz(uint64_t) - XFS_DIFLAG2_METADIR_BIT-1), C1,
+	  0, TYP_NONE },
 	{ NULL }
 };
 
diff --git a/db/sb.c b/db/sb.c
index e738065b5be..002736b02b7 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -708,6 +708,8 @@ version_string(
 		strcat(s, ",NREXT64");
 	if (xfs_has_parent(mp))
 		strcat(s, ",PARENT");
+	if (xfs_has_metadir(mp))
+		strcat(s, ",METADIR");
 	return s;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/58] xfs_db: don't obfuscate metadata directories and attributes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 23:36   ` [PATCH 27/58] xfs_db: report metadir support for version command Darrick J. Wong
@ 2023-12-31 23:36   ` Darrick J. Wong
  2023-12-31 23:37   ` [PATCH 29/58] xfs_db: support metadata directories in the path command Darrick J. Wong
                     ` (29 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:36 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Don't obfuscate the directory and attribute names of metadata inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/metadump.c |  114 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 79 insertions(+), 35 deletions(-)


diff --git a/db/metadump.c b/db/metadump.c
index f5b930d51d2..714be862231 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -1056,9 +1056,16 @@ generate_obfuscated_name(
 		free(orig_name);
 }
 
+static inline bool
+want_obfuscate_dirents(bool is_meta)
+{
+	return metadump.obfuscate && !is_meta;
+}
+
 static void
 process_sf_dir(
-	struct xfs_dinode	*dip)
+	struct xfs_dinode	*dip,
+	bool			is_meta)
 {
 	struct xfs_dir2_sf_hdr	*sfp;
 	xfs_dir2_sf_entry_t	*sfep;
@@ -1105,7 +1112,7 @@ process_sf_dir(
 					 (char *)sfp);
 		}
 
-		if (metadump.obfuscate)
+		if (want_obfuscate_dirents(is_meta))
 			generate_obfuscated_name(
 					 libxfs_dir2_sf_get_ino(mp, sfp, sfep),
 					 namelen, &sfep->name[0]);
@@ -1195,9 +1202,10 @@ want_obfuscate_pptr(
 	const void	*name,
 	unsigned int	namelen,
 	const void	*value,
-	unsigned int	valuelen)
+	unsigned int	valuelen,
+	bool		is_meta)
 {
-	if (!metadump.obfuscate)
+	if (!metadump.obfuscate || is_meta)
 		return false;
 
 	/* Ignore if parent pointers aren't enabled. */
@@ -1293,9 +1301,10 @@ want_obfuscate_attr(
 	const void	*name,
 	unsigned int	namelen,
 	const void	*value,
-	unsigned int	valuelen)
+	unsigned int	valuelen,
+	bool		is_meta)
 {
-	if (!metadump.obfuscate)
+	if (!metadump.obfuscate || is_meta)
 		return false;
 
 	/*
@@ -1310,7 +1319,8 @@ want_obfuscate_attr(
 
 static void
 process_sf_attr(
-	struct xfs_dinode		*dip)
+	struct xfs_dinode		*dip,
+	bool				is_meta)
 {
 	/*
 	 * with extended attributes, obfuscate the names and fill the actual
@@ -1357,11 +1367,11 @@ process_sf_attr(
 		name = &asfep->nameval[0];
 		value = &asfep->nameval[asfep->namelen];
 
-		if (want_obfuscate_pptr(asfep->flags, name, namelen, value,
-					asfep->valuelen)) {
+		if (want_obfuscate_pptr(asfep->flags, name, namelen,
+					value, asfep->valuelen, is_meta)) {
 			obfuscate_parent_pointer(name, value, asfep->valuelen);
 		} else if (want_obfuscate_attr(asfep->flags, name, namelen,
-					value, asfep->valuelen)) {
+					value, asfep->valuelen, is_meta)) {
 			generate_obfuscated_name(0, asfep->namelen, name);
 			memset(value, 'v', asfep->valuelen);
 		}
@@ -1463,7 +1473,8 @@ static void
 process_dir_data_block(
 	char		*block,
 	xfs_fileoff_t	offset,
-	int		is_block_format)
+	int		is_block_format,
+	bool		is_meta)
 {
 	/*
 	 * we have to rely on the fileoffset and signature of the block to
@@ -1570,7 +1581,7 @@ process_dir_data_block(
 				dir_offset)
 			return;
 
-		if (metadump.obfuscate)
+		if (want_obfuscate_dirents(is_meta))
 			generate_obfuscated_name(be64_to_cpu(dep->inumber),
 					 dep->namelen, &dep->name[0]);
 		dir_offset += length;
@@ -1595,7 +1606,8 @@ process_symlink_block(
 	xfs_fsblock_t	s,
 	xfs_filblks_t	c,
 	typnm_t		btype,
-	xfs_fileoff_t	last)
+	xfs_fileoff_t	last,
+	bool		is_meta)
 {
 	struct bbmap	map;
 	char		*link;
@@ -1620,7 +1632,7 @@ process_symlink_block(
 	if (xfs_has_crc((mp)))
 		link += sizeof(struct xfs_dsymlink_hdr);
 
-	if (metadump.obfuscate)
+	if (want_obfuscate_dirents(is_meta))
 		obfuscate_path_components(link, XFS_SYMLINK_BUF_SPACE(mp,
 							mp->m_sb.sb_blocksize));
 	if (metadump.zero_stale_data) {
@@ -1671,7 +1683,8 @@ add_remote_vals(
 static void
 process_attr_block(
 	char				*block,
-	xfs_fileoff_t			offset)
+	xfs_fileoff_t			offset,
+	bool				is_meta)
 {
 	struct xfs_attr_leafblock	*leaf;
 	struct xfs_attr3_icleaf_hdr	hdr;
@@ -1750,11 +1763,11 @@ process_attr_block(
 
 			if (want_obfuscate_pptr(entry->flags, name,
 						local->namelen, value,
-						valuelen)) {
+						valuelen, is_meta)) {
 				obfuscate_parent_pointer(name, value, valuelen);
 			} else if (want_obfuscate_attr(entry->flags, name,
 						local->namelen, value,
-						valuelen)) {
+						valuelen, is_meta)) {
 				generate_obfuscated_name(0, local->namelen,
 						name);
 				memset(value, 'v', valuelen);
@@ -1776,7 +1789,7 @@ process_attr_block(
 						(long long)metadump.cur_ino);
 				break;
 			}
-			if (metadump.obfuscate) {
+			if (want_obfuscate_dirents(is_meta)) {
 				generate_obfuscated_name(0, remote->namelen,
 							 &remote->name[0]);
 				add_remote_vals(be32_to_cpu(remote->valueblk),
@@ -1809,7 +1822,8 @@ process_single_fsb_objects(
 	xfs_fsblock_t	s,
 	xfs_filblks_t	c,
 	typnm_t		btype,
-	xfs_fileoff_t	last)
+	xfs_fileoff_t	last,
+	bool		is_meta)
 {
 	int		rval = 1;
 	char		*dp;
@@ -1879,12 +1893,13 @@ process_single_fsb_objects(
 				process_dir_leaf_block(dp);
 			} else {
 				process_dir_data_block(dp, o,
-					 last == mp->m_dir_geo->fsbcount);
+					 last == mp->m_dir_geo->fsbcount,
+					 is_meta);
 			}
 			iocur_top->need_crc = 1;
 			break;
 		case TYP_ATTR:
-			process_attr_block(dp, o);
+			process_attr_block(dp, o, is_meta);
 			iocur_top->need_crc = 1;
 			break;
 		default:
@@ -1917,7 +1932,8 @@ process_multi_fsb_dir(
 	xfs_fsblock_t	s,
 	xfs_filblks_t	c,
 	typnm_t		btype,
-	xfs_fileoff_t	last)
+	xfs_fileoff_t	last,
+	bool		is_meta)
 {
 	char		*dp;
 	int		rval = 1;
@@ -1961,7 +1977,8 @@ process_multi_fsb_dir(
 				process_dir_leaf_block(dp);
 			} else {
 				process_dir_data_block(dp, o,
-					 last == mp->m_dir_geo->fsbcount);
+					 last == mp->m_dir_geo->fsbcount,
+					 is_meta);
 			}
 			iocur_top->need_crc = 1;
 write:
@@ -1998,13 +2015,14 @@ process_multi_fsb_objects(
 	xfs_fsblock_t	s,
 	xfs_filblks_t	c,
 	typnm_t		btype,
-	xfs_fileoff_t	last)
+	xfs_fileoff_t	last,
+	bool		is_meta)
 {
 	switch (btype) {
 	case TYP_DIR2:
-		return process_multi_fsb_dir(o, s, c, btype, last);
+		return process_multi_fsb_dir(o, s, c, btype, last, is_meta);
 	case TYP_SYMLINK:
-		return process_symlink_block(o, s, c, btype, last);
+		return process_symlink_block(o, s, c, btype, last, is_meta);
 	default:
 		print_warning("bad type for multi-fsb object %d", btype);
 		return 1;
@@ -2016,7 +2034,8 @@ static int
 process_bmbt_reclist(
 	xfs_bmbt_rec_t		*rp,
 	int			numrecs,
-	typnm_t			btype)
+	typnm_t			btype,
+	bool			is_meta)
 {
 	int			i;
 	xfs_fileoff_t		o, op = NULLFILEOFF;
@@ -2096,10 +2115,10 @@ process_bmbt_reclist(
 		/* multi-extent blocks require special handling */
 		if (is_multi_fsb)
 			rval = process_multi_fsb_objects(o, s, c, btype,
-					last);
+					last, is_meta);
 		else
 			rval = process_single_fsb_objects(o, s, c, btype,
-					last);
+					last, is_meta);
 		if (!rval)
 			break;
 	}
@@ -2107,6 +2126,11 @@ process_bmbt_reclist(
 	return rval;
 }
 
+struct scan_bmap {
+	enum typnm	typ;
+	bool		is_meta;
+};
+
 static int
 scanfunc_bmap(
 	struct xfs_btree_block	*block,
@@ -2116,6 +2140,7 @@ scanfunc_bmap(
 	typnm_t			btype,
 	void			*arg)	/* ptr to itype */
 {
+	struct scan_bmap	*sbm = arg;
 	int			i;
 	xfs_bmbt_ptr_t		*pp;
 	int			nrecs;
@@ -2131,7 +2156,7 @@ scanfunc_bmap(
 			return 1;
 		}
 		return process_bmbt_reclist(XFS_BMBT_REC_ADDR(mp, block, 1),
-					    nrecs, *(typnm_t*)arg);
+					    nrecs, sbm->typ, sbm->is_meta);
 	}
 
 	if (nrecs > mp->m_bmap_dmxr[1]) {
@@ -2163,6 +2188,15 @@ scanfunc_bmap(
 	return 1;
 }
 
+static inline bool
+is_metadata_ino(
+	struct xfs_dinode	*dip)
+{
+	return xfs_has_metadir(mp) &&
+			dip->di_version >= 3 &&
+			(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR));
+}
+
 static int
 process_btinode(
 	struct xfs_dinode 	*dip,
@@ -2176,6 +2210,7 @@ process_btinode(
 	int			maxrecs;
 	int			whichfork;
 	typnm_t			btype;
+	bool			is_meta = is_metadata_ino(dip);
 
 	whichfork = (itype == TYP_ATTR) ? XFS_ATTR_FORK : XFS_DATA_FORK;
 	btype = (itype == TYP_ATTR) ? TYP_BMAPBTA : TYP_BMAPBTD;
@@ -2194,7 +2229,7 @@ process_btinode(
 
 	if (level == 0) {
 		return process_bmbt_reclist(XFS_BMDR_REC_ADDR(dib, 1),
-					    nrecs, itype);
+					    nrecs, itype, is_meta);
 	}
 
 	maxrecs = libxfs_bmdr_maxrecs(XFS_DFORK_SIZE(dip, mp, whichfork), 0);
@@ -2221,6 +2256,10 @@ process_btinode(
 	}
 
 	for (i = 0; i < nrecs; i++) {
+		struct scan_bmap	sbm = {
+			.typ = itype,
+			.is_meta = is_meta,
+		};
 		xfs_agnumber_t	ag;
 		xfs_agblock_t	bno;
 
@@ -2237,7 +2276,7 @@ process_btinode(
 			continue;
 		}
 
-		if (!scan_btree(ag, bno, level, btype, &itype, scanfunc_bmap))
+		if (!scan_btree(ag, bno, level, btype, &sbm, scanfunc_bmap))
 			return 0;
 	}
 	return 1;
@@ -2251,6 +2290,7 @@ process_exinode(
 	int			whichfork;
 	int			used;
 	xfs_extnum_t		nex, max_nex;
+	bool			is_meta = is_metadata_ino(dip);
 
 	whichfork = (itype == TYP_ATTR) ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
@@ -2275,7 +2315,7 @@ process_exinode(
 
 
 	return process_bmbt_reclist((xfs_bmbt_rec_t *)XFS_DFORK_PTR(dip,
-					whichfork), nex, itype);
+					whichfork), nex, itype, is_meta);
 }
 
 static int
@@ -2283,6 +2323,8 @@ process_inode_data(
 	struct xfs_dinode	*dip,
 	typnm_t			itype)
 {
+	bool			is_meta = is_metadata_ino(dip);
+
 	switch (dip->di_format) {
 		case XFS_DINODE_FMT_LOCAL:
 			if (!(metadump.obfuscate || metadump.zero_stale_data))
@@ -2303,7 +2345,7 @@ process_inode_data(
 
 			switch (itype) {
 				case TYP_DIR2:
-					process_sf_dir(dip);
+					process_sf_dir(dip, is_meta);
 					break;
 
 				case TYP_SYMLINK:
@@ -2421,13 +2463,15 @@ process_inode(
 
 	/* copy extended attributes if they exist and forkoff is valid */
 	if (XFS_DFORK_DSIZE(dip, mp) < XFS_LITINO(mp)) {
+		bool	is_meta = is_metadata_ino(dip);
+
 		attr_data.remote_val_count = 0;
 		switch (dip->di_aformat) {
 			case XFS_DINODE_FMT_LOCAL:
 				need_new_crc = true;
 				if (metadump.obfuscate ||
 				    metadump.zero_stale_data)
-					process_sf_attr(dip);
+					process_sf_attr(dip, is_meta);
 				break;
 
 			case XFS_DINODE_FMT_EXTENTS:


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/58] xfs_db: support metadata directories in the path command
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-31 23:36   ` [PATCH 28/58] xfs_db: don't obfuscate metadata directories and attributes Darrick J. Wong
@ 2023-12-31 23:37   ` Darrick J. Wong
  2023-12-31 23:37   ` [PATCH 30/58] xfs_db: mask superblock fields when metadir feature is enabled Darrick J. Wong
                     ` (28 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the path command to traverse the metadata directory tree by
passing a '\' as the first letter in the path.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/namei.c        |   71 ++++++++++++++++++++++++++++++++++++++++++++---------
 man/man8/xfs_db.8 |   23 ++++++++++++++---
 2 files changed, 78 insertions(+), 16 deletions(-)


diff --git a/db/namei.c b/db/namei.c
index e75179b2c67..db467141f13 100644
--- a/db/namei.c
+++ b/db/namei.c
@@ -139,11 +139,11 @@ path_navigate(
 /* Walk a directory path to an inode and set the io cursor to that inode. */
 static int
 path_walk(
+	xfs_ino_t	rootino,
 	char		*path)
 {
 	struct dirpath	*dirpath;
 	char		*p = path;
-	xfs_ino_t	rootino = mp->m_sb.sb_rootino;
 	int		error = 0;
 
 	if (*p == '/') {
@@ -173,6 +173,9 @@ path_help(void)
 	dbprintf(_(
 "\n"
 " Navigate to an inode via directory path.\n"
+"\n"
+" Options:\n"
+"   -m -- Walk an absolute path down the metadata directory tree.\n"
 	));
 }
 
@@ -181,18 +184,34 @@ path_f(
 	int		argc,
 	char		**argv)
 {
+	xfs_ino_t	rootino = mp->m_sb.sb_rootino;
 	int		c;
 	int		error;
 
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "m")) != -1) {
 		switch (c) {
+		case 'm':
+			/* Absolute path, start from metadata rootdir. */
+			if (!xfs_has_metadir(mp)) {
+				dbprintf(
+	_("filesystem does not support metadata directories.\n"));
+				exitcode = 1;
+				return 0;
+			}
+			rootino = mp->m_sb.sb_metadirino;
+			break;
 		default:
 			path_help();
 			return 0;
 		}
 	}
 
-	error = path_walk(argv[optind]);
+	if (argc == optind || argc > optind + 1) {
+		dbprintf(_("Only supply one path.\n"));
+		return -1;
+	}
+
+	error = path_walk(rootino, argv[optind]);
 	if (error) {
 		dbprintf("%s: %s\n", argv[optind], strerror(error));
 		exitcode = 1;
@@ -206,7 +225,7 @@ static struct cmdinfo path_cmd = {
 	.altname	= NULL,
 	.cfunc		= path_f,
 	.argmin		= 1,
-	.argmax		= 1,
+	.argmax		= -1,
 	.canpush	= 0,
 	.args		= "",
 	.help		= path_help,
@@ -521,6 +540,7 @@ ls_help(void)
 " Options:\n"
 "   -i -- Resolve the given paths to their corresponding inode numbers.\n"
 "         If no paths are given, display the current inode number.\n"
+"   -m -- Walk an absolute path down the metadata directory tree.\n"
 "\n"
 " Directory contents will be listed in the format:\n"
 " dir_cookie	inode_number	type	hash	name_length	name\n"
@@ -532,15 +552,26 @@ ls_f(
 	int			argc,
 	char			**argv)
 {
+	xfs_ino_t		rootino = mp->m_sb.sb_rootino;
 	bool			inum_only = false;
 	int			c;
 	int			error = 0;
 
-	while ((c = getopt(argc, argv, "i")) != -1) {
+	while ((c = getopt(argc, argv, "im")) != -1) {
 		switch (c) {
 		case 'i':
 			inum_only = true;
 			break;
+		case 'm':
+			/* Absolute path, start from metadata rootdir. */
+			if (!xfs_has_metadir(mp)) {
+				dbprintf(
+	_("filesystem does not support metadata directories.\n"));
+				exitcode = 1;
+				return 0;
+			}
+			rootino = mp->m_sb.sb_metadirino;
+			break;
 		default:
 			ls_help();
 			return 0;
@@ -563,7 +594,7 @@ ls_f(
 	for (c = optind; c < argc; c++) {
 		push_cur();
 
-		error = path_walk(argv[c]);
+		error = path_walk(rootino, argv[c]);
 		if (error)
 			goto err_cur;
 
@@ -874,11 +905,22 @@ parent_f(
 	int			argc,
 	char			**argv)
 {
+	xfs_ino_t		rootino = mp->m_sb.sb_rootino;
 	int			c;
 	int			error = 0;
 
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "m")) != -1) {
 		switch (c) {
+		case 'm':
+			/* Absolute path, start from metadata rootdir. */
+			if (!xfs_has_metadir(mp)) {
+				dbprintf(
+	_("filesystem does not support metadata directories.\n"));
+				exitcode = 1;
+				return 0;
+			}
+			rootino = mp->m_sb.sb_metadirino;
+			break;
 		default:
 			ls_help();
 			return 0;
@@ -898,7 +940,7 @@ parent_f(
 	for (c = optind; c < argc; c++) {
 		push_cur();
 
-		error = path_walk(argv[c]);
+		error = path_walk(rootino, argv[c]);
 		if (error)
 			goto err_cur;
 
@@ -926,7 +968,7 @@ static struct cmdinfo parent_cmd = {
 	.argmin		= 0,
 	.argmax		= -1,
 	.canpush	= 0,
-	.args		= "[paths...]",
+	.args		= "[-m] [paths...]",
 	.help		= parent_help,
 };
 
@@ -940,6 +982,7 @@ link_help(void)
 "\n"
 " Options:\n"
 "   -i   -- Point to this specific inode number.\n"
+"   -m   -- Select the metadata directory tree.\n"
 "   -p   -- Point to the inode given by this path.\n"
 "   -t   -- Set the file type to this value.\n"
 "   name -- Create this directory entry with this name.\n"
@@ -1051,11 +1094,12 @@ link_f(
 {
 	xfs_ino_t		child_ino = NULLFSINO;
 	int			ftype = XFS_DIR3_FT_UNKNOWN;
+	xfs_ino_t		rootino = mp->m_sb.sb_rootino;
 	unsigned int		i;
 	int			c;
 	int			error = 0;
 
-	while ((c = getopt(argc, argv, "i:p:t:")) != -1) {
+	while ((c = getopt(argc, argv, "i:mp:t:")) != -1) {
 		switch (c) {
 		case 'i':
 			errno = 0;
@@ -1066,9 +1110,12 @@ link_f(
 				return 0;
 			}
 			break;
+		case 'm':
+			rootino = mp->m_sb.sb_metadirino;
+			break;
 		case 'p':
 			push_cur();
-			error = path_walk(optarg);
+			error = path_walk(rootino, optarg);
 			if (error) {
 				printf("%s: %s\n", optarg, strerror(error));
 				exitcode = 1;
@@ -1138,7 +1185,7 @@ static struct cmdinfo link_cmd = {
 	.argmin		= 0,
 	.argmax		= -1,
 	.canpush	= 0,
-	.args		= "[-i ino] [-p path] [-t ftype] name",
+	.args		= "[-i ino] [-m] [-p path] [-t ftype] name",
 	.help		= link_help,
 };
 
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 638a8dc9352..b1712a92f76 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -884,7 +884,7 @@ will result in truncation and a warning will be issued. If no
 .I label
 is given, the current filesystem label is printed.
 .TP
-.BI "link [-i " ino "] [-p " path "] [-t " ftype "] name"
+.BI "link [-i " ino "] [-m] [-p " path "] [-t " ftype "] name"
 In the current directory, create a directory entry with the given
 .I name
 pointing to a file.
@@ -897,6 +897,10 @@ The file type in the directory entry will be determined from the mode of the
 child file unless the
 .I ftype
 option is given.
+The
+.B -m
+option specifies that the path lookup should be done in the metadata directory
+tree.
 The file being targetted must not be on the iunlink list.
 .TP
 .BI "log [stop | start " filename ]
@@ -917,7 +921,7 @@ This makes it easier to find discrepancies in the reservation calculations
 between xfsprogs and the kernel, which will help when diagnosing minimum
 log size calculation errors.
 .TP
-.BI "ls [\-i] [" paths "]..."
+.BI "ls [\-im] [" paths "]..."
 List the contents of a directory.
 If a path resolves to a directory, the directory will be listed.
 If no paths are supplied and the IO cursor points at a directory inode,
@@ -931,6 +935,9 @@ directory cookie, inode number, file type, hash, name length, name.
 Resolve each of the given paths to an inode number and print that number.
 If no paths are given and the IO cursor points to an inode, print the inode
 number.
+.TP
+.B \-m
+Absolute paths should be walked from the root of the metadata directory tree.
 .RE
 .TP
 .BI "metadump [\-egow] " filename
@@ -958,18 +965,26 @@ See the
 .B print
 command.
 .TP
-.BI "parent [" paths "]..."
+.BI "parent [\-m] [" paths "]..."
 List the parents of a file.
 If a path resolves to a file, the parents of that file will be listed.
 If no paths are supplied and the IO cursor points at an inode, the parents of
 that file will be listed.
+The
+.B \-m
+option causes absolute paths to be walked from the root of the metadata
+directory tree.
 
 The output format is:
 inode number, inode generation, ondisk namehash, namehash, name length, name.
 .TP
-.BI "path " dir_path
+.BI "path [\-m] " dir_path
 Walk the directory tree to an inode using the supplied path.
 Absolute and relative paths are supported.
+The
+.B \-m
+option causes absolute paths to be walked from the root of the metadata
+directory tree.
 .TP
 .B pop
 Pop location from the stack.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/58] xfs_db: mask superblock fields when metadir feature is enabled
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-31 23:37   ` [PATCH 29/58] xfs_db: support metadata directories in the path command Darrick J. Wong
@ 2023-12-31 23:37   ` Darrick J. Wong
  2023-12-31 23:37   ` [PATCH 31/58] xfs_io: support the bulkstat metadata directory flag Darrick J. Wong
                     ` (27 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When the metadata directory feature is enabled, mask the superblock
fields (rt, quota inodes) that got migrated to the directory tree.
Similarly, hide the 'metadirino' field when the feature is disabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/sb.c |   41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)


diff --git a/db/sb.c b/db/sb.c
index 002736b02b7..2ad032cc81a 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -50,6 +50,30 @@ sb_init(void)
 	add_command(&version_cmd);
 }
 
+/*
+ * Counts superblock fields that only exist when the metadata directory feature
+ * is enabled.
+ */
+static int
+metadirino_count(
+	void		*obj,
+	int		startoff)
+{
+	return xfs_has_metadir(mp) ? 1 : 0;
+}
+
+/*
+ * Counts superblock fields that only existed before the metadata directory
+ * feature came along.
+ */
+static int
+rootino_count(
+	void		*obj,
+	int		startoff)
+{
+	return xfs_has_metadir(mp) ? 0 : 1;
+}
+
 #define	OFF(f)	bitize(offsetof(struct xfs_dsb, sb_ ## f))
 #define	SZC(f)	szcount(struct xfs_dsb, sb_ ## f)
 const field_t	sb_flds[] = {
@@ -61,8 +85,12 @@ const field_t	sb_flds[] = {
 	{ "uuid", FLDT_UUID, OI(OFF(uuid)), C1, 0, TYP_NONE },
 	{ "logstart", FLDT_DFSBNO, OI(OFF(logstart)), C1, 0, TYP_LOG },
 	{ "rootino", FLDT_INO, OI(OFF(rootino)), C1, 0, TYP_INODE },
-	{ "rbmino", FLDT_INO, OI(OFF(rbmino)), C1, 0, TYP_INODE },
-	{ "rsumino", FLDT_INO, OI(OFF(rsumino)), C1, 0, TYP_INODE },
+	{ "metadirino", FLDT_INO, OI(OFF(rbmino)), metadirino_count,
+	  FLD_COUNT, TYP_INODE },
+	{ "rbmino", FLDT_INO, OI(OFF(rbmino)), rootino_count, FLD_COUNT,
+	  TYP_INODE },
+	{ "rsumino", FLDT_INO, OI(OFF(rsumino)), rootino_count, FLD_COUNT,
+	  TYP_INODE },
 	{ "rextsize", FLDT_AGBLOCK, OI(OFF(rextsize)), C1, 0, TYP_NONE },
 	{ "agblocks", FLDT_AGBLOCK, OI(OFF(agblocks)), C1, 0, TYP_NONE },
 	{ "agcount", FLDT_AGNUMBER, OI(OFF(agcount)), C1, 0, TYP_NONE },
@@ -85,8 +113,10 @@ const field_t	sb_flds[] = {
 	{ "ifree", FLDT_UINT64D, OI(OFF(ifree)), C1, 0, TYP_NONE },
 	{ "fdblocks", FLDT_UINT64D, OI(OFF(fdblocks)), C1, 0, TYP_NONE },
 	{ "frextents", FLDT_UINT64D, OI(OFF(frextents)), C1, 0, TYP_NONE },
-	{ "uquotino", FLDT_INO, OI(OFF(uquotino)), C1, 0, TYP_INODE },
-	{ "gquotino", FLDT_INO, OI(OFF(gquotino)), C1, 0, TYP_INODE },
+	{ "uquotino", FLDT_INO, OI(OFF(uquotino)), rootino_count, FLD_COUNT,
+	  TYP_INODE },
+	{ "gquotino", FLDT_INO, OI(OFF(gquotino)), rootino_count, FLD_COUNT,
+	  TYP_INODE },
 	{ "qflags", FLDT_UINT16X, OI(OFF(qflags)), C1, 0, TYP_NONE },
 	{ "flags", FLDT_UINT8X, OI(OFF(flags)), C1, 0, TYP_NONE },
 	{ "shared_vn", FLDT_UINT8D, OI(OFF(shared_vn)), C1, 0, TYP_NONE },
@@ -110,7 +140,8 @@ const field_t	sb_flds[] = {
 		C1, 0, TYP_NONE },
 	{ "crc", FLDT_CRC, OI(OFF(crc)), C1, 0, TYP_NONE },
 	{ "spino_align", FLDT_EXTLEN, OI(OFF(spino_align)), C1, 0, TYP_NONE },
-	{ "pquotino", FLDT_INO, OI(OFF(pquotino)), C1, 0, TYP_INODE },
+	{ "pquotino", FLDT_INO, OI(OFF(pquotino)), rootino_count, FLD_COUNT,
+	  TYP_INODE },
 	{ "lsn", FLDT_UINT64X, OI(OFF(lsn)), C1, 0, TYP_NONE },
 	{ "meta_uuid", FLDT_UUID, OI(OFF(meta_uuid)), C1, 0, TYP_NONE },
 	{ NULL }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/58] xfs_io: support the bulkstat metadata directory flag
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-31 23:37   ` [PATCH 30/58] xfs_db: mask superblock fields when metadir feature is enabled Darrick J. Wong
@ 2023-12-31 23:37   ` Darrick J. Wong
  2023-12-31 23:37   ` [PATCH 32/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
                     ` (26 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support the new XFS_BULK_IREQ_METADIR flag for bulkstat commands.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/bulkstat.c     |   16 +++++++++++++++-
 man/man8/xfs_io.8 |   10 +++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)


diff --git a/io/bulkstat.c b/io/bulkstat.c
index a9ad87ca183..829f6a02515 100644
--- a/io/bulkstat.c
+++ b/io/bulkstat.c
@@ -70,6 +70,7 @@ bulkstat_help(void)
 "   -d         Print debugging output.\n"
 "   -q         Be quiet, no output.\n"
 "   -e <ino>   Stop after this inode.\n"
+"   -m         Include metadata directories.\n"
 "   -n <nr>    Ask for this many results at once.\n"
 "   -s <ino>   Inode to start with.\n"
 "   -v <ver>   Use this version of the ioctl (1 or 5).\n"));
@@ -107,11 +108,12 @@ bulkstat_f(
 	bool			has_agno = false;
 	bool			debug = false;
 	bool			quiet = false;
+	bool			metadir = false;
 	unsigned int		i;
 	int			c;
 	int			ret;
 
-	while ((c = getopt(argc, argv, "a:de:n:qs:v:")) != -1) {
+	while ((c = getopt(argc, argv, "a:de:mn:qs:v:")) != -1) {
 		switch (c) {
 		case 'a':
 			agno = cvt_u32(optarg, 10);
@@ -131,6 +133,9 @@ bulkstat_f(
 				return 1;
 			}
 			break;
+		case 'm':
+			metadir = true;
+			break;
 		case 'n':
 			batch_size = cvt_u32(optarg, 10);
 			if (errno) {
@@ -185,6 +190,8 @@ bulkstat_f(
 
 	if (has_agno)
 		xfrog_bulkstat_set_ag(breq, agno);
+	if (metadir)
+		breq->hdr.flags |= XFS_BULK_IREQ_METADIR;
 
 	set_xfd_flags(&xfd, ver);
 
@@ -253,6 +260,7 @@ bulkstat_single_f(
 	unsigned long		ver = 0;
 	unsigned int		i;
 	bool			debug = false;
+	bool			metadir = false;
 	int			c;
 	int			ret;
 
@@ -261,6 +269,9 @@ bulkstat_single_f(
 		case 'd':
 			debug = true;
 			break;
+		case 'm':
+			metadir = true;
+			break;
 		case 'v':
 			errno = 0;
 			ver = strtoull(optarg, NULL, 10);
@@ -313,6 +324,9 @@ bulkstat_single_f(
 			}
 		}
 
+		if (metadir)
+			flags |= XFS_BULK_IREQ_METADIR;
+
 		ret = -xfrog_bulkstat_single(&xfd, ino, flags, &bulkstat);
 		if (ret) {
 			xfrog_perror(ret, "xfrog_bulkstat_single");
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 5a6b2724504..1975f5d1011 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1240,7 +1240,7 @@ for the current memory mapping.
 
 .SH FILESYSTEM COMMANDS
 .TP
-.BI "bulkstat [ \-a " agno " ] [ \-d ] [ \-e " endino " ] [ \-n " batchsize " ] [ \-q ] [ \-s " startino " ] [ \-v " version" ]
+.BI "bulkstat [ \-a " agno " ] [ \-d ] [ \-e " endino " ] [ \-m ] [ \-n " batchsize " ] [ \-q ] [ \-s " startino " ] [ \-v " version" ]
 Display raw stat information about a bunch of inodes in an XFS filesystem.
 Options are as follows:
 .RS 1.0i
@@ -1257,6 +1257,9 @@ Print debugging information about call results.
 Stop displaying records when this inode number is reached.
 Defaults to stopping when the system call stops returning results.
 .TP
+.BI \-m
+Include metadata directories in the output.
+.TP
 .BI \-n " batchsize"
 Retrieve at most this many records per call.
 Defaults to 4,096.
@@ -1277,10 +1280,11 @@ Currently supported versions are 1 and 5.
 .RE
 .PD
 .TP
-.BI "bulkstat_single [ \-d ] [ \-v " version " ] [ " inum... " | " special... " ]
+.BI "bulkstat_single [ \-d ] [ \-m ] [ \-v " version " ] [ " inum... " | " special... " ]
 Display raw stat information about individual inodes in an XFS filesystem.
 The
-.B \-d
+.BR \-d ,
+.BR \-m ,
 and
 .B \-v
 options are the same as the


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/58] xfs_io: support scrubbing metadata directory paths
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-31 23:37   ` [PATCH 31/58] xfs_io: support the bulkstat metadata directory flag Darrick J. Wong
@ 2023-12-31 23:37   ` Darrick J. Wong
  2023-12-31 23:38   ` [PATCH 33/58] xfs_spaceman: report health of metadir inodes too Darrick J. Wong
                     ` (25 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:37 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support invoking the metadata directory path scrubber from xfs_io for
testing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/scrub.c        |   62 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 man/man8/xfs_io.8 |    3 ++-
 2 files changed, 62 insertions(+), 3 deletions(-)


diff --git a/io/scrub.c b/io/scrub.c
index 456d1594f22..cb4e24503dc 100644
--- a/io/scrub.c
+++ b/io/scrub.c
@@ -41,6 +41,12 @@ scrub_help(void)
 " Known metadata scrub types are:"));
 	for (i = 0, d = xfrog_scrubbers; i < XFS_SCRUB_TYPE_NR; i++, d++)
 		printf(" %s", d->name);
+	printf(_(
+"\n"
+"\n"
+" Known metapath scrub arguments are:"));
+	for (i = 0, d = xfrog_metapaths; i < XFS_SCRUB_METAPATH_NR; i++, d++)
+		printf(" %s", d->name);
 	printf("\n");
 }
 
@@ -125,6 +131,40 @@ parse_none(
 	return true;
 }
 
+static bool
+parse_metapath(
+	int		argc,
+	char		**argv,
+	int		optind,
+	__u64		*ino)
+{
+	char		*p;
+	unsigned long long control;
+	int		i;
+
+	if (optind != argc - 1) {
+		fprintf(stderr, _("Must specify metapath number.\n"));
+		return false;
+	}
+
+	for (i = 0; i < XFS_SCRUB_METAPATH_NR; i++) {
+		if (!strcmp(argv[optind], xfrog_metapaths[i].name)) {
+			*ino = i;
+			return true;
+		}
+	}
+
+	control = strtoll(argv[optind], &p, 0);
+	if (*p != '\0') {
+		fprintf(stderr, _("Bad metapath number '%s'.\n"),
+				argv[optind]);
+		return false;
+	}
+
+	*ino = control;
+	return true;
+}
+
 static int
 parse_args(
 	int				argc,
@@ -170,6 +210,12 @@ parse_args(
 	meta->sm_flags = flags;
 
 	switch (d->group) {
+	case XFROG_SCRUB_GROUP_METAPATH:
+		if (!parse_metapath(argc, argv, optind, &meta->sm_ino)) {
+			exitcode = 1;
+			return command_usage(cmdinfo);
+		}
+		break;
 	case XFROG_SCRUB_GROUP_INODE:
 		if (!parse_inode(argc, argv, optind, &meta->sm_ino,
 						     &meta->sm_gen)) {
@@ -244,7 +290,7 @@ scrub_init(void)
 	scrub_cmd.argmin = 1;
 	scrub_cmd.argmax = -1;
 	scrub_cmd.flags = CMD_NOMAP_OK;
-	scrub_cmd.args = _("type [agno|ino gen]");
+	scrub_cmd.args = _("type [agno|ino gen|metapath]");
 	scrub_cmd.oneline = _("scrubs filesystem metadata");
 	scrub_cmd.help = scrub_help;
 
@@ -275,6 +321,12 @@ repair_help(void)
 " Known metadata repair types are:"));
 	for (i = 0, d = xfrog_scrubbers; i < XFS_SCRUB_TYPE_NR; i++, d++)
 		printf(" %s", d->name);
+	printf(_(
+"\n"
+"\n"
+" Known metapath repair arguments are:"));
+	for (i = 0, d = xfrog_metapaths; i < XFS_SCRUB_METAPATH_NR; i++, d++)
+		printf(" %s", d->name);
 	printf("\n");
 }
 
@@ -327,7 +379,7 @@ repair_init(void)
 	repair_cmd.argmin = 1;
 	repair_cmd.argmax = -1;
 	repair_cmd.flags = CMD_NOMAP_OK;
-	repair_cmd.args = _("type [agno|ino gen]");
+	repair_cmd.args = _("type [agno|ino gen|metapath]");
 	repair_cmd.oneline = _("repairs filesystem metadata");
 	repair_cmd.help = repair_help;
 
@@ -500,6 +552,12 @@ scrubv_f(
 	optind++;
 
 	switch (group) {
+	case XFROG_SCRUB_GROUP_METAPATH:
+		if (!parse_metapath(argc, argv, optind, &vhead->svh_ino)) {
+			exitcode = 1;
+			return command_usage(&scrubv_cmd);
+		}
+		break;
 	case XFROG_SCRUB_GROUP_INODE:
 		if (!parse_inode(argc, argv, optind, &vhead->svh_ino,
 						     &vhead->svh_gen)) {
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 1975f5d1011..ef3a5681a1a 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1422,13 +1422,14 @@ Currently supported versions are 1 and 5.
 .RE
 .PD
 .TP
-.BI "scrub " type " [ " agnumber " | " "ino" " " "gen" " ]"
+.BI "scrub " type " [ " agnumber " | " "ino" " " "gen" " | " metapath " ]"
 Scrub internal XFS filesystem metadata.  The
 .BI type
 parameter specifies which type of metadata to scrub.
 For AG metadata, one AG number must be specified.
 For file metadata, the scrub is applied to the open file unless the
 inode number and generation number are specified.
+For metapath, the name of a file or a raw number must be specified.
 .RE
 .PD
 .TP


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/58] xfs_spaceman: report health of metadir inodes too
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-31 23:37   ` [PATCH 32/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
@ 2023-12-31 23:38   ` Darrick J. Wong
  2023-12-31 23:38   ` [PATCH 34/58] xfs_scrub: scan metadata directories during phase 3 Darrick J. Wong
                     ` (24 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:38 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If the filesystem has a metadata directory tree, we should include those
inodes in the health report.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 spaceman/health.c |    2 ++
 1 file changed, 2 insertions(+)


diff --git a/spaceman/health.c b/spaceman/health.c
index 8ba78152cb6..47cf9099492 100644
--- a/spaceman/health.c
+++ b/spaceman/health.c
@@ -328,6 +328,8 @@ report_bulkstat_health(
 
 	if (agno != NULLAGNUMBER)
 		xfrog_bulkstat_set_ag(breq, agno);
+	if (file->xfd.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
+		breq->hdr.flags |= XFS_BULK_IREQ_METADIR;
 
 	do {
 		error = -xfrog_bulkstat(&file->xfd, breq);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/58] xfs_scrub: scan metadata directories during phase 3
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-31 23:38   ` [PATCH 33/58] xfs_spaceman: report health of metadir inodes too Darrick J. Wong
@ 2023-12-31 23:38   ` Darrick J. Wong
  2023-12-31 23:38   ` [PATCH 35/58] xfs_scrub: re-run metafile scrubbers during phase 5 Darrick J. Wong
                     ` (23 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:38 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Scan metadata directories for correctness during phase 3.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/inodes.c |   11 ++++++++++-
 scrub/inodes.h |    5 ++++-
 scrub/phase3.c |    7 ++++++-
 scrub/phase5.c |    5 ++++-
 scrub/phase6.c |    2 +-
 5 files changed, 25 insertions(+), 5 deletions(-)


diff --git a/scrub/inodes.c b/scrub/inodes.c
index 16c79cf495c..3fe759e8f48 100644
--- a/scrub/inodes.c
+++ b/scrub/inodes.c
@@ -56,6 +56,7 @@ bulkstat_for_inumbers(
 {
 	struct xfs_bulkstat	*bstat = breq->bulkstat;
 	struct xfs_bulkstat	*bs;
+	unsigned int		flags = 0;
 	int			i;
 	int			error;
 
@@ -70,6 +71,9 @@ bulkstat_for_inumbers(
 			 strerror_r(error, errbuf, DESCR_BUFSZ));
 	}
 
+	if (breq->hdr.flags & XFS_BULK_IREQ_METADIR)
+		flags |= XFS_BULK_IREQ_METADIR;
+
 	/*
 	 * Check each of the stats we got back to make sure we got the inodes
 	 * we asked for.
@@ -84,7 +88,7 @@ bulkstat_for_inumbers(
 
 		/* Load the one inode. */
 		error = -xfrog_bulkstat_single(&ctx->mnt,
-				inumbers->xi_startino + i, 0, bs);
+				inumbers->xi_startino + i, flags, bs);
 		if (error || bs->bs_ino != inumbers->xi_startino + i) {
 			memset(bs, 0, sizeof(struct xfs_bulkstat));
 			bs->bs_ino = inumbers->xi_startino + i;
@@ -100,6 +104,7 @@ struct scan_inodes {
 	scrub_inode_iter_fn	fn;
 	void			*arg;
 	unsigned int		nr_threads;
+	unsigned int		flags;
 	bool			aborted;
 };
 
@@ -158,6 +163,8 @@ alloc_ichunk(
 
 	breq = ichunk_to_bulkstat(ichunk);
 	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+	if (si->flags & SCRUB_SCAN_METADIR)
+		breq->hdr.flags |= XFS_BULK_IREQ_METADIR;
 
 	*ichunkp = ichunk;
 	return 0;
@@ -380,10 +387,12 @@ int
 scrub_scan_all_inodes(
 	struct scrub_ctx	*ctx,
 	scrub_inode_iter_fn	fn,
+	unsigned int		flags,
 	void			*arg)
 {
 	struct scan_inodes	si = {
 		.fn		= fn,
+		.flags		= flags,
 		.arg		= arg,
 		.nr_threads	= scrub_nproc_workqueue(ctx),
 	};
diff --git a/scrub/inodes.h b/scrub/inodes.h
index 9447fb56aa6..7a0b275e575 100644
--- a/scrub/inodes.h
+++ b/scrub/inodes.h
@@ -17,8 +17,11 @@
 typedef int (*scrub_inode_iter_fn)(struct scrub_ctx *ctx,
 		struct xfs_handle *handle, struct xfs_bulkstat *bs, void *arg);
 
+/* Return metadata directories too. */
+#define SCRUB_SCAN_METADIR	(1 << 0)
+
 int scrub_scan_all_inodes(struct scrub_ctx *ctx, scrub_inode_iter_fn fn,
-		void *arg);
+		unsigned int flags, void *arg);
 
 int scrub_open_handle(struct xfs_handle *handle);
 
diff --git a/scrub/phase3.c b/scrub/phase3.c
index 046a42c1da8..c90da784394 100644
--- a/scrub/phase3.c
+++ b/scrub/phase3.c
@@ -312,6 +312,7 @@ phase3_func(
 	struct scrub_inode_ctx	ictx = { .ctx = ctx };
 	uint64_t		val;
 	xfs_agnumber_t		agno;
+	unsigned int		scan_flags = 0;
 	int			err;
 
 	err = -ptvar_alloc(scrub_nproc(ctx), sizeof(struct action_list),
@@ -328,6 +329,10 @@ phase3_func(
 		goto out_ptvar;
 	}
 
+	/* Scan the metadata directory tree too. */
+	if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
+		scan_flags |= SCRUB_SCAN_METADIR;
+
 	/*
 	 * If we already have ag/fs metadata to repair from previous phases,
 	 * we would rather not try to repair file metadata until we've tried
@@ -338,7 +343,7 @@ phase3_func(
 			ictx.always_defer_repairs = true;
 	}
 
-	err = scrub_scan_all_inodes(ctx, scrub_inode, &ictx);
+	err = scrub_scan_all_inodes(ctx, scrub_inode, scan_flags, &ictx);
 	if (!err && ictx.aborted)
 		err = ECANCELED;
 	if (err)
diff --git a/scrub/phase5.c b/scrub/phase5.c
index f6c295c64ad..b922f53398e 100644
--- a/scrub/phase5.c
+++ b/scrub/phase5.c
@@ -455,6 +455,9 @@ retry_deferred_inode(
 	unsigned int		flags = 0;
 	int			error;
 
+	if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
+		flags |= XFS_BULK_IREQ_METADIR;
+
 	error = -xfrog_bulkstat_single(&ctx->mnt, ino, flags, &bstat);
 	if (error == ENOENT) {
 		/* Directory is gone, mark it clear. */
@@ -765,7 +768,7 @@ _("Filesystem has errors, skipping connectivity checks."));
 
 	pthread_mutex_init(&ncs.lock, NULL);
 
-	ret = scrub_scan_all_inodes(ctx, check_inode_names, &ncs);
+	ret = scrub_scan_all_inodes(ctx, check_inode_names, 0, &ncs);
 	if (ret)
 		goto out_lock;
 	if (ncs.aborted) {
diff --git a/scrub/phase6.c b/scrub/phase6.c
index 66ca57507e9..b95966c1f82 100644
--- a/scrub/phase6.c
+++ b/scrub/phase6.c
@@ -558,7 +558,7 @@ report_all_media_errors(
 	}
 
 	/* Scan for unlinked files. */
-	return scrub_scan_all_inodes(ctx, report_inode_loss, vs);
+	return scrub_scan_all_inodes(ctx, report_inode_loss, 0, vs);
 }
 
 /* Schedule a read-verify of a (data block) extent. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/58] xfs_scrub: re-run metafile scrubbers during phase 5
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-31 23:38   ` [PATCH 34/58] xfs_scrub: scan metadata directories during phase 3 Darrick J. Wong
@ 2023-12-31 23:38   ` Darrick J. Wong
  2023-12-31 23:38   ` [PATCH 36/58] xfs_repair: don't zero the incore secondary super when zeroing Darrick J. Wong
                     ` (22 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:38 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

For metadata files on a metadir filesystem, re-run the scrubbers during
phase 5 to ensure that the metadata files are still connected.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase5.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 scrub/scrub.h  |    7 ++++
 2 files changed, 103 insertions(+), 1 deletion(-)


diff --git a/scrub/phase5.c b/scrub/phase5.c
index b922f53398e..3b8daf39ae6 100644
--- a/scrub/phase5.c
+++ b/scrub/phase5.c
@@ -738,6 +738,87 @@ run_kernel_fs_scan_scrubbers(
 	return ret;
 }
 
+/* Queue one metapath scrubber. */
+static int
+queue_metapath_scan(
+	struct workqueue	*wq,
+	bool			*abortedp,
+	uint64_t		type)
+{
+	struct fs_scan_item	*item;
+	struct scrub_ctx	*ctx = wq->wq_ctx;
+	int			ret;
+
+	item = malloc(sizeof(struct fs_scan_item));
+	if (!item) {
+		ret = ENOMEM;
+		str_liberror(ctx, ret, _("setting up metapath scan"));
+		return ret;
+	}
+	scrub_item_init_metapath(&item->sri, type);
+	scrub_item_schedule(&item->sri, XFS_SCRUB_TYPE_METAPATH);
+	item->abortedp = abortedp;
+
+	ret = -workqueue_add(wq, fs_scan_worker, 0, item);
+	if (ret)
+		str_liberror(ctx, ret, _("queuing metapath scan work"));
+
+	return ret;
+}
+
+/*
+ * Scrub metadata directory file paths to ensure that fs metadata are still
+ * connected where the fs needs to find them.
+ */
+static int
+run_kernel_metadir_path_scrubbers(
+	struct scrub_ctx	*ctx)
+{
+	struct workqueue	wq;
+	const struct xfrog_scrub_descr	*sc;
+	uint64_t		type;
+	unsigned int		nr_threads = scrub_nproc_workqueue(ctx);
+	bool			aborted = false;
+	int			ret, ret2;
+
+	ret = -workqueue_create(&wq, (struct xfs_mount *)ctx, nr_threads);
+	if (ret) {
+		str_liberror(ctx, ret, _("setting up metapath scan workqueue"));
+		return ret;
+	}
+
+	/*
+	 * Scan all the metadata files in parallel if metadata directories
+	 * are enabled, because the phase 3 scrubbers might have taken out
+	 * parts of the metadir tree.
+	 */
+	for (type = 0; type < XFS_SCRUB_METAPATH_NR; type++) {
+		sc = &xfrog_metapaths[type];
+		if (sc->group != XFROG_SCRUB_GROUP_FS)
+			continue;
+
+		ret = queue_metapath_scan(&wq, &aborted, type);
+		if (ret) {
+			str_liberror(ctx, ret,
+ _("queueing metapath scrub work"));
+			goto wait;
+		}
+	}
+
+wait:
+	ret2 = -workqueue_terminate(&wq);
+	if (ret2) {
+		str_liberror(ctx, ret2, _("joining metapath scan workqueue"));
+		if (!ret)
+			ret = ret2;
+	}
+	if (aborted && !ret)
+		ret = ECANCELED;
+
+	workqueue_destroy(&wq);
+	return ret;
+}
+
 /* Check directory connectivity. */
 int
 phase5_func(
@@ -746,6 +827,16 @@ phase5_func(
 	struct ncheck_state	ncs = { .ctx = ctx };
 	int			ret;
 
+	/*
+	 * Make sure metadata files are still connected to the metadata
+	 * directory tree now that phase 3 pruned all corrupt directory tree
+	 * links.
+	 */
+	if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR) {
+		ret = run_kernel_metadir_path_scrubbers(ctx);
+		if (ret)
+			return ret;
+	}
 
 	/*
 	 * Check and fix anything that requires a full filesystem scan.  We do
@@ -798,8 +889,12 @@ phase5_estimate(
 	unsigned int		*nr_threads,
 	int			*rshift)
 {
+	unsigned int		scans = 2;
+
 	*items = scrub_estimate_iscan_work(ctx);
-	*nr_threads = scrub_nproc(ctx) * 2;
+	if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
+		scans++;
+	*nr_threads = scrub_nproc(ctx) * scans;
 	*rshift = 0;
 	return 0;
 }
diff --git a/scrub/scrub.h b/scrub/scrub.h
index c3eed1b261d..3bb3ea1d07b 100644
--- a/scrub/scrub.h
+++ b/scrub/scrub.h
@@ -108,6 +108,13 @@ scrub_item_init_file(struct scrub_item *sri, const struct xfs_bulkstat *bstat)
 	sri->sri_gen = bstat->bs_gen;
 }
 
+static inline void
+scrub_item_init_metapath(struct scrub_item *sri, uint64_t metapath)
+{
+	memset(sri, 0, sizeof(*sri));
+	sri->sri_ino = metapath;
+}
+
 void scrub_item_dump(struct scrub_item *sri, unsigned int group_mask,
 		const char *tag);
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/58] xfs_repair: don't zero the incore secondary super when zeroing
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-31 23:38   ` [PATCH 35/58] xfs_scrub: re-run metafile scrubbers during phase 5 Darrick J. Wong
@ 2023-12-31 23:38   ` Darrick J. Wong
  2023-12-31 23:39   ` [PATCH 37/58] xfs_repair: refactor metadata inode tagging Darrick J. Wong
                     ` (21 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:38 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If secondary_sb_whack detects nonzero bytes beyond the end of the ondisk
superblock, it will try to zero the end of the ondisk buffer as well as
the incore superblock prior to scan_ag using that incore super to
rewrite the ondisk super.

However, the metadata directory feature adds a sb_metadirino field to
the incore super.  On disk, this is stored in the same slot as
sb_rbmino, but we wanted to cache both inumbers incore to minimize the
churn.  Therefore, it is now only safe to zero the "end" of an xfs_dsb
buffer, and never an xfs_sb object.

Most of the XFS codebase moved off that second behavior long ago, with
the exception of this one part of repair.  The zeroing probably ought to
be turned into explicit logic to zero fields that weren't defined with
the featureset encoded in the primary superblock, but for now we'll
resort to always resetting the values from the xfs_mount's xfs_sb.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/agheader.c |    7 +++++++
 1 file changed, 7 insertions(+)


diff --git a/repair/agheader.c b/repair/agheader.c
index 3930a0ac091..af88802ffdf 100644
--- a/repair/agheader.c
+++ b/repair/agheader.c
@@ -405,6 +405,13 @@ secondary_sb_whack(
 				mp->m_sb.sb_sectsize - size);
 			/* Preserve meta_uuid so we don't fail uuid checks */
 			memcpy(&sb->sb_meta_uuid, &tmpuuid, sizeof(uuid_t));
+
+			/*
+			 * Preserve the parts of the incore super that extend
+			 * beyond the part that's supposed to match the ondisk
+			 * super byte for byte.
+			 */
+			sb->sb_metadirino = mp->m_sb.sb_metadirino;
 		} else
 			do_warn(
 	_("would zero unused portion of %s superblock (AG #%u)\n"),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/58] xfs_repair: refactor metadata inode tagging
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-31 23:38   ` [PATCH 36/58] xfs_repair: don't zero the incore secondary super when zeroing Darrick J. Wong
@ 2023-12-31 23:39   ` Darrick J. Wong
  2023-12-31 23:39   ` [PATCH 38/58] xfs_repair: reject regular directory dirents that point to metadata fieles Darrick J. Wong
                     ` (20 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:39 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor tagging of metadata inodes into a single helper function
instead of open-coding a if-else statement.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dir2.c |   60 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 24 deletions(-)


diff --git a/repair/dir2.c b/repair/dir2.c
index e46ae9ae46f..9f10fde09a1 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -136,6 +136,31 @@ process_sf_dir2_fixoff(
 	}
 }
 
+static bool
+is_meta_ino(
+	struct xfs_mount	*mp,
+	xfs_ino_t		dirino,
+	xfs_ino_t		lino,
+	char			**junkreason)
+{
+	char			*reason = NULL;
+
+	if (lino == mp->m_sb.sb_rbmino)
+		reason = _("realtime bitmap");
+	else if (lino == mp->m_sb.sb_rsumino)
+		reason = _("realtime summary");
+	else if (lino == mp->m_sb.sb_uquotino)
+		reason = _("user quota");
+	else if (lino == mp->m_sb.sb_gquotino)
+		reason = _("group quota");
+	else if (lino == mp->m_sb.sb_pquotino)
+		reason = _("project quota");
+
+	if (reason)
+		*junkreason = reason;
+	return reason != NULL;
+}
+
 /*
  * this routine performs inode discovery and tries to fix things
  * in place.  available redundancy -- inode data size should match
@@ -227,21 +252,12 @@ process_sf_dir2(
 		} else if (!libxfs_verify_dir_ino(mp, lino)) {
 			junkit = 1;
 			junkreason = _("invalid");
-		} else if (lino == mp->m_sb.sb_rbmino)  {
+		} else if (is_meta_ino(mp, ino, lino, &junkreason)) {
+			/*
+			 * Directories that are not in the metadir tree should
+			 * not be linking to metadata files.
+			 */
 			junkit = 1;
-			junkreason = _("realtime bitmap");
-		} else if (lino == mp->m_sb.sb_rsumino)  {
-			junkit = 1;
-			junkreason = _("realtime summary");
-		} else if (lino == mp->m_sb.sb_uquotino)  {
-			junkit = 1;
-			junkreason = _("user quota");
-		} else if (lino == mp->m_sb.sb_gquotino)  {
-			junkit = 1;
-			junkreason = _("group quota");
-		} else if (lino == mp->m_sb.sb_pquotino)  {
-			junkit = 1;
-			junkreason = _("project quota");
 		} else if ((irec_p = find_inode_rec(mp,
 					XFS_INO_TO_AGNO(mp, lino),
 					XFS_INO_TO_AGINO(mp, lino))) != NULL) {
@@ -698,16 +714,12 @@ process_dir2_data(
 			 * directory since it's still structurally intact.
 			 */
 			clearreason = _("invalid");
-		} else if (ent_ino == mp->m_sb.sb_rbmino) {
-			clearreason = _("realtime bitmap");
-		} else if (ent_ino == mp->m_sb.sb_rsumino) {
-			clearreason = _("realtime summary");
-		} else if (ent_ino == mp->m_sb.sb_uquotino) {
-			clearreason = _("user quota");
-		} else if (ent_ino == mp->m_sb.sb_gquotino) {
-			clearreason = _("group quota");
-		} else if (ent_ino == mp->m_sb.sb_pquotino) {
-			clearreason = _("project quota");
+		} else if (is_meta_ino(mp, ino, ent_ino, &clearreason)) {
+			/*
+			 * Directories that are not in the metadir tree should
+			 * not be linking to metadata files.
+			 */
+			clearino = 1;
 		} else {
 			irec_p = find_inode_rec(mp,
 						XFS_INO_TO_AGNO(mp, ent_ino),


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/58] xfs_repair: reject regular directory dirents that point to metadata fieles
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-31 23:39   ` [PATCH 37/58] xfs_repair: refactor metadata inode tagging Darrick J. Wong
@ 2023-12-31 23:39   ` Darrick J. Wong
  2023-12-31 23:39   ` [PATCH 39/58] xfs_repair: dont check metadata directory dirent inumbers Darrick J. Wong
                     ` (19 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:39 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If a directory that's in the regular (non-metadata) directory tree has
an entry that points to a metadata file, trash the dirent.  Files are
not allowed to cross between the two trees.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dir2.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)


diff --git a/repair/dir2.c b/repair/dir2.c
index 9f10fde09a1..b4ebcd56d57 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -140,6 +140,7 @@ static bool
 is_meta_ino(
 	struct xfs_mount	*mp,
 	xfs_ino_t		dirino,
+	const struct xfs_dinode	*dip,
 	xfs_ino_t		lino,
 	char			**junkreason)
 {
@@ -155,6 +156,9 @@ is_meta_ino(
 		reason = _("group quota");
 	else if (lino == mp->m_sb.sb_pquotino)
 		reason = _("project quota");
+	else if (dip->di_version >= 3 &&
+		 (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+		reason = _("metadata directory file");
 
 	if (reason)
 		*junkreason = reason;
@@ -252,7 +256,7 @@ process_sf_dir2(
 		} else if (!libxfs_verify_dir_ino(mp, lino)) {
 			junkit = 1;
 			junkreason = _("invalid");
-		} else if (is_meta_ino(mp, ino, lino, &junkreason)) {
+		} else if (is_meta_ino(mp, ino, dip, lino, &junkreason)) {
 			/*
 			 * Directories that are not in the metadir tree should
 			 * not be linking to metadata files.
@@ -714,7 +718,7 @@ process_dir2_data(
 			 * directory since it's still structurally intact.
 			 */
 			clearreason = _("invalid");
-		} else if (is_meta_ino(mp, ino, ent_ino, &clearreason)) {
+		} else if (is_meta_ino(mp, ino, dip, ent_ino, &clearreason)) {
 			/*
 			 * Directories that are not in the metadir tree should
 			 * not be linking to metadata files.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/58] xfs_repair: dont check metadata directory dirent inumbers
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-31 23:39   ` [PATCH 38/58] xfs_repair: reject regular directory dirents that point to metadata fieles Darrick J. Wong
@ 2023-12-31 23:39   ` Darrick J. Wong
  2023-12-31 23:40   ` [PATCH 40/58] xfs_repair: refactor fixing dotdot Darrick J. Wong
                     ` (18 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:39 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Phase 6 always rebuilds the entire metadata directory tree, and repair
quietly ignores all the DIFLAG2_METADATA directory inodes that it finds.
As a result, none of the metadata directories are marked inuse in the
incore data.  Therefore, the is_inode_free checks are not valid for
anything we find in a metadata directory.

Therefore, avoid checking is_inode_free when scanning metadata directory
dirents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    1 +
 repair/dir2.c            |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index ca8e231c0fd..e171755a6f0 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -106,6 +106,7 @@
 #define xfs_dinode_calc_crc		libxfs_dinode_calc_crc
 #define xfs_dinode_good_version		libxfs_dinode_good_version
 #define xfs_dinode_verify		libxfs_dinode_verify
+#define xfs_dinode_verify_metadir	libxfs_dinode_verify_metadir
 
 #define xfs_dir2_data_bestfree_p	libxfs_dir2_data_bestfree_p
 #define xfs_dir2_data_entry_tag_p	libxfs_dir2_data_entry_tag_p
diff --git a/repair/dir2.c b/repair/dir2.c
index b4ebcd56d57..1184392ff47 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -165,6 +165,28 @@ is_meta_ino(
 	return reason != NULL;
 }
 
+static inline bool
+is_metadata_directory(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip)
+{
+	xfs_failaddr_t		fa;
+	uint16_t		mode;
+	uint16_t		flags;
+	uint64_t		flags2;
+
+	if (!xfs_has_metadir(mp))
+		return false;
+	if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+		return false;
+
+	mode = be16_to_cpu(dip->di_mode);
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+	fa = libxfs_dinode_verify_metadir(mp, dip, mode, flags, flags2);
+	return fa == NULL;
+}
+
 /*
  * this routine performs inode discovery and tries to fix things
  * in place.  available redundancy -- inode data size should match
@@ -256,6 +278,12 @@ process_sf_dir2(
 		} else if (!libxfs_verify_dir_ino(mp, lino)) {
 			junkit = 1;
 			junkreason = _("invalid");
+		} else if (is_metadata_directory(mp, dip)) {
+			/*
+			 * Metadata directories are always rebuilt, so don't
+			 * bother checking if the child inode is free or not.
+			 */
+			junkit = 0;
 		} else if (is_meta_ino(mp, ino, dip, lino, &junkreason)) {
 			/*
 			 * Directories that are not in the metadir tree should
@@ -718,6 +746,12 @@ process_dir2_data(
 			 * directory since it's still structurally intact.
 			 */
 			clearreason = _("invalid");
+		} else if (is_metadata_directory(mp, dip)) {
+			/*
+			 * Metadata directories are always rebuilt, so don't
+			 * bother checking if the child inode is free or not.
+			 */
+			clearino = 0;
 		} else if (is_meta_ino(mp, ino, dip, ent_ino, &clearreason)) {
 			/*
 			 * Directories that are not in the metadir tree should


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 40/58] xfs_repair: refactor fixing dotdot
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (38 preceding siblings ...)
  2023-12-31 23:39   ` [PATCH 39/58] xfs_repair: dont check metadata directory dirent inumbers Darrick J. Wong
@ 2023-12-31 23:40   ` Darrick J. Wong
  2023-12-31 23:40   ` [PATCH 41/58] xfs_repair: refactor marking of metadata inodes Darrick J. Wong
                     ` (17 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:40 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pull the code that fixes a directory's dot-dot entry into a separate
helper function so that we can call it on the rootdir and (later) the
metadir.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   96 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 57 insertions(+), 39 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index fe9a4da62dc..65a387aa97f 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -2829,6 +2829,62 @@ dir_hash_add_parent_ptrs(
 	}
 }
 
+/*
+ * If we have to create a .. for /, do it now *before* we delete the bogus
+ * entries, otherwise the directory could transform into a shortform dir which
+ * would probably cause the simulation to choke.  Even if the illegal entries
+ * get shifted around, it's ok because the entries are structurally intact and
+ * in in hash-value order so the simulation won't get confused if it has to
+ * move them around.
+ */
+static void
+fix_dotdot(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	struct xfs_inode	*ip,
+	xfs_ino_t		rootino,
+	const char		*tag,
+	int			*need_dotdot)
+{
+	struct xfs_trans	*tp;
+	int			nres;
+	int			error;
+
+	if (ino != rootino || !*need_dotdot)
+		return;
+
+	if (no_modify) {
+		do_warn(_("would recreate %s directory .. entry\n"), tag);
+		return;
+	}
+
+	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
+
+	do_warn(_("recreating %s directory .. entry\n"), tag);
+
+	nres = libxfs_mkdir_space_res(mp, 2);
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_mkdir, nres, 0, 0, &tp);
+	if (error)
+		res_failed(error);
+
+	libxfs_trans_ijoin(tp, ip, 0);
+
+	error = -libxfs_dir_createname(tp, ip, &xfs_name_dotdot, ip->i_ino,
+			nres);
+	if (error)
+		do_error(
+_("can't make \"..\" entry in %s inode %" PRIu64 ", createname error %d\n"),
+			tag ,ino, error);
+
+	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	error = -libxfs_trans_commit(tp);
+	if (error)
+		do_error(
+_("%s inode \"..\" entry recreation failed (%d)\n"), tag, error);
+
+	*need_dotdot = 0;
+}
+
 /*
  * processes all reachable inodes in directories
  */
@@ -2958,45 +3014,7 @@ _("error %d fixing shortform directory %llu\n"),
 	dir_hash_add_parent_ptrs(ip, hashtab);
 	dir_hash_done(hashtab);
 
-	/*
-	 * if we have to create a .. for /, do it now *before*
-	 * we delete the bogus entries, otherwise the directory
-	 * could transform into a shortform dir which would
-	 * probably cause the simulation to choke.  Even
-	 * if the illegal entries get shifted around, it's ok
-	 * because the entries are structurally intact and in
-	 * in hash-value order so the simulation won't get confused
-	 * if it has to move them around.
-	 */
-	if (!no_modify && need_root_dotdot && ino == mp->m_sb.sb_rootino)  {
-		ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
-
-		do_warn(_("recreating root directory .. entry\n"));
-
-		nres = libxfs_mkdir_space_res(mp, 2);
-		error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_mkdir,
-					    nres, 0, 0, &tp);
-		if (error)
-			res_failed(error);
-
-		libxfs_trans_ijoin(tp, ip, 0);
-
-		error = -libxfs_dir_createname(tp, ip, &xfs_name_dotdot,
-					ip->i_ino, nres);
-		if (error)
-			do_error(
-	_("can't make \"..\" entry in root inode %" PRIu64 ", createname error %d\n"), ino, error);
-
-		libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		error = -libxfs_trans_commit(tp);
-		if (error)
-			do_error(
-	_("root inode \"..\" entry recreation failed (%d)\n"), error);
-
-		need_root_dotdot = 0;
-	} else if (need_root_dotdot && ino == mp->m_sb.sb_rootino)  {
-		do_warn(_("would recreate root directory .. entry\n"));
-	}
+	fix_dotdot(mp, ino, ip, mp->m_sb.sb_rootino, "root", &need_root_dotdot);
 
 	/*
 	 * if we need to create the '.' entry, do so only if


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 41/58] xfs_repair: refactor marking of metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (39 preceding siblings ...)
  2023-12-31 23:40   ` [PATCH 40/58] xfs_repair: refactor fixing dotdot Darrick J. Wong
@ 2023-12-31 23:40   ` Darrick J. Wong
  2023-12-31 23:40   ` [PATCH 42/58] xfs_repair: refactor root directory initialization Darrick J. Wong
                     ` (16 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:40 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor the mechanics of marking a metadata inode into a helper
function so that we don't have to open-code that for every single
metadata inode.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   76 ++++++++++++++++++++-----------------------------------
 1 file changed, 28 insertions(+), 48 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index 65a387aa97f..e01e81ae014 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -3071,6 +3071,22 @@ _("error %d fixing shortform directory %llu\n"),
 	libxfs_irele(ip);
 }
 
+static void
+mark_inode(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct ino_tree_node	*irec;
+	int			offset;
+
+	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGINO(mp, ino));
+
+	offset = XFS_INO_TO_AGINO(mp, ino) - irec->ino_startnum;
+
+	add_inode_reached(irec, offset);
+}
+
 /*
  * mark realtime bitmap and summary inodes as reached.
  * quota inode will be marked here as well
@@ -3078,54 +3094,18 @@ _("error %d fixing shortform directory %llu\n"),
 static void
 mark_standalone_inodes(xfs_mount_t *mp)
 {
-	ino_tree_node_t		*irec;
-	int			offset;
-
-	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rbmino),
-			XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rbmino));
-
-	offset = XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rbmino) -
-			irec->ino_startnum;
-
-	add_inode_reached(irec, offset);
-
-	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rsumino),
-			XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rsumino));
-
-	offset = XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rsumino) -
-			irec->ino_startnum;
-
-	add_inode_reached(irec, offset);
-
-	if (fs_quotas)  {
-		if (mp->m_sb.sb_uquotino
-				&& mp->m_sb.sb_uquotino != NULLFSINO)  {
-			irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp,
-						mp->m_sb.sb_uquotino),
-				XFS_INO_TO_AGINO(mp, mp->m_sb.sb_uquotino));
-			offset = XFS_INO_TO_AGINO(mp, mp->m_sb.sb_uquotino)
-					- irec->ino_startnum;
-			add_inode_reached(irec, offset);
-		}
-		if (mp->m_sb.sb_gquotino
-				&& mp->m_sb.sb_gquotino != NULLFSINO)  {
-			irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp,
-						mp->m_sb.sb_gquotino),
-				XFS_INO_TO_AGINO(mp, mp->m_sb.sb_gquotino));
-			offset = XFS_INO_TO_AGINO(mp, mp->m_sb.sb_gquotino)
-					- irec->ino_startnum;
-			add_inode_reached(irec, offset);
-		}
-		if (mp->m_sb.sb_pquotino
-				&& mp->m_sb.sb_pquotino != NULLFSINO)  {
-			irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp,
-						mp->m_sb.sb_pquotino),
-				XFS_INO_TO_AGINO(mp, mp->m_sb.sb_pquotino));
-			offset = XFS_INO_TO_AGINO(mp, mp->m_sb.sb_pquotino)
-					- irec->ino_startnum;
-			add_inode_reached(irec, offset);
-		}
-	}
+	mark_inode(mp, mp->m_sb.sb_rbmino);
+	mark_inode(mp, mp->m_sb.sb_rsumino);
+
+	if (!fs_quotas)
+		return;
+
+	if (mp->m_sb.sb_uquotino && mp->m_sb.sb_uquotino != NULLFSINO)
+		mark_inode(mp, mp->m_sb.sb_uquotino);
+	if (mp->m_sb.sb_gquotino && mp->m_sb.sb_gquotino != NULLFSINO)
+		mark_inode(mp, mp->m_sb.sb_gquotino);
+	if (mp->m_sb.sb_pquotino && mp->m_sb.sb_pquotino != NULLFSINO)
+		mark_inode(mp, mp->m_sb.sb_pquotino);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 42/58] xfs_repair: refactor root directory initialization
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (40 preceding siblings ...)
  2023-12-31 23:40   ` [PATCH 41/58] xfs_repair: refactor marking of metadata inodes Darrick J. Wong
@ 2023-12-31 23:40   ` Darrick J. Wong
  2023-12-31 23:40   ` [PATCH 43/58] xfs_repair: refactor grabbing realtime metadata inodes Darrick J. Wong
                     ` (15 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:40 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Refactor root directory initialization into a separate function we can
call for both the root dir and the metadir.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   63 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 23 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index e01e81ae014..017c46f43af 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -773,27 +773,27 @@ mk_rsumino(xfs_mount_t *mp)
 	libxfs_irele(ip);
 }
 
-/*
- * makes a new root directory.
- */
-static void
-mk_root_dir(xfs_mount_t *mp)
+/* Initialize a root directory. */
+static int
+init_fs_root_dir(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	mode_t			mode,
+	struct xfs_inode	**ipp)
 {
-	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
-	int		i;
-	int		error;
-	const mode_t	mode = 0755;
-	ino_tree_node_t	*irec;
+	struct xfs_trans	*tp;
+	struct xfs_inode	*ip = NULL;
+	struct ino_tree_node	*irec;
+	int			error;
 
-	ip = NULL;
-	i = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 10, 0, 0, &tp);
-	if (i)
-		res_failed(i);
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 10, 0, 0, &tp);
+	if (error)
+		return error;
 
-	error = -libxfs_iget(mp, tp, mp->m_sb.sb_rootino, 0, &ip);
+	error = -libxfs_iget(mp, tp, ino, 0, &ip);
 	if (error) {
-		do_error(_("could not iget root inode -- error - %d\n"), error);
+		libxfs_trans_cancel(tp);
+		return error;
 	}
 
 	/* Reset the root directory. */
@@ -802,14 +802,31 @@ mk_root_dir(xfs_mount_t *mp)
 
 	error = -libxfs_trans_commit(tp);
 	if (error)
-		do_error(_("%s: commit failed, error %d\n"), __func__, error);
+		return error;
+
+	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, ino),
+				XFS_INO_TO_AGINO(mp, ino));
+	set_inode_isadir(irec, XFS_INO_TO_AGINO(mp, ino) - irec->ino_startnum);
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * makes a new root directory.
+ */
+static void
+mk_root_dir(xfs_mount_t *mp)
+{
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	error = init_fs_root_dir(mp, mp->m_sb.sb_rootino, 0755, &ip);
+	if (error)
+		do_error(
+	_("Could not reinitialize root directory inode, error %d\n"),
+			error);
 
 	libxfs_irele(ip);
-
-	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
-				XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
-	set_inode_isadir(irec, XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino) -
-				irec->ino_startnum);
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 43/58] xfs_repair: refactor grabbing realtime metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (41 preceding siblings ...)
  2023-12-31 23:40   ` [PATCH 42/58] xfs_repair: refactor root directory initialization Darrick J. Wong
@ 2023-12-31 23:40   ` Darrick J. Wong
  2023-12-31 23:41   ` [PATCH 44/58] xfs_repair: check metadata inode flag Darrick J. Wong
                     ` (14 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:40 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a helper function to grab a realtime metadata inode.  When
metadir arrives, the bitmap and summary inodes can float, so we'll
turn this function into a "load or allocate" function.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   90 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 37 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index 017c46f43af..afb09ff7232 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -471,18 +471,37 @@ reset_root_ino(
 	libxfs_inode_init(tp, &args, ip);
 }
 
+/* Load a realtime metadata inode from disk and reset it. */
+static int
+ensure_rtino(
+	struct xfs_trans		*tp,
+	xfs_ino_t			ino,
+	struct xfs_inode		**ipp)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	int				error;
+
+	error = -libxfs_iget(mp, tp, ino, 0, ipp);
+	if (error)
+		return error;
+
+	reset_root_ino(tp, S_IFREG, *ipp);
+	return 0;
+}
+
 static void
-mk_rbmino(xfs_mount_t *mp)
+mk_rbmino(
+	struct xfs_mount	*mp)
 {
-	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
-	xfs_bmbt_irec_t	*ep;
-	int		i;
-	int		nmap;
-	int		error;
-	xfs_fileoff_t	bno;
-	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];
-	uint		blocks;
+	struct xfs_trans	*tp;
+	struct xfs_inode	*ip;
+	struct xfs_bmbt_irec	*ep;
+	int			i;
+	int			nmap;
+	int			error;
+	xfs_fileoff_t		bno;
+	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
+	uint			blocks;
 
 	/*
 	 * first set up inode
@@ -491,15 +510,13 @@ mk_rbmino(xfs_mount_t *mp)
 	if (i)
 		res_failed(i);
 
-	error = -libxfs_iget(mp, tp, mp->m_sb.sb_rbmino, 0, &ip);
-	if (error) {
-		do_error(
-		_("couldn't iget realtime bitmap inode -- error - %d\n"),
-			error);
-	}
-
 	/* Reset the realtime bitmap inode. */
-	reset_root_ino(tp, S_IFREG, ip);
+	error = ensure_rtino(tp, mp->m_sb.sb_rbmino, &ip);
+	if (error) {
+		do_error(
+		_("couldn't iget realtime bitmap inode -- error - %d\n"),
+			error);
+	}
 	ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
 	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = -libxfs_trans_commit(tp);
@@ -700,18 +717,19 @@ _("can't access block %" PRIu64 " (fsbno %" PRIu64 ") of realtime summary inode
 }
 
 static void
-mk_rsumino(xfs_mount_t *mp)
+mk_rsumino(
+	struct xfs_mount	*mp)
 {
-	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
-	xfs_bmbt_irec_t	*ep;
-	int		i;
-	int		nmap;
-	int		error;
-	int		nsumblocks;
-	xfs_fileoff_t	bno;
-	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];
-	uint		blocks;
+	struct xfs_trans	*tp;
+	struct xfs_inode	*ip;
+	struct xfs_bmbt_irec	*ep;
+	int			i;
+	int			nmap;
+	int			error;
+	int			nsumblocks;
+	xfs_fileoff_t		bno;
+	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
+	uint			blocks;
 
 	/*
 	 * first set up inode
@@ -720,15 +738,13 @@ mk_rsumino(xfs_mount_t *mp)
 	if (i)
 		res_failed(i);
 
-	error = -libxfs_iget(mp, tp, mp->m_sb.sb_rsumino, 0, &ip);
-	if (error) {
-		do_error(
-		_("couldn't iget realtime summary inode -- error - %d\n"),
-			error);
-	}
-
 	/* Reset the rt summary inode. */
-	reset_root_ino(tp, S_IFREG, ip);
+	error = ensure_rtino(tp, mp->m_sb.sb_rsumino, &ip);
+	if (error) {
+		do_error(
+		_("couldn't iget realtime summary inode -- error - %d\n"),
+			error);
+	}
 	ip->i_disk_size = mp->m_rsumsize;
 	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = -libxfs_trans_commit(tp);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 44/58] xfs_repair: check metadata inode flag
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (42 preceding siblings ...)
  2023-12-31 23:40   ` [PATCH 43/58] xfs_repair: refactor grabbing realtime metadata inodes Darrick J. Wong
@ 2023-12-31 23:41   ` Darrick J. Wong
  2023-12-31 23:41   ` [PATCH 45/58] xfs_repair: rebuild the metadata directory Darrick J. Wong
                     ` (13 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:41 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check whether or not the metadata inode flag is set appropriately.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)


diff --git a/repair/dinode.c b/repair/dinode.c
index f2da4325d5e..4af7c91d5c9 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -2669,6 +2669,20 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 			}
 		}
 
+		if (flags2 & XFS_DIFLAG2_METADIR) {
+			xfs_failaddr_t	fa;
+
+			fa = libxfs_dinode_verify_metadir(mp, dino, di_mode,
+					be16_to_cpu(dino->di_flags), flags2);
+			if (fa) {
+				if (!uncertain)
+					do_warn(
+	_("inode %" PRIu64 " is incorrectly marked as metadata\n"),
+						lino);
+				goto clear_bad_out;
+			}
+		}
+
 		if ((flags2 & XFS_DIFLAG2_REFLINK) &&
 		    !xfs_has_reflink(mp)) {
 			if (!uncertain) {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 45/58] xfs_repair: rebuild the metadata directory
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (43 preceding siblings ...)
  2023-12-31 23:41   ` [PATCH 44/58] xfs_repair: check metadata inode flag Darrick J. Wong
@ 2023-12-31 23:41   ` Darrick J. Wong
  2023-12-31 23:41   ` [PATCH 46/58] xfs_repair: don't let metadata and regular files mix Darrick J. Wong
                     ` (12 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:41 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check the metadata directory for problems and rebuild it if necessary.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    2 
 repair/dino_chunks.c     |   12 ++
 repair/dir2.c            |   25 +++
 repair/globals.c         |    3 
 repair/globals.h         |    3 
 repair/phase1.c          |    2 
 repair/phase2.c          |    7 +
 repair/phase4.c          |   16 ++
 repair/phase6.c          |  361 ++++++++++++++++++++++++++++++++++++++++++----
 repair/pptr.c            |   51 ++++++
 repair/pptr.h            |    2 
 repair/sb.c              |    3 
 repair/xfs_repair.c      |   81 ++++++++++
 13 files changed, 526 insertions(+), 42 deletions(-)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index e171755a6f0..5f2250ac5e2 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -188,7 +188,7 @@
 #define xfs_imeta_link_space_res	libxfs_imeta_link_space_res
 #define xfs_imeta_lookup		libxfs_imeta_lookup
 #define xfs_imeta_mount			libxfs_imeta_mount
-#define xfs_imeta_set_metaflag		libxfs_imeta_set_metaflag
+#define xfs_imeta_set_iflag		libxfs_imeta_set_iflag
 #define xfs_imeta_start_create		libxfs_imeta_start_create
 #define xfs_imeta_start_link		libxfs_imeta_start_link
 #define xfs_imeta_start_unlink		libxfs_imeta_start_unlink
diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index 19536133451..479dc9db760 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -932,6 +932,18 @@ process_inode_chunk(
 	_("would clear root inode %" PRIu64 "\n"),
 						ino);
 				}
+			} else if (mp->m_sb.sb_metadirino == ino) {
+				need_metadir_inode = true;
+
+				if (!no_modify)  {
+					do_warn(
+	_("cleared metadata directory %" PRIu64 "\n"),
+						ino);
+				} else  {
+					do_warn(
+	_("would clear metadata directory %" PRIu64 "\n"),
+						ino);
+				}
 			} else if (mp->m_sb.sb_rbmino == ino) {
 				need_rbmino = 1;
 
diff --git a/repair/dir2.c b/repair/dir2.c
index 1184392ff47..a7f5018fba2 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -146,6 +146,10 @@ is_meta_ino(
 {
 	char			*reason = NULL;
 
+	/* in metadir land we don't have static metadata inodes anymore */
+	if (xfs_has_metadir(mp))
+		return false;
+
 	if (lino == mp->m_sb.sb_rbmino)
 		reason = _("realtime bitmap");
 	else if (lino == mp->m_sb.sb_rsumino)
@@ -160,6 +164,16 @@ is_meta_ino(
 		 (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
 		reason = _("metadata directory file");
 
+	if (xfs_has_metadir(mp) &&
+	    dirino == mp->m_sb.sb_metadirino) {
+		if (reason == NULL) {
+			/* no regular files in the metadir */
+			*junkreason = _("non-metadata inode");
+			return true;
+		}
+		return false;
+	}
+
 	if (reason)
 		*junkreason = reason;
 	return reason != NULL;
@@ -583,7 +597,8 @@ _("corrected root directory %" PRIu64 " .. entry, was %" PRIu64 ", now %" PRIu64
 _("would have corrected root directory %" PRIu64 " .. entry from %" PRIu64" to %" PRIu64 "\n"),
 				ino, *parent, ino);
 		}
-	} else if (ino == *parent && ino != mp->m_sb.sb_rootino)  {
+	} else if (ino == *parent && ino != mp->m_sb.sb_rootino &&
+		   ino != mp->m_sb.sb_metadirino)  {
 		/*
 		 * likewise, non-root directories can't have .. pointing
 		 * to .
@@ -879,7 +894,8 @@ _("entry at block %u offset %" PRIdPTR " in directory inode %" PRIu64 " has ille
 				 * NULLFSINO otherwise.
 				 */
 				if (ino == ent_ino &&
-						ino != mp->m_sb.sb_rootino) {
+				    ino != mp->m_sb.sb_rootino &&
+				    ino != mp->m_sb.sb_metadirino) {
 					*parent = NULLFSINO;
 					do_warn(
 _("bad .. entry in directory inode %" PRIu64 ", points to self: "),
@@ -1520,9 +1536,14 @@ process_dir2(
 	} else if (dotdot == 0 && ino == mp->m_sb.sb_rootino) {
 		do_warn(_("no .. entry for root directory %" PRIu64 "\n"), ino);
 		need_root_dotdot = 1;
+	} else if (dotdot == 0 && ino == mp->m_sb.sb_metadirino) {
+		do_warn(_("no .. entry for metaino directory %" PRIu64 "\n"), ino);
+		need_metadir_dotdot = 1;
 	}
 
 	ASSERT((ino != mp->m_sb.sb_rootino && ino != *parent) ||
+		(ino == mp->m_sb.sb_metadirino &&
+			(ino == *parent || need_metadir_dotdot == 1)) ||
 		(ino == mp->m_sb.sb_rootino &&
 			(ino == *parent || need_root_dotdot == 1)));
 
diff --git a/repair/globals.c b/repair/globals.c
index 7d95e210e8e..22cb096c6a4 100644
--- a/repair/globals.c
+++ b/repair/globals.c
@@ -69,6 +69,9 @@ int	fs_is_dirty;
 int	need_root_inode;
 int	need_root_dotdot;
 
+bool	need_metadir_inode;
+int	need_metadir_dotdot;
+
 int	need_rbmino;
 int	need_rsumino;
 
diff --git a/repair/globals.h b/repair/globals.h
index 71a64b94365..c3709f11874 100644
--- a/repair/globals.h
+++ b/repair/globals.h
@@ -110,6 +110,9 @@ extern int		fs_is_dirty;
 extern int		need_root_inode;
 extern int		need_root_dotdot;
 
+extern bool		need_metadir_inode;
+extern int		need_metadir_dotdot;
+
 extern int		need_rbmino;
 extern int		need_rsumino;
 
diff --git a/repair/phase1.c b/repair/phase1.c
index 00b98584eed..40e7f164c55 100644
--- a/repair/phase1.c
+++ b/repair/phase1.c
@@ -48,6 +48,8 @@ phase1(xfs_mount_t *mp)
 	primary_sb_modified = 0;
 	need_root_inode = 0;
 	need_root_dotdot = 0;
+	need_metadir_inode = false;
+	need_metadir_dotdot = 0;
 	need_rbmino = 0;
 	need_rsumino = 0;
 	lost_quotas = 0;
diff --git a/repair/phase2.c b/repair/phase2.c
index a58fa7d8a7b..5a08cbc31c6 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -646,8 +646,11 @@ phase2(
 	 * make sure we know about the root inode chunk
 	 */
 	if ((ino_rec = find_inode_rec(mp, 0, mp->m_sb.sb_rootino)) == NULL)  {
-		ASSERT(mp->m_sb.sb_rbmino == mp->m_sb.sb_rootino + 1 &&
-			mp->m_sb.sb_rsumino == mp->m_sb.sb_rootino + 2);
+		ASSERT(!xfs_has_metadir(mp) ||
+		       mp->m_sb.sb_metadirino == mp->m_sb.sb_rootino + 1);
+		ASSERT(xfs_has_metadir(mp) ||
+		       (mp->m_sb.sb_rbmino == mp->m_sb.sb_rootino + 1 &&
+			mp->m_sb.sb_rsumino == mp->m_sb.sb_rootino + 2));
 		do_warn(_("root inode chunk not found\n"));
 
 		/*
diff --git a/repair/phase4.c b/repair/phase4.c
index 5e5d8c3c7d9..f004111ea4e 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -264,6 +264,22 @@ phase4(xfs_mount_t *mp)
 			do_warn(_("root inode lost\n"));
 	}
 
+	/*
+	 * If metadata directory trees are enabled, the metadata root directory
+	 * always comes immediately after the regular root directory, even if
+	 * it's free.
+	 */
+	if (xfs_has_metadir(mp) &&
+	    (is_inode_free(irec, 1) || !inode_isadir(irec, 1))) {
+		need_metadir_inode = true;
+		if (no_modify)
+			do_warn(
+	_("metadata directory root inode would be lost\n"));
+		else
+			do_warn(
+	_("metadata directory root inode lost\n"));
+	}
+
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
 		ag_end = (i < mp->m_sb.sb_agcount - 1) ? mp->m_sb.sb_agblocks :
 			mp->m_sb.sb_dblocks -
diff --git a/repair/phase6.c b/repair/phase6.c
index afb09ff7232..05e0b8ac593 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -471,6 +471,144 @@ reset_root_ino(
 	libxfs_inode_init(tp, &args, ip);
 }
 
+/* Mark a newly allocated inode in use in the incore bitmap. */
+static void
+mark_ino_inuse(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	int			mode,
+	xfs_ino_t		parent)
+{
+	struct ino_tree_node	*irec;
+	int			ino_offset;
+	int			i;
+
+	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGINO(mp, ino));
+
+	if (irec == NULL) {
+		/*
+		 * This inode is allocated from a newly created inode
+		 * chunk and therefore did not exist when inode chunks
+		 * were processed in phase3. Add this group of inodes to
+		 * the entry avl tree as if they were discovered in phase3.
+		 */
+		irec = set_inode_free_alloc(mp,
+				XFS_INO_TO_AGNO(mp, ino),
+				XFS_INO_TO_AGINO(mp, ino));
+		alloc_ex_data(irec);
+
+		for (i = 0; i < XFS_INODES_PER_CHUNK; i++)
+			set_inode_free(irec, i);
+	}
+
+	ino_offset = get_inode_offset(mp, ino, irec);
+
+	/*
+	 * Mark the inode allocated so it is not skipped in phase 7.  We'll
+	 * find it with the directory traverser soon, so we don't need to
+	 * mark it reached.
+	 */
+	set_inode_used(irec, ino_offset);
+	set_inode_ftype(irec, ino_offset, libxfs_mode_to_ftype(mode));
+	set_inode_parent(irec, ino_offset, parent);
+	if (S_ISDIR(mode))
+		set_inode_isadir(irec, ino_offset);
+}
+
+/* Make sure this metadata directory path exists. */
+static int
+ensure_imeta_dirpath(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_path		temp_path = {
+		.im_path		= path->im_path,
+		.im_depth		= 1,
+		.im_ftype		= XFS_DIR3_FT_DIR,
+	};
+	struct xfs_trans		*tp;
+	unsigned int			i;
+	xfs_ino_t			parent;
+	int				error;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	error = -libxfs_imeta_ensure_dirpath(mp, path);
+	if (error)
+		return error;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	/* Mark all directories in this path as inuse. */
+	parent = mp->m_metadirip->i_ino;
+	for (i = 0; i < path->im_depth - 1; i++, temp_path.im_depth++) {
+		xfs_ino_t		ino;
+
+		error = -libxfs_imeta_lookup(tp, &temp_path, &ino);
+		if (error)
+			break;
+		if (ino == NULLFSINO) {
+			error = ENOENT;
+			break;
+		}
+
+		mark_ino_inuse(mp, ino, S_IFDIR, parent);
+		parent = ino;
+	}
+
+	libxfs_trans_cancel(tp);
+	return error;
+}
+
+/* Look up the parent of this path. */
+static xfs_ino_t
+lookup_imeta_path_dirname(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_path		temp_path = {
+		.im_path		= path->im_path,
+		.im_depth		= path->im_depth - 1,
+		.im_ftype		= XFS_DIR3_FT_DIR,
+	};
+	struct xfs_trans		*tp;
+	xfs_ino_t			ino;
+	int				error;
+
+	if (!xfs_has_metadir(mp))
+		return NULLFSINO;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return NULLFSINO;
+	error = -libxfs_imeta_lookup(tp, &temp_path, &ino);
+	libxfs_trans_cancel(tp);
+	if (error)
+		return NULLFSINO;
+
+	return ino;
+}
+
+static inline bool
+is_inode_inuse(
+	struct xfs_mount	*mp,
+	xfs_ino_t		inum)
+{
+	struct ino_tree_node	*irec;
+	int			ino_offset;
+
+	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, inum),
+				XFS_INO_TO_AGINO(mp, inum));
+	if (!irec)
+		return false;
+	ino_offset = XFS_INO_TO_AGINO(mp, inum) - irec->ino_startnum;
+	return !is_inode_free(irec, ino_offset);
+}
+
 /* Load a realtime metadata inode from disk and reset it. */
 static int
 ensure_rtino(
@@ -489,12 +627,95 @@ ensure_rtino(
 	return 0;
 }
 
+/*
+ * Either link the old rtbitmap/summary inode into the (reinitialized) metadata
+ * directory tree, or create new ones.
+ */
+static void
+ensure_rtino_metadir(
+	struct xfs_mount		*mp,
+	const struct xfs_imeta_path	*path,
+	xfs_ino_t			*inop,
+	struct xfs_inode		**ipp,
+	struct xfs_imeta_update		*upd)
+{
+	struct xfs_trans		*tp;
+	xfs_ino_t			ino = *inop;
+	int				error;
+
+	/*
+	 * If the incore inode pointer is null or points to an inode that is
+	 * not allocated, we need to create a new file.
+	 */
+	if (ino == NULLFSINO || !is_inode_inuse(mp, ino)) {
+		error = -libxfs_imeta_start_create(mp, path, upd);
+		if (error)
+			do_error(
+ _("failed to allocate resources to recreate rt metadata inode, error %d\n"),
+					error);
+
+		/* Allocate a new inode. */
+		error = -libxfs_imeta_create(upd, S_IFREG, ipp);
+		if (error)
+			do_error(
+ _("couldn't create new rt metadata inode, error %d\n"), error);
+
+		ASSERT(*inop == upd->ip->i_ino);
+
+		mark_ino_inuse(mp, upd->ip->i_ino, S_IFREG,
+				lookup_imeta_path_dirname(mp, path));
+		return;
+	}
+
+	/*
+	 * We found the old rt metadata file and it looks ok.  Link it into
+	 * the metadata directory tree.  Null out the superblock pointer before
+	 * we re-link this file into it.
+	 */
+	*inop = NULLFSINO;
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		do_error(
+ _("failed to allocate trans to iget rt metadata inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+	error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, ipp);
+	libxfs_trans_cancel(tp);
+	if (error)
+		do_error(
+ _("failed to iget rt metadata inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+
+	/*
+	 * Since we're reattaching this file to the metadata directory tree,
+	 * try to remove all the parent pointers that might be attached.
+	 */
+	try_erase_parent_ptrs(*ipp);
+
+	error = -libxfs_imeta_start_link(mp, path, *ipp, upd);
+	if (error)
+		do_error(
+ _("failed to allocate resources to reinsert rt metadata inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+
+	error = -libxfs_imeta_link(upd);
+	if (error)
+		do_error(
+ _("failed to link rt metadata inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+
+	/* Reset the link count to something sane. */
+	set_nlink(VFS_I(upd->ip), 1);
+	libxfs_trans_log_inode(upd->tp, upd->ip, XFS_ILOG_CORE);
+}
+
 static void
 mk_rbmino(
 	struct xfs_mount	*mp)
 {
-	struct xfs_trans	*tp;
-	struct xfs_inode	*ip;
+	struct xfs_imeta_update	upd = { };
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
 	struct xfs_bmbt_irec	*ep;
 	int			i;
 	int			nmap;
@@ -503,23 +724,32 @@ mk_rbmino(
 	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
 	uint			blocks;
 
-	/*
-	 * first set up inode
-	 */
-	i = -libxfs_trans_alloc_rollable(mp, 10, &tp);
-	if (i)
-		res_failed(i);
-
 	/* Reset the realtime bitmap inode. */
-	error = ensure_rtino(tp, mp->m_sb.sb_rbmino, &ip);
-	if (error) {
-		do_error(
-		_("couldn't iget realtime bitmap inode -- error - %d\n"),
-			error);
+	if (!xfs_has_metadir(mp)) {
+		i = -libxfs_trans_alloc_rollable(mp, 10, &upd.tp);
+		if (i)
+			res_failed(i);
+
+		error = ensure_rtino(upd.tp, mp->m_sb.sb_rbmino, &ip);
+		if (error) {
+			do_error(
+ _("couldn't iget realtime bitmap inode -- error - %d\n"),
+				error);
+		}
+	} else {
+		error = ensure_imeta_dirpath(mp, &XFS_IMETA_RTBITMAP);
+		if (error)
+			do_error(
+ _("Couldn't create realtime metadata directory, error %d\n"), error);
+
+		ensure_rtino_metadir(mp, &XFS_IMETA_RTBITMAP,
+				&mp->m_sb.sb_rbmino, &ip, &upd);
 	}
+
 	ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
-	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = -libxfs_trans_commit(tp);
+	libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE);
+
+	error = -libxfs_imeta_commit_update(&upd);
 	if (error)
 		do_error(_("%s: commit failed, error %d\n"), __func__, error);
 
@@ -720,8 +950,9 @@ static void
 mk_rsumino(
 	struct xfs_mount	*mp)
 {
-	struct xfs_trans	*tp;
-	struct xfs_inode	*ip;
+	struct xfs_imeta_update	upd = { };
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
 	struct xfs_bmbt_irec	*ep;
 	int			i;
 	int			nmap;
@@ -731,23 +962,32 @@ mk_rsumino(
 	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
 	uint			blocks;
 
-	/*
-	 * first set up inode
-	 */
-	i = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 10, 0, 0, &tp);
-	if (i)
-		res_failed(i);
+	/* Reset the realtime summary inode. */
+	if (!xfs_has_metadir(mp)) {
+		i = -libxfs_trans_alloc_rollable(mp, 10, &upd.tp);
+		if (i)
+			res_failed(i);
 
-	/* Reset the rt summary inode. */
-	error = ensure_rtino(tp, mp->m_sb.sb_rsumino, &ip);
-	if (error) {
-		do_error(
-		_("couldn't iget realtime summary inode -- error - %d\n"),
-			error);
+		error = ensure_rtino(upd.tp, mp->m_sb.sb_rsumino, &ip);
+		if (error) {
+			do_error(
+ _("couldn't iget realtime summary inode -- error - %d\n"),
+				error);
+		}
+	} else {
+		error = ensure_imeta_dirpath(mp, &XFS_IMETA_RTSUMMARY);
+		if (error)
+			do_error(
+ _("Couldn't create realtime metadata directory, error %d\n"), error);
+
+		ensure_rtino_metadir(mp, &XFS_IMETA_RTSUMMARY,
+				&mp->m_sb.sb_rsumino, &ip, &upd);
 	}
+
 	ip->i_disk_size = mp->m_rsumsize;
-	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = -libxfs_trans_commit(tp);
+	libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE);
+
+	error = -libxfs_imeta_commit_update(&upd);
 	if (error)
 		do_error(_("%s: commit failed, error %d\n"), __func__, error);
 
@@ -845,6 +1085,36 @@ mk_root_dir(xfs_mount_t *mp)
 	libxfs_irele(ip);
 }
 
+/* Create a new metadata directory root. */
+static void
+mk_metadir(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = init_fs_root_dir(mp, mp->m_sb.sb_metadirino, 0,
+			&mp->m_metadirip);
+	if (error)
+		do_error(
+	_("Initialization of the metadata root directory failed, error %d\n"),
+			error);
+
+	/* Mark the new metadata root dir as metadata. */
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
+		do_error(
+	_("Marking metadata root directory failed"));
+
+	libxfs_trans_ijoin(tp, mp->m_metadirip, 0);
+	libxfs_imeta_set_iflag(tp, mp->m_metadirip);
+
+	error = -libxfs_trans_commit(tp);
+	if (error)
+		do_error(
+	_("Marking metadata root directory failed, error %d\n"), error);
+}
+
 /*
  * orphanage name == lost+found
  */
@@ -1354,6 +1624,8 @@ longform_dir2_rebuild(
 
 	if (ino == mp->m_sb.sb_rootino)
 		need_root_dotdot = 0;
+	else if (ino == mp->m_sb.sb_metadirino)
+		need_metadir_dotdot = 0;
 
 	/* go through the hash list and re-add the inodes */
 
@@ -2973,7 +3245,7 @@ process_dir_inode(
 
 	need_dot = dirty = num_illegal = 0;
 
-	if (mp->m_sb.sb_rootino == ino)  {
+	if (mp->m_sb.sb_rootino == ino || mp->m_sb.sb_metadirino == ino) {
 		/*
 		 * mark root inode reached and bump up
 		 * link count for root inode to account
@@ -3048,6 +3320,9 @@ _("error %d fixing shortform directory %llu\n"),
 	dir_hash_done(hashtab);
 
 	fix_dotdot(mp, ino, ip, mp->m_sb.sb_rootino, "root", &need_root_dotdot);
+	if (xfs_has_metadir(mp))
+		fix_dotdot(mp, ino, ip, mp->m_sb.sb_metadirino, "metadata",
+				&need_metadir_dotdot);
 
 	/*
 	 * if we need to create the '.' entry, do so only if
@@ -3127,6 +3402,15 @@ mark_inode(
 static void
 mark_standalone_inodes(xfs_mount_t *mp)
 {
+	if (xfs_has_metadir(mp)) {
+		/*
+		 * The directory connectivity scanner will pick up the metadata
+		 * inode directory, which will mark the rest of the metadata
+		 * inodes.
+		 */
+		return;
+	}
+
 	mark_inode(mp, mp->m_sb.sb_rbmino);
 	mark_inode(mp, mp->m_sb.sb_rsumino);
 
@@ -3305,6 +3589,17 @@ phase6(xfs_mount_t *mp)
 		}
 	}
 
+	if (need_metadir_inode) {
+		if (!no_modify)  {
+			do_warn(_("reinitializing metadata root directory\n"));
+			mk_metadir(mp);
+			need_metadir_inode = false;
+			need_metadir_dotdot = 0;
+		} else  {
+			do_warn(_("would reinitialize metadata root directory\n"));
+		}
+	}
+
 	if (need_rbmino)  {
 		if (!no_modify)  {
 			do_warn(_("reinitializing realtime bitmap inode\n"));
diff --git a/repair/pptr.c b/repair/pptr.c
index 77f49dbcb84..f554919d525 100644
--- a/repair/pptr.c
+++ b/repair/pptr.c
@@ -1301,3 +1301,54 @@ check_parent_ptrs(
 
 	destroy_work_queue(&wq);
 }
+
+static int
+erase_pptrs(
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	struct xfs_da_args	args = {
+		.dp		= ip,
+		.attr_filter	= attr_flags,
+		.name		= name,
+		.namelen	= namelen,
+		.value		= (void *)value,
+		.valuelen	= valuelen,
+		.op_flags	= XFS_DA_OP_REMOVE | XFS_DA_OP_NVLOOKUP,
+	};
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = -libxfs_attr_set(&args);
+	if (error)
+		do_warn(
+_("removing ino %llu parent pointer failed: %s\n"),
+				(unsigned long long)ip->i_ino,
+				strerror(error));
+
+	return 0;
+}
+
+/* Delete all of this file's parent pointers if we can. */
+void
+try_erase_parent_ptrs(
+	struct xfs_inode	*ip)
+{
+	int			error;
+
+	if (!xfs_has_parent(ip->i_mount))
+		return;
+
+	error = xattr_walk(ip, erase_pptrs, NULL);
+	if (error)
+		do_warn(_("ino %llu parent pointer erasure failed: %s\n"),
+				(unsigned long long)ip->i_ino,
+				strerror(error));
+}
diff --git a/repair/pptr.h b/repair/pptr.h
index f5ffcc137e3..9c9f244cd68 100644
--- a/repair/pptr.h
+++ b/repair/pptr.h
@@ -14,4 +14,6 @@ void add_parent_ptr(xfs_ino_t ino, const unsigned char *fname,
 
 void check_parent_ptrs(struct xfs_mount *mp);
 
+void try_erase_parent_ptrs(struct xfs_inode *ip);
+
 #endif /* __REPAIR_PPTR_H__ */
diff --git a/repair/sb.c b/repair/sb.c
index faf79d9d083..5f2a08136d7 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -28,6 +28,7 @@ copy_sb(xfs_sb_t *source, xfs_sb_t *dest)
 	xfs_ino_t	uquotino;
 	xfs_ino_t	gquotino;
 	xfs_ino_t	pquotino;
+	xfs_ino_t	metadirino;
 	uint16_t	versionnum;
 
 	rootino = dest->sb_rootino;
@@ -36,6 +37,7 @@ copy_sb(xfs_sb_t *source, xfs_sb_t *dest)
 	uquotino = dest->sb_uquotino;
 	gquotino = dest->sb_gquotino;
 	pquotino = dest->sb_pquotino;
+	metadirino = dest->sb_metadirino;
 
 	versionnum = dest->sb_versionnum;
 
@@ -47,6 +49,7 @@ copy_sb(xfs_sb_t *source, xfs_sb_t *dest)
 	dest->sb_uquotino = uquotino;
 	dest->sb_gquotino = gquotino;
 	dest->sb_pquotino = pquotino;
+	dest->sb_metadirino = metadirino;
 
 	dest->sb_versionnum = versionnum;
 
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index 32c28a980ff..d881d5ec4ac 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -639,6 +639,70 @@ guess_correct_sunit(
 		do_warn(_("Would reset sb_width to %u\n"), new_sunit);
 }
 
+/*
+ * Check that the metadata directory inode comes immediately after the root
+ * directory inode and that it seems to look like a metadata directory.
+ */
+STATIC void
+check_metadir_inode(
+	struct xfs_mount	*mp,
+	xfs_ino_t		rootino)
+{
+	int			error;
+
+	validate_sb_ino(&mp->m_sb.sb_metadirino, rootino + 1,
+			_("metadata root directory"));
+
+	/* If we changed the metadir inode, try reloading it. */
+	if (!mp->m_metadirip ||
+	    mp->m_metadirip->i_ino != mp->m_sb.sb_metadirino) {
+		struct xfs_trans	*tp;
+
+		if (mp->m_metadirip)
+			libxfs_irele(mp->m_metadirip);
+
+		error = -libxfs_trans_alloc_empty(mp, &tp);
+		if (error)
+			do_error(
+ _("could not allocate transaction to load metadir inode"));
+
+		error = -libxfs_imeta_iget(tp, mp->m_sb.sb_metadirino,
+				XFS_DIR3_FT_DIR, &mp->m_metadirip);
+		if (error) {
+			libxfs_trans_cancel(tp);
+			need_metadir_inode = true;
+			goto done;
+		}
+
+		error = -libxfs_imeta_mount(tp);
+		if (error)
+			need_metadir_inode = true;
+
+		libxfs_trans_cancel(tp);
+	}
+
+done:
+	if (need_metadir_inode) {
+		if (!no_modify)
+			do_warn(_("will reset metadata root directory\n"));
+		else
+			do_warn(_("would reset metadata root directory\n"));
+		if (mp->m_metadirip)
+			libxfs_irele(mp->m_metadirip);
+		mp->m_metadirip = NULL;
+	}
+
+	/*
+	 * Since these two realtime inodes are no longer fixed, we must
+	 * remember to regenerate them if we still haven't gotten a pointer to
+	 * a valid realtime inode.
+	 */
+	if (!libxfs_verify_ino(mp, mp->m_sb.sb_rbmino))
+		need_rbmino = 1;
+	if (!libxfs_verify_ino(mp, mp->m_sb.sb_rsumino))
+		need_rsumino = 1;
+}
+
 /*
  * Make sure that the first 3 inodes in the filesystem are the root directory,
  * the realtime bitmap, and the realtime summary, in that order.
@@ -668,10 +732,19 @@ _("sb root inode value %" PRIu64 " valid but in unaligned location (expected %"P
 
 	validate_sb_ino(&mp->m_sb.sb_rootino, rootino,
 			_("root"));
-	validate_sb_ino(&mp->m_sb.sb_rbmino, rootino + 1,
-			_("realtime bitmap"));
-	validate_sb_ino(&mp->m_sb.sb_rsumino, rootino + 2,
-			_("realtime summary"));
+
+	if (xfs_has_metadir(mp)) {
+		check_metadir_inode(mp, rootino);
+	} else {
+		/*
+		 * The realtime bitmap and summary inodes only comes after the
+		 * root directory when the metadir feature is not enabled.
+		 */
+		validate_sb_ino(&mp->m_sb.sb_rbmino, rootino + 1,
+				_("realtime bitmap"));
+		validate_sb_ino(&mp->m_sb.sb_rsumino, rootino + 2,
+				_("realtime summary"));
+	}
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 46/58] xfs_repair: don't let metadata and regular files mix
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (44 preceding siblings ...)
  2023-12-31 23:41   ` [PATCH 45/58] xfs_repair: rebuild the metadata directory Darrick J. Wong
@ 2023-12-31 23:41   ` Darrick J. Wong
  2023-12-31 23:41   ` [PATCH 47/58] xfs_repair: update incore metadata state whenever we create new files Darrick J. Wong
                     ` (11 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:41 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Track whether or not inodes thought they were metadata inodes.  We
cannot allow metadata inodes to appear in the regular directory tree,
and we cannot allow regular inodes to appear in the metadata directory
tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c     |   21 +++++++++++++++++
 repair/incore.h     |   19 +++++++++++++++
 repair/incore_ino.c |    1 +
 repair/phase6.c     |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+)


diff --git a/repair/dinode.c b/repair/dinode.c
index 4af7c91d5c9..7c3e5d86404 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -2361,6 +2361,7 @@ process_dinode_int(
 	struct xfs_dinode	*dino = *dinop;
 	xfs_agino_t		unlinked_ino;
 	struct xfs_perag	*pag;
+	bool			is_meta = false;
 
 	*dirty = *isa_dir = 0;
 	*used = is_used;
@@ -2933,6 +2934,18 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 	if (collect_rmaps)
 		record_inode_reflink_flag(mp, dino, agno, ino, lino);
 
+	/* Does this inode think it was metadata? */
+	if (dino->di_version >= 3 &&
+	    (dino->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR))) {
+		struct ino_tree_node	*irec;
+		int			off;
+
+		irec = find_inode_rec(mp, agno, ino);
+		off = get_inode_offset(mp, lino, irec);
+		set_inode_is_meta(irec, off);
+		is_meta = true;
+	}
+
 	/*
 	 * check data fork -- if it's bad, clear the inode
 	 */
@@ -3019,6 +3032,14 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 	*used = is_free;
 	*isa_dir = 0;
 	blkmap_free(dblkmap);
+	if (is_meta) {
+		struct ino_tree_node	*irec;
+		int			off;
+
+		irec = find_inode_rec(mp, agno, ino);
+		off = get_inode_offset(mp, lino, irec);
+		clear_inode_is_meta(irec, off);
+	}
 	return 1;
 }
 
diff --git a/repair/incore.h b/repair/incore.h
index 9ad5f1972d3..90eb1242cd8 100644
--- a/repair/incore.h
+++ b/repair/incore.h
@@ -271,6 +271,7 @@ typedef struct ino_tree_node  {
 	uint64_t		ino_isa_dir;	/* bit == 1 if a directory */
 	uint64_t		ino_was_rl;	/* bit == 1 if reflink flag set */
 	uint64_t		ino_is_rl;	/* bit == 1 if reflink flag should be set */
+	uint64_t		ino_was_meta;	/* bit == 1 if metadata */
 	uint8_t			nlink_size;
 	union ino_nlink		disk_nlinks;	/* on-disk nlinks, set in P3 */
 	union  {
@@ -538,6 +539,24 @@ static inline int inode_is_rl(struct ino_tree_node *irec, int offset)
 	return (irec->ino_is_rl & IREC_MASK(offset)) != 0;
 }
 
+/*
+ * set/clear/test was inode marked as metadata
+ */
+static inline void set_inode_is_meta(struct ino_tree_node *irec, int offset)
+{
+	irec->ino_was_meta |= IREC_MASK(offset);
+}
+
+static inline void clear_inode_is_meta(struct ino_tree_node *irec, int offset)
+{
+	irec->ino_was_meta &= ~IREC_MASK(offset);
+}
+
+static inline int inode_is_meta(struct ino_tree_node *irec, int offset)
+{
+	return (irec->ino_was_meta & IREC_MASK(offset)) != 0;
+}
+
 /*
  * add_inode_reached() is set on inode I only if I has been reached
  * by an inode P claiming to be the parent and if I is a directory,
diff --git a/repair/incore_ino.c b/repair/incore_ino.c
index b0b41a2cc5c..e33f7cef758 100644
--- a/repair/incore_ino.c
+++ b/repair/incore_ino.c
@@ -254,6 +254,7 @@ alloc_ino_node(
 	irec->ino_isa_dir = 0;
 	irec->ino_was_rl = 0;
 	irec->ino_is_rl = 0;
+	irec->ino_was_meta = 0;
 	irec->ir_free = (xfs_inofree_t) - 1;
 	irec->ir_sparse = 0;
 	irec->ino_un.ex_data = NULL;
diff --git a/repair/phase6.c b/repair/phase6.c
index 05e0b8ac593..21bd2b75050 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -2020,6 +2020,38 @@ longform_dir2_entry_check_data(
 			continue;
 		}
 
+		/*
+		 * Regular directories cannot point to metadata files.  If
+		 * we find such a thing, blow out the entry.
+		 */
+		if (!xfs_is_metadir_inode(ip) &&
+		    inode_is_meta(irec, ino_offset)) {
+			nbad++;
+			if (entry_junked(
+	_("entry \"%s\" in regular dir %" PRIu64" points to a metadata inode %" PRIu64 ", "),
+					fname, ip->i_ino, inum)) {
+				dep->name[0] = '/';
+				libxfs_dir2_data_log_entry(&da, bp, dep);
+			}
+			continue;
+		}
+
+		/*
+		 * Metadata directories cannot point to regular files.  If
+		 * we find such a thing, blow out the entry.
+		 */
+		if (xfs_is_metadir_inode(ip) &&
+		    !inode_is_meta(irec, ino_offset)) {
+			nbad++;
+			if (entry_junked(
+	_("entry \"%s\" in metadata dir %" PRIu64" points to a regular inode %" PRIu64 ", "),
+					fname, ip->i_ino, inum)) {
+				dep->name[0] = '/';
+				libxfs_dir2_data_log_entry(&da, bp, dep);
+			}
+			continue;
+		}
+
 		/*
 		 * check if this inode is lost+found dir in the root
 		 */
@@ -2931,6 +2963,37 @@ shortform_dir2_entry_check(
 						ino_dirty);
 			continue;
 		}
+
+		/*
+		 * Regular directories cannot point to metadata files.  If
+		 * we find such a thing, blow out the entry.
+		 */
+		if (!xfs_is_metadir_inode(ip) &&
+		    inode_is_meta(irec, ino_offset)) {
+			do_warn(
+	_("entry \"%s\" in regular dir %" PRIu64" points to a metadata inode %" PRIu64 ", "),
+					fname, ip->i_ino, lino);
+			next_sfep = shortform_dir2_junk(mp, sfp, sfep, lino,
+						&max_size, &i, &bytes_deleted,
+						ino_dirty);
+			continue;
+		}
+
+		/*
+		 * Metadata directories cannot point to regular files.  If
+		 * we find such a thing, blow out the entry.
+		 */
+		if (xfs_is_metadir_inode(ip) &&
+		    !inode_is_meta(irec, ino_offset)) {
+			do_warn(
+	_("entry \"%s\" in metadata dir %" PRIu64" points to a regular inode %" PRIu64 ", "),
+					fname, ip->i_ino, lino);
+			next_sfep = shortform_dir2_junk(mp, sfp, sfep, lino,
+						&max_size, &i, &bytes_deleted,
+						ino_dirty);
+			continue;
+		}
+
 		/*
 		 * check if this inode is lost+found dir in the root
 		 */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 47/58] xfs_repair: update incore metadata state whenever we create new files
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (45 preceding siblings ...)
  2023-12-31 23:41   ` [PATCH 46/58] xfs_repair: don't let metadata and regular files mix Darrick J. Wong
@ 2023-12-31 23:41   ` Darrick J. Wong
  2023-12-31 23:42   ` [PATCH 48/58] xfs_repair: pass private data pointer to scan_lbtree Darrick J. Wong
                     ` (10 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:41 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make sure that we update our incore metadata inode bookkeepping whenever
we create new metadata files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)


diff --git a/repair/phase6.c b/repair/phase6.c
index 21bd2b75050..9ad586602cb 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -516,6 +516,24 @@ mark_ino_inuse(
 		set_inode_isadir(irec, ino_offset);
 }
 
+/*
+ * Mark a newly allocated inode as metadata in the incore bitmap.  Callers
+ * must have already called mark_ino_inuse to ensure there is an incore record.
+ */
+static void
+mark_ino_metadata(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct ino_tree_node	*irec;
+	int			ino_offset;
+
+	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGINO(mp, ino));
+	ino_offset = get_inode_offset(mp, ino, irec);
+	set_inode_is_meta(irec, ino_offset);
+}
+
 /* Make sure this metadata directory path exists. */
 static int
 ensure_imeta_dirpath(
@@ -557,6 +575,7 @@ ensure_imeta_dirpath(
 		}
 
 		mark_ino_inuse(mp, ino, S_IFDIR, parent);
+		mark_ino_metadata(mp, ino);
 		parent = ino;
 	}
 
@@ -664,6 +683,7 @@ ensure_rtino_metadir(
 
 		mark_ino_inuse(mp, upd->ip->i_ino, S_IFREG,
 				lookup_imeta_path_dirname(mp, path));
+		mark_ino_metadata(mp, upd->ip->i_ino);
 		return;
 	}
 
@@ -1108,6 +1128,7 @@ mk_metadir(
 
 	libxfs_trans_ijoin(tp, mp->m_metadirip, 0);
 	libxfs_imeta_set_iflag(tp, mp->m_metadirip);
+	mark_ino_metadata(mp, mp->m_metadirip->i_ino);
 
 	error = -libxfs_trans_commit(tp);
 	if (error)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 48/58] xfs_repair: pass private data pointer to scan_lbtree
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (46 preceding siblings ...)
  2023-12-31 23:41   ` [PATCH 47/58] xfs_repair: update incore metadata state whenever we create new files Darrick J. Wong
@ 2023-12-31 23:42   ` Darrick J. Wong
  2023-12-31 23:42   ` [PATCH 49/58] xfs_repair: mark space used by metadata files Darrick J. Wong
                     ` (9 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Pass a private data pointer through scan_lbtree.  We'll use this
later when scanning the rtrmapbt to keep track of scan state.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dinode.c |    2 +-
 repair/scan.c   |   11 +++++++----
 repair/scan.h   |    7 +++++--
 3 files changed, 13 insertions(+), 7 deletions(-)


diff --git a/repair/dinode.c b/repair/dinode.c
index 7c3e5d86404..9b7309afcaf 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -839,7 +839,7 @@ _("bad bmap btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
 
 		if (scan_lbtree(get_unaligned_be64(&pp[i]), level, scan_bmapbt,
 				type, whichfork, lino, tot, nex, blkmapp,
-				&cursor, 1, check_dups, magic,
+				&cursor, 1, check_dups, magic, NULL,
 				&xfs_bmbt_buf_ops))
 			return(1);
 		/*
diff --git a/repair/scan.c b/repair/scan.c
index fbe1916ac6c..8822f0f05fb 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -139,7 +139,8 @@ scan_lbtree(
 				int			isroot,
 				int			check_dups,
 				int			*dirty,
-				uint64_t		magic),
+				uint64_t		magic,
+				void			*priv),
 	int		type,
 	int		whichfork,
 	xfs_ino_t	ino,
@@ -150,6 +151,7 @@ scan_lbtree(
 	int		isroot,
 	int		check_dups,
 	uint64_t	magic,
+	void		*priv,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf	*bp;
@@ -181,7 +183,7 @@ scan_lbtree(
 	err = (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1,
 			type, whichfork, root, ino, tot, nex, blkmapp,
 			bm_cursor, isroot, check_dups, &dirty,
-			magic);
+			magic, priv);
 
 	ASSERT(dirty == 0 || (dirty && !no_modify));
 
@@ -210,7 +212,8 @@ scan_bmapbt(
 	int			isroot,
 	int			check_dups,
 	int			*dirty,
-	uint64_t		magic)
+	uint64_t		magic,
+	void			*priv)
 {
 	int			i;
 	int			err;
@@ -497,7 +500,7 @@ _("bad bmap btree ptr 0x%llx in ino %" PRIu64 "\n"),
 
 		err = scan_lbtree(be64_to_cpu(pp[i]), level, scan_bmapbt,
 				type, whichfork, ino, tot, nex, blkmapp,
-				bm_cursor, 0, check_dups, magic,
+				bm_cursor, 0, check_dups, magic, priv,
 				&xfs_bmbt_buf_ops);
 		if (err)
 			return(1);
diff --git a/repair/scan.h b/repair/scan.h
index ee16362b6d3..4da788becbe 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -26,7 +26,8 @@ int scan_lbtree(
 				int			isroot,
 				int			check_dups,
 				int			*dirty,
-				uint64_t		magic),
+				uint64_t		magic,
+				void			*priv),
 	int		type,
 	int		whichfork,
 	xfs_ino_t	ino,
@@ -37,6 +38,7 @@ int scan_lbtree(
 	int		isroot,
 	int		check_dups,
 	uint64_t	magic,
+	void		*priv,
 	const struct xfs_buf_ops *ops);
 
 int scan_bmapbt(
@@ -53,7 +55,8 @@ int scan_bmapbt(
 	int			isroot,
 	int			check_dups,
 	int			*dirty,
-	uint64_t		magic);
+	uint64_t		magic,
+	void			*priv);
 
 void
 scan_ags(


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 49/58] xfs_repair: mark space used by metadata files
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (47 preceding siblings ...)
  2023-12-31 23:42   ` [PATCH 48/58] xfs_repair: pass private data pointer to scan_lbtree Darrick J. Wong
@ 2023-12-31 23:42   ` Darrick J. Wong
  2023-12-31 23:42   ` [PATCH 50/58] xfs_repair: adjust keep_fsinos to handle metadata directories Darrick J. Wong
                     ` (8 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Track space used by metadata files as a separate incore extent type.
This ensures that we can warn about cross-linked metadata files, even
though we are going to rebuild the entire metadata directory tree in the
end.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dino_chunks.c |   31 +++++++++++++
 repair/dinode.c      |  121 ++++++++++++++++++++++++++++++++++++++------------
 repair/dinode.h      |    6 ++
 repair/incore.h      |   39 ++++++++++------
 repair/incore_ino.c  |    2 -
 repair/phase4.c      |    2 +
 repair/scan.c        |   30 +++++++++++-
 7 files changed, 179 insertions(+), 52 deletions(-)


diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index 479dc9db760..7e18991a3d5 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -141,6 +141,16 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		_("uncertain inode block %d/%d already known\n"),
 				agno, agbno);
 			break;
+		case XR_E_METADATA:
+			/*
+			 * Files in the metadata directory tree are always
+			 * reconstructed, so it's ok to let go if this block
+			 * is also a valid inode cluster.
+			 */
+			do_warn(
+		_("inode block %d/%d claimed by metadata file\n"),
+				agno, agbno);
+			fallthrough;
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
@@ -430,6 +440,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			set_bmap_ext(agno, cur_agbno, blen, XR_E_MULT);
 			pthread_mutex_unlock(&ag_locks[agno].lock);
 			return 0;
+		case XR_E_METADATA:
 		case XR_E_INO:
 			do_error(
 	_("uncertain inode block overlap, agbno = %d, ino = %" PRIu64 "\n"),
@@ -474,6 +485,16 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		_("uncertain inode block %" PRIu64 " already known\n"),
 				XFS_AGB_TO_FSB(mp, agno, cur_agbno));
 			break;
+		case XR_E_METADATA:
+			/*
+			 * Files in the metadata directory tree are always
+			 * reconstructed, so it's ok to let go if this block
+			 * is also a valid inode cluster.
+			 */
+			do_warn(
+		_("inode block %d/%d claimed by metadata file\n"),
+				agno, agbno);
+			fallthrough;
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
@@ -559,6 +580,16 @@ process_inode_agbno_state(
 	switch (state) {
 	case XR_E_INO:	/* already marked */
 		break;
+	case XR_E_METADATA:
+		/*
+		 * Files in the metadata directory tree are always
+		 * reconstructed, so it's ok to let go if this block is also a
+		 * valid inode cluster.
+		 */
+		do_warn(
+	_("inode block %d/%d claimed by metadata file\n"),
+			agno, agbno);
+		fallthrough;
 	case XR_E_UNKNOWN:
 	case XR_E_FREE:
 	case XR_E_FREE1:
diff --git a/repair/dinode.c b/repair/dinode.c
index 9b7309afcaf..52830a85a6f 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -220,6 +220,7 @@ static int
 process_rt_rec_state(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino,
+	bool			zap_metadata,
 	struct xfs_bmbt_irec	*irec)
 {
 	xfs_fsblock_t		b = irec->br_startblock;
@@ -256,7 +257,13 @@ _("data fork in rt inode %" PRIu64 " found invalid rt extent %"PRIu64" state %d
 		switch (state)  {
 		case XR_E_FREE:
 		case XR_E_UNKNOWN:
-			set_rtbmap(ext, XR_E_INUSE);
+			set_rtbmap(ext, zap_metadata ? XR_E_METADATA :
+						       XR_E_INUSE);
+			break;
+		case XR_E_METADATA:
+			do_error(
+_("data fork in rt inode %" PRIu64 " found metadata file block %" PRIu64 " in rt bmap\n"),
+				ino, ext);
 			break;
 		case XR_E_BAD_STATE:
 			do_error(
@@ -293,7 +300,8 @@ process_rt_rec(
 	struct xfs_bmbt_irec	*irec,
 	xfs_ino_t		ino,
 	xfs_rfsblock_t		*tot,
-	int			check_dups)
+	int			check_dups,
+	bool			zap_metadata)
 {
 	xfs_fsblock_t		lastb;
 	int			bad;
@@ -333,7 +341,7 @@ _("inode %" PRIu64 " - bad rt extent overflows - start %" PRIu64 ", "
 	if (check_dups)
 		bad = process_rt_rec_dups(mp, ino, irec);
 	else
-		bad = process_rt_rec_state(mp, ino, irec);
+		bad = process_rt_rec_state(mp, ino, zap_metadata, irec);
 	if (bad)
 		return bad;
 
@@ -364,7 +372,8 @@ process_bmbt_reclist_int(
 	xfs_fileoff_t		*first_key,
 	xfs_fileoff_t		*last_key,
 	int			check_dups,
-	int			whichfork)
+	int			whichfork,
+	bool			zap_metadata)
 {
 	xfs_bmbt_irec_t		irec;
 	xfs_filblks_t		cp = 0;		/* prev count */
@@ -443,7 +452,8 @@ _("zero length extent (off = %" PRIu64 ", fsbno = %" PRIu64 ") in ino %" PRIu64
 
 		if (type == XR_INO_RTDATA && whichfork == XFS_DATA_FORK) {
 			pthread_mutex_lock(&rt_lock.lock);
-			error2 = process_rt_rec(mp, &irec, ino, tot, check_dups);
+			error2 = process_rt_rec(mp, &irec, ino, tot, check_dups,
+					zap_metadata);
 			pthread_mutex_unlock(&rt_lock.lock);
 			if (error2)
 				return error2;
@@ -558,6 +568,11 @@ _("%s fork in ino %" PRIu64 " claims free block %" PRIu64 "\n"),
 			case XR_E_INUSE_FS1:
 				do_warn(_("rmap claims metadata use!\n"));
 				fallthrough;
+			case XR_E_METADATA:
+				do_warn(
+_("%s fork in inode %" PRIu64 " claims metadata file block %" PRIu64 "\n"),
+					forkname, ino, b);
+				break;
 			case XR_E_FS_MAP:
 			case XR_E_INO:
 			case XR_E_INUSE_FS:
@@ -614,15 +629,28 @@ _("illegal state %d in block map %" PRIu64 "\n"),
 		for (; agbno < ebno; agbno += blen) {
 			state = get_bmap_ext(agno, agbno, ebno, &blen);
 			switch (state)  {
+			case XR_E_METADATA:
+				/*
+				 * The entire metadata directory tree is rebuilt
+				 * every time, so we can let regular files take
+				 * ownership of this block.
+				 */
+				if (zap_metadata)
+					break;
+				fallthrough;
 			case XR_E_FREE:
 			case XR_E_FREE1:
 			case XR_E_INUSE1:
 			case XR_E_UNKNOWN:
-				set_bmap_ext(agno, agbno, blen, XR_E_INUSE);
+				set_bmap_ext(agno, agbno, blen, zap_metadata ?
+						XR_E_METADATA : XR_E_INUSE);
 				break;
+
 			case XR_E_INUSE:
 			case XR_E_MULT:
-				set_bmap_ext(agno, agbno, blen, XR_E_MULT);
+				if (!zap_metadata)
+					set_bmap_ext(agno, agbno, blen,
+							XR_E_MULT);
 				break;
 			default:
 				break;
@@ -661,10 +689,12 @@ process_bmbt_reclist(
 	blkmap_t		**blkmapp,
 	xfs_fileoff_t		*first_key,
 	xfs_fileoff_t		*last_key,
-	int			whichfork)
+	int			whichfork,
+	bool			zap_metadata)
 {
 	return process_bmbt_reclist_int(mp, rp, numrecs, type, ino, tot,
-				blkmapp, first_key, last_key, 0, whichfork);
+				blkmapp, first_key, last_key, 0, whichfork,
+				zap_metadata);
 }
 
 /*
@@ -679,13 +709,15 @@ scan_bmbt_reclist(
 	int			type,
 	xfs_ino_t		ino,
 	xfs_rfsblock_t		*tot,
-	int			whichfork)
+	int			whichfork,
+	bool			zap_metadata)
 {
 	xfs_fileoff_t		first_key = 0;
 	xfs_fileoff_t		last_key = 0;
 
 	return process_bmbt_reclist_int(mp, rp, numrecs, type, ino, tot,
-				NULL, &first_key, &last_key, 1, whichfork);
+				NULL, &first_key, &last_key, 1, whichfork,
+				zap_metadata);
 }
 
 /*
@@ -760,7 +792,8 @@ process_btinode(
 	xfs_extnum_t		*nex,
 	blkmap_t		**blkmapp,
 	int			whichfork,
-	int			check_dups)
+	int			check_dups,
+	bool			zap_metadata)
 {
 	xfs_bmdr_block_t	*dib;
 	xfs_fileoff_t		last_key;
@@ -839,8 +872,8 @@ _("bad bmap btree ptr 0x%" PRIx64 " in ino %" PRIu64 "\n"),
 
 		if (scan_lbtree(get_unaligned_be64(&pp[i]), level, scan_bmapbt,
 				type, whichfork, lino, tot, nex, blkmapp,
-				&cursor, 1, check_dups, magic, NULL,
-				&xfs_bmbt_buf_ops))
+				&cursor, 1, check_dups, magic,
+				(void *)zap_metadata, &xfs_bmbt_buf_ops))
 			return(1);
 		/*
 		 * fix key (offset) mismatches between the keys in root
@@ -935,7 +968,8 @@ process_exinode(
 	xfs_extnum_t		*nex,
 	blkmap_t		**blkmapp,
 	int			whichfork,
-	int			check_dups)
+	int			check_dups,
+	bool			zap_metadata)
 {
 	xfs_ino_t		lino;
 	xfs_bmbt_rec_t		*rp;
@@ -969,10 +1003,10 @@ process_exinode(
 	if (check_dups == 0)
 		ret = process_bmbt_reclist(mp, rp, &numrecs, type, lino,
 					tot, blkmapp, &first_key, &last_key,
-					whichfork);
+					whichfork, zap_metadata);
 	else
 		ret = scan_bmbt_reclist(mp, rp, &numrecs, type, lino, tot,
-					whichfork);
+					whichfork, zap_metadata);
 
 	*nex = numrecs;
 	return ret;
@@ -1898,7 +1932,8 @@ process_inode_data_fork(
 	xfs_extnum_t		*nextents,
 	blkmap_t		**dblkmap,
 	int			check_dups,
-	struct xfs_buf		**ino_bpp)
+	struct xfs_buf		**ino_bpp,
+	bool			zap_metadata)
 {
 	struct xfs_dinode	*dino = *dinop;
 	xfs_ino_t		lino = XFS_AGINO_TO_INO(mp, agno, ino);
@@ -1939,14 +1974,14 @@ process_inode_data_fork(
 			try_rebuild = 1;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
 			totblocks, nextents, dblkmap, XFS_DATA_FORK,
-			check_dups);
+			check_dups, zap_metadata);
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		if (!rmapbt_suspect && try_rebuild == -1)
 			try_rebuild = 1;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
 			totblocks, nextents, dblkmap, XFS_DATA_FORK,
-			check_dups);
+			check_dups, zap_metadata);
 		break;
 	case XFS_DINODE_FMT_DEV:
 		err = 0;
@@ -1999,12 +2034,12 @@ _("would have tried to rebuild inode %"PRIu64" data fork\n"),
 		case XFS_DINODE_FMT_EXTENTS:
 			err = process_exinode(mp, agno, ino, dino, type,
 				dirty, totblocks, nextents, dblkmap,
-				XFS_DATA_FORK, 0);
+				XFS_DATA_FORK, 0, zap_metadata);
 			break;
 		case XFS_DINODE_FMT_BTREE:
 			err = process_btinode(mp, agno, ino, dino, type,
 				dirty, totblocks, nextents, dblkmap,
-				XFS_DATA_FORK, 0);
+				XFS_DATA_FORK, 0, zap_metadata);
 			break;
 		case XFS_DINODE_FMT_DEV:
 			err = 0;
@@ -2039,7 +2074,8 @@ process_inode_attr_fork(
 	int			check_dups,
 	int			extra_attr_check,
 	int			*retval,
-	struct xfs_buf		**ino_bpp)
+	struct xfs_buf		**ino_bpp,
+	bool			zap_metadata)
 {
 	xfs_ino_t		lino = XFS_AGINO_TO_INO(mp, agno, ino);
 	struct xfs_dinode	*dino = *dinop;
@@ -2087,7 +2123,7 @@ process_inode_attr_fork(
 		*anextents = 0;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
-				XFS_ATTR_FORK, check_dups);
+				XFS_ATTR_FORK, check_dups, zap_metadata);
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		if (!rmapbt_suspect && try_rebuild == -1)
@@ -2096,7 +2132,7 @@ process_inode_attr_fork(
 		*anextents = 0;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
-				XFS_ATTR_FORK, check_dups);
+				XFS_ATTR_FORK, check_dups, zap_metadata);
 		break;
 	default:
 		do_warn(_("illegal attribute format %d, ino %" PRIu64 "\n"),
@@ -2160,12 +2196,12 @@ _("would have tried to rebuild inode %"PRIu64" attr fork or cleared it\n"),
 		case XFS_DINODE_FMT_EXTENTS:
 			err = process_exinode(mp, agno, ino, dino,
 				type, dirty, atotblocks, anextents,
-				&ablkmap, XFS_ATTR_FORK, 0);
+				&ablkmap, XFS_ATTR_FORK, 0, zap_metadata);
 			break;
 		case XFS_DINODE_FMT_BTREE:
 			err = process_btinode(mp, agno, ino, dino,
 				type, dirty, atotblocks, anextents,
-				&ablkmap, XFS_ATTR_FORK, 0);
+				&ablkmap, XFS_ATTR_FORK, 0, zap_metadata);
 			break;
 		default:
 			do_error(_("illegal attribute fmt %d, ino %" PRIu64 "\n"),
@@ -2362,6 +2398,7 @@ process_dinode_int(
 	xfs_agino_t		unlinked_ino;
 	struct xfs_perag	*pag;
 	bool			is_meta = false;
+	bool			zap_metadata = false;
 
 	*dirty = *isa_dir = 0;
 	*used = is_used;
@@ -2944,6 +2981,32 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 		off = get_inode_offset(mp, lino, irec);
 		set_inode_is_meta(irec, off);
 		is_meta = true;
+
+		/*
+		 * We always rebuild the metadata directory tree during phase
+		 * 6, so we use this flag to get all the directory blocks
+		 * marked as free, and any other metadata files whose contents
+		 * we don't want to save.
+		 *
+		 * Currently, there are no metadata files that use xattrs, so
+		 * we always drop the xattr blocks of metadata files.
+		 */
+		switch (type) {
+		case XR_INO_RTBITMAP:
+		case XR_INO_RTSUM:
+		case XR_INO_UQUOTA:
+		case XR_INO_GQUOTA:
+		case XR_INO_PQUOTA:
+			/*
+			 * This inode was recognized as being filesystem
+			 * metadata, so preserve the inode and its contents for
+			 * later checking and repair.
+			 */
+			break;
+		default:
+			zap_metadata = true;
+			break;
+		}
 	}
 
 	/*
@@ -2951,7 +3014,7 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 	 */
 	if (process_inode_data_fork(mp, agno, ino, dinop, type, dirty,
 			&totblocks, &nextents, &dblkmap, check_dups,
-			ino_bpp) != 0)
+			ino_bpp, zap_metadata) != 0)
 		goto bad_out;
 	dino = *dinop;
 
@@ -2961,7 +3024,7 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 	 */
 	if (process_inode_attr_fork(mp, agno, ino, dinop, type, dirty,
 			&atotblocks, &anextents, check_dups, extra_attr_check,
-			&retval, ino_bpp))
+			&retval, ino_bpp, is_meta))
 		goto bad_out;
 	dino = *dinop;
 
diff --git a/repair/dinode.h b/repair/dinode.h
index 92df83da621..ed2ec4ca238 100644
--- a/repair/dinode.h
+++ b/repair/dinode.h
@@ -27,7 +27,8 @@ process_bmbt_reclist(xfs_mount_t	*mp,
 		struct blkmap		**blkmapp,
 		uint64_t		*first_key,
 		uint64_t		*last_key,
-		int			whichfork);
+		int			whichfork,
+		bool			zap_metadata);
 
 int
 scan_bmbt_reclist(
@@ -37,7 +38,8 @@ scan_bmbt_reclist(
 	int			type,
 	xfs_ino_t		ino,
 	xfs_rfsblock_t		*tot,
-	int			whichfork);
+	int			whichfork,
+	bool			zap_metadata);
 
 void
 update_rootino(xfs_mount_t *mp);
diff --git a/repair/incore.h b/repair/incore.h
index 90eb1242cd8..645cc5317c8 100644
--- a/repair/incore.h
+++ b/repair/incore.h
@@ -85,18 +85,25 @@ typedef struct rt_extent_tree_node  {
 #define XR_E_UNKNOWN	0	/* unknown state */
 #define XR_E_FREE1	1	/* free block (marked by one fs space tree) */
 #define XR_E_FREE	2	/* free block (marked by both fs space trees) */
-#define XR_E_INUSE	3	/* extent used by file/dir data or metadata */
-#define XR_E_INUSE_FS	4	/* extent used by fs ag header or log */
-#define XR_E_MULT	5	/* extent is multiply referenced */
-#define XR_E_INO	6	/* extent used by inodes (inode blocks) */
-#define XR_E_FS_MAP	7	/* extent used by fs space/inode maps */
-#define XR_E_INUSE1	8	/* used block (marked by rmap btree) */
-#define XR_E_INUSE_FS1	9	/* used by fs ag header or log (rmap btree) */
-#define XR_E_INO1	10	/* used by inodes (marked by rmap btree) */
-#define XR_E_FS_MAP1	11	/* used by fs space/inode maps (rmap btree) */
-#define XR_E_REFC	12	/* used by fs ag reference count btree */
-#define XR_E_COW	13	/* leftover cow extent */
-#define XR_E_BAD_STATE	14
+/*
+ * Space used by metadata files.  The entire metadata directory tree will be
+ * rebuilt from scratch during phase 6, so this value must be less than
+ * XR_E_INUSE so that the space will go back to the free space btrees during
+ * phase 5.
+ */
+#define XR_E_METADATA	3
+#define XR_E_INUSE	4	/* extent used by file/dir data or metadata */
+#define XR_E_INUSE_FS	5	/* extent used by fs ag header or log */
+#define XR_E_MULT	6	/* extent is multiply referenced */
+#define XR_E_INO	7	/* extent used by inodes (inode blocks) */
+#define XR_E_FS_MAP	8	/* extent used by fs space/inode maps */
+#define XR_E_INUSE1	9	/* used block (marked by rmap btree) */
+#define XR_E_INUSE_FS1	10	/* used by fs ag header or log (rmap btree) */
+#define XR_E_INO1	11	/* used by inodes (marked by rmap btree) */
+#define XR_E_FS_MAP1	12	/* used by fs space/inode maps (rmap btree) */
+#define XR_E_REFC	13	/* used by fs ag reference count btree */
+#define XR_E_COW	14	/* leftover cow extent */
+#define XR_E_BAD_STATE	15
 
 /* separate state bit, OR'ed into high (4th) bit of ex_state field */
 
@@ -271,7 +278,7 @@ typedef struct ino_tree_node  {
 	uint64_t		ino_isa_dir;	/* bit == 1 if a directory */
 	uint64_t		ino_was_rl;	/* bit == 1 if reflink flag set */
 	uint64_t		ino_is_rl;	/* bit == 1 if reflink flag should be set */
-	uint64_t		ino_was_meta;	/* bit == 1 if metadata */
+	uint64_t		ino_is_meta;	/* bit == 1 if metadata */
 	uint8_t			nlink_size;
 	union ino_nlink		disk_nlinks;	/* on-disk nlinks, set in P3 */
 	union  {
@@ -544,17 +551,17 @@ static inline int inode_is_rl(struct ino_tree_node *irec, int offset)
  */
 static inline void set_inode_is_meta(struct ino_tree_node *irec, int offset)
 {
-	irec->ino_was_meta |= IREC_MASK(offset);
+	irec->ino_is_meta |= IREC_MASK(offset);
 }
 
 static inline void clear_inode_is_meta(struct ino_tree_node *irec, int offset)
 {
-	irec->ino_was_meta &= ~IREC_MASK(offset);
+	irec->ino_is_meta &= ~IREC_MASK(offset);
 }
 
 static inline int inode_is_meta(struct ino_tree_node *irec, int offset)
 {
-	return (irec->ino_was_meta & IREC_MASK(offset)) != 0;
+	return (irec->ino_is_meta & IREC_MASK(offset)) != 0;
 }
 
 /*
diff --git a/repair/incore_ino.c b/repair/incore_ino.c
index e33f7cef758..1b7540d5148 100644
--- a/repair/incore_ino.c
+++ b/repair/incore_ino.c
@@ -254,7 +254,7 @@ alloc_ino_node(
 	irec->ino_isa_dir = 0;
 	irec->ino_was_rl = 0;
 	irec->ino_is_rl = 0;
-	irec->ino_was_meta = 0;
+	irec->ino_is_meta = 0;
 	irec->ir_free = (xfs_inofree_t) - 1;
 	irec->ir_sparse = 0;
 	irec->ino_un.ex_data = NULL;
diff --git a/repair/phase4.c b/repair/phase4.c
index f004111ea4e..e8bd5982147 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -303,6 +303,7 @@ phase4(xfs_mount_t *mp)
 				_("unknown block state, ag %d, blocks %u-%u\n"),
 					i, j, j + blen - 1);
 				fallthrough;
+			case XR_E_METADATA:
 			case XR_E_UNKNOWN:
 			case XR_E_FREE:
 			case XR_E_INUSE:
@@ -335,6 +336,7 @@ phase4(xfs_mount_t *mp)
 	_("unknown rt extent state, extent %" PRIu64 "\n"),
 				rtx);
 			fallthrough;
+		case XR_E_METADATA:
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
diff --git a/repair/scan.c b/repair/scan.c
index 8822f0f05fb..c52ace31c2d 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -227,6 +227,7 @@ scan_bmapbt(
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	int			state;
+	bool			zap_metadata = priv != NULL;
 
 	/*
 	 * unlike the ag freeblock btrees, if anything looks wrong
@@ -352,7 +353,20 @@ _("bad back (left) sibling pointer (saw %llu should be NULL (0))\n"
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_bmap(agno, agbno, XR_E_INUSE);
+			set_bmap(agno, agbno, zap_metadata ? XR_E_METADATA :
+							   XR_E_INUSE);
+			break;
+		case XR_E_METADATA:
+			/*
+			 * bmbt block already claimed by a metadata file.  We
+			 * always reconstruct the entire metadata tree, so if
+			 * this is a regular file we mark it owned by the file.
+			 */
+			do_warn(
+_("inode 0x%" PRIx64 "bmap block 0x%" PRIx64 " claimed by metadata file\n"),
+				ino, bno);
+			if (!zap_metadata)
+				set_bmap(agno, agbno, XR_E_INUSE);
 			break;
 		case XR_E_FS_MAP:
 		case XR_E_INUSE:
@@ -364,7 +378,8 @@ _("bad back (left) sibling pointer (saw %llu should be NULL (0))\n"
 			 * we made it here, the block probably
 			 * contains btree data.
 			 */
-			set_bmap(agno, agbno, XR_E_MULT);
+			if (!zap_metadata)
+				set_bmap(agno, agbno, XR_E_MULT);
 			do_warn(
 _("inode 0x%" PRIx64 "bmap block 0x%" PRIx64 " claimed, state is %d\n"),
 				ino, bno, state);
@@ -440,7 +455,8 @@ _("inode %" PRIu64 " bad # of bmap records (%" PRIu64 ", min - %u, max - %u)\n")
 		if (check_dups == 0)  {
 			err = process_bmbt_reclist(mp, rp, &numrecs, type, ino,
 						   tot, blkmapp, &first_key,
-						   &last_key, whichfork);
+						   &last_key, whichfork,
+						   zap_metadata);
 			if (err)
 				return 1;
 
@@ -470,7 +486,7 @@ _("out-of-order bmap key (file offset) in inode %" PRIu64 ", %s fork, fsbno %" P
 			return 0;
 		} else {
 			return scan_bmbt_reclist(mp, rp, &numrecs, type, ino,
-						 tot, whichfork);
+					tot, whichfork, zap_metadata);
 		}
 	}
 	if (numrecs > mp->m_bmap_dmxr[1] || (isroot == 0 && numrecs <
@@ -860,6 +876,12 @@ process_rmap_rec(
 			break;
 		}
 		break;
+	case XR_E_METADATA:
+		do_warn(
+_("Metadata file block (%d,%d-%d) mismatch in %s tree, state - %d,%" PRIx64 "\n"),
+			agno, b, b + blen - 1,
+			name, state, owner);
+		break;
 	case XR_E_INUSE_FS:
 		if (owner == XFS_RMAP_OWN_FS ||
 		    owner == XFS_RMAP_OWN_LOG)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 50/58] xfs_repair: adjust keep_fsinos to handle metadata directories
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (48 preceding siblings ...)
  2023-12-31 23:42   ` [PATCH 49/58] xfs_repair: mark space used by metadata files Darrick J. Wong
@ 2023-12-31 23:42   ` Darrick J. Wong
  2023-12-31 23:42   ` [PATCH 51/58] xfs_repair: metadata dirs are never plausible root dirs Darrick J. Wong
                     ` (7 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

On a filesystem with metadata directories, we only want to automatically
mark the two root directories present because those are the only two
statically allocated inode numbers -- the rt summary inode is now just a
regular file in a directory.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase5.c |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)


diff --git a/repair/phase5.c b/repair/phase5.c
index d7bacb18b84..983f2169228 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -421,13 +421,14 @@ static void
 keep_fsinos(xfs_mount_t *mp)
 {
 	ino_tree_node_t		*irec;
-	int			i;
 
 	irec = find_inode_rec(mp, XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
 			XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
 
-	for (i = 0; i < 3; i++)
-		set_inode_used(irec, i);
+	set_inode_used(irec, 0);	/* root dir */
+	set_inode_used(irec, 1);	/* rt bitmap or metadata dir root */
+	if (!xfs_has_metadir(mp))
+		set_inode_used(irec, 2);	/* rt summary */
 }
 
 static void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 51/58] xfs_repair: metadata dirs are never plausible root dirs
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (49 preceding siblings ...)
  2023-12-31 23:42   ` [PATCH 50/58] xfs_repair: adjust keep_fsinos to handle metadata directories Darrick J. Wong
@ 2023-12-31 23:42   ` Darrick J. Wong
  2023-12-31 23:43   ` [PATCH 52/58] xfs_repair: reattach quota inodes to metadata directory Darrick J. Wong
                     ` (6 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Metadata directories are never candidates to be the root of the
user-accessible directory tree.  Update has_plausible_rootdir to ignore
them all, as well as detecting the case where the superblock incorrectly
thinks both trees have the same root.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/xfs_repair.c |    6 ++++++
 1 file changed, 6 insertions(+)


diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index d881d5ec4ac..fe24f66ab98 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -541,9 +541,15 @@ has_plausible_rootdir(
 	int			error;
 	bool			ret = false;
 
+	if (xfs_has_metadir(mp) &&
+	    mp->m_sb.sb_rootino == mp->m_sb.sb_metadirino)
+		goto out;
+
 	error = -libxfs_iget(mp, NULL, mp->m_sb.sb_rootino, 0, &ip);
 	if (error)
 		goto out;
+	if (xfs_is_metadir_inode(ip))
+		goto out_rele;
 	if (!S_ISDIR(VFS_I(ip)->i_mode))
 		goto out_rele;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 52/58] xfs_repair: reattach quota inodes to metadata directory
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (50 preceding siblings ...)
  2023-12-31 23:42   ` [PATCH 51/58] xfs_repair: metadata dirs are never plausible root dirs Darrick J. Wong
@ 2023-12-31 23:43   ` Darrick J. Wong
  2023-12-31 23:43   ` [PATCH 53/58] xfs_repair: drop all the metadata directory files during pass 4 Darrick J. Wong
                     ` (5 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If the quota inodes came through unscathed, we should attach them to
the new metadata directory so that phase 7 can run quotacheck on them.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |  127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)


diff --git a/repair/phase6.c b/repair/phase6.c
index 9ad586602cb..5eeffd5dce9 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -3633,6 +3633,131 @@ update_missing_dotdot_entries(
 	}
 }
 
+/*
+ * Re-link a quota inode into the metadata directory.  We do not create quota
+ * inodes or abort repair if we cannot relink the inodes, because quota mount
+ * can recreate all the quota metadata.
+ */
+static int
+reattach_quota_inode(
+	struct xfs_mount		*mp,
+	xfs_ino_t			*inop,
+	const struct xfs_imeta_path	*path)
+{
+	struct xfs_imeta_update		upd;
+	struct xfs_trans		*tp;
+	struct xfs_inode		*ip = NULL;
+	xfs_ino_t			ino = *inop;
+	int				error;
+
+	error = ensure_imeta_dirpath(mp, path);
+	if (error) {
+		do_warn(
+ _("Couldn't create quota metadata directory, error %d\n"), error);
+		return error;
+	}
+
+	error = -libxfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		do_error(
+ _("failed to allocate trans to grab quota inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+	error = -libxfs_imeta_iget(tp, ino, XFS_DIR3_FT_REG_FILE, &ip);
+	libxfs_trans_cancel(tp);
+	if (error) {
+		do_warn(
+ _("Couldn't grab quota inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+		goto out_rele;
+	}
+
+	/*
+	 * Since we're reattaching this file to the metadata directory tree,
+	 * try to remove all the parent pointers that might be attached.
+	 */
+	try_erase_parent_ptrs(ip);
+
+	error = -libxfs_imeta_start_link(mp, path, ip, &upd);
+	if (error) {
+		do_warn(
+ _("Couldn't allocate transaction to attach quota inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+		goto out_rele;
+	}
+
+	/* Null out the superblock pointer and re-link this file into it. */
+	*inop = NULLFSINO;
+
+	error = -libxfs_imeta_link(&upd);
+	if (error) {
+		do_warn(
+ _("Couldn't link quota inode 0x%llx, error %d\n"),
+				(unsigned long long)ino, error);
+		goto out_cancel;
+	}
+
+	/* Reset the link count to something sane. */
+	set_nlink(VFS_I(ip), 1);
+	libxfs_trans_log_inode(upd.tp, ip, XFS_ILOG_CORE);
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error) {
+		do_warn(
+_("Couldn't commit quota inode 0x%llx reattachment transaction, error %d\n"),
+				(unsigned long long)ino, error);
+	}
+
+	goto out_rele;
+
+out_cancel:
+	libxfs_imeta_cancel_update(&upd, error);
+out_rele:
+	if (ip)
+		libxfs_irele(ip);
+	return error;
+}
+
+/*
+ * Reattach quota inodes to the metadata directory if we rebuilt the metadata
+ * directory tree.
+ */
+static inline void
+reattach_metadir_quota_inodes(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	if (!xfs_has_metadir(mp) || no_modify)
+		return;
+
+	if (mp->m_sb.sb_uquotino != NULLFSINO) {
+		error = reattach_quota_inode(mp, &mp->m_sb.sb_uquotino,
+				&XFS_IMETA_USRQUOTA);
+		if (error) {
+			mp->m_sb.sb_uquotino = NULLFSINO;
+			lost_uquotino = 1;
+		}
+	}
+
+	if (mp->m_sb.sb_gquotino != NULLFSINO) {
+		error = reattach_quota_inode(mp, &mp->m_sb.sb_gquotino,
+				&XFS_IMETA_GRPQUOTA);
+		if (error) {
+			mp->m_sb.sb_gquotino = NULLFSINO;
+			lost_gquotino = 1;
+		}
+	}
+
+	if (mp->m_sb.sb_pquotino != NULLFSINO) {
+		error = reattach_quota_inode(mp, &mp->m_sb.sb_pquotino,
+				&XFS_IMETA_PRJQUOTA);
+		if (error) {
+			mp->m_sb.sb_pquotino = NULLFSINO;
+			lost_pquotino = 1;
+		}
+	}
+}
+
 static void
 traverse_ags(
 	struct xfs_mount	*mp)
@@ -3718,6 +3843,8 @@ _("        - resetting contents of realtime bitmap and summary inodes\n"));
 		}
 	}
 
+	reattach_metadir_quota_inodes(mp);
+
 	mark_standalone_inodes(mp);
 
 	do_log(_("        - traversing filesystem ...\n"));


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 53/58] xfs_repair: drop all the metadata directory files during pass 4
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (51 preceding siblings ...)
  2023-12-31 23:43   ` [PATCH 52/58] xfs_repair: reattach quota inodes to metadata directory Darrick J. Wong
@ 2023-12-31 23:43   ` Darrick J. Wong
  2023-12-31 23:43   ` [PATCH 54/58] xfs_repair: truncate and unmark orphaned metadata inodes Darrick J. Wong
                     ` (4 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Drop the entire metadata directory tree during pass 4 so that we can
reinitialize the entire tree in phase 6.  The existing metadata files
(rtbitmap, rtsummary, quotas) will be reattached to the newly rebuilt
directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/dino_chunks.c |    9 +++++++++
 repair/dinode.c      |   14 +++++++++++++-
 repair/phase6.c      |   21 +++++++++++----------
 repair/scan.c        |    2 +-
 4 files changed, 34 insertions(+), 12 deletions(-)


diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index 7e18991a3d5..a7f9ea70ca7 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -950,6 +950,15 @@ process_inode_chunk(
 			clear_inode_isadir(ino_rec, irec_offset);
 		}
 
+		/*
+		 * We always reinitialize the rt bitmap and summary inodes if
+		 * the metadata directory feature is enabled.
+		 */
+		if (xfs_has_metadir(mp) && !no_modify) {
+			need_rbmino = -1;
+			need_rsumino = -1;
+		}
+
 		if (status)  {
 			if (mp->m_sb.sb_rootino == ino) {
 				need_root_inode = 1;
diff --git a/repair/dinode.c b/repair/dinode.c
index 52830a85a6f..5c9101fa0b0 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -656,7 +656,7 @@ _("illegal state %d in block map %" PRIu64 "\n"),
 				break;
 			}
 		}
-		if (collect_rmaps) /* && !check_dups */
+		if (collect_rmaps && !zap_metadata) /* && !check_dups */
 			rmap_add_rec(mp, ino, whichfork, &irec);
 		*tot += irec.br_blockcount;
 	}
@@ -3084,6 +3084,18 @@ _("Bad CoW extent size %u on inode %" PRIu64 ", "),
 	 */
 	*dirty += process_check_inode_nlink_version(dino, lino);
 
+	/*
+	 * The entire metadata directory tree will be rebuilt during phase 6.
+	 * Therefore, if we're at the end of phase 4 and this is a metadata
+	 * file, zero the ondisk inode and the incore state.
+	 */
+	if (check_dups && zap_metadata && !no_modify) {
+		clear_dinode(mp, dino, lino);
+		*dirty += 1;
+		*used = is_free;
+		*isa_dir = 0;
+	}
+
 	return retval;
 
 clear_bad_out:
diff --git a/repair/phase6.c b/repair/phase6.c
index 5eeffd5dce9..c7cfc371ac2 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -3798,20 +3798,20 @@ phase6(xfs_mount_t *mp)
 		}
 	}
 
-	if (need_metadir_inode) {
-		if (!no_modify)  {
+	if (!no_modify && xfs_has_metadir(mp))  {
+		if (need_metadir_inode)
 			do_warn(_("reinitializing metadata root directory\n"));
-			mk_metadir(mp);
-			need_metadir_inode = false;
-			need_metadir_dotdot = 0;
-		} else  {
-			do_warn(_("would reinitialize metadata root directory\n"));
-		}
+		mk_metadir(mp);
+		need_metadir_inode = false;
+		need_metadir_dotdot = 0;
+	} else if (need_metadir_inode) {
+		do_warn(_("would reinitialize metadata root directory\n"));
 	}
 
 	if (need_rbmino)  {
 		if (!no_modify)  {
-			do_warn(_("reinitializing realtime bitmap inode\n"));
+			if (need_rbmino > 0)
+				do_warn(_("reinitializing realtime bitmap inode\n"));
 			mk_rbmino(mp);
 			need_rbmino = 0;
 		} else  {
@@ -3821,7 +3821,8 @@ phase6(xfs_mount_t *mp)
 
 	if (need_rsumino)  {
 		if (!no_modify)  {
-			do_warn(_("reinitializing realtime summary inode\n"));
+			if (need_rsumino > 0)
+				do_warn(_("reinitializing realtime summary inode\n"));
 			mk_rsumino(mp);
 			need_rsumino = 0;
 		} else  {
diff --git a/repair/scan.c b/repair/scan.c
index c52ace31c2d..3857593b165 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -429,7 +429,7 @@ _("bad state %d, inode %" PRIu64 " bmap block 0x%" PRIx64 "\n"),
 	numrecs = be16_to_cpu(block->bb_numrecs);
 
 	/* Record BMBT blocks in the reverse-mapping data. */
-	if (check_dups && collect_rmaps) {
+	if (check_dups && collect_rmaps && !zap_metadata) {
 		agno = XFS_FSB_TO_AGNO(mp, bno);
 		pthread_mutex_lock(&ag_locks[agno].lock);
 		rmap_add_bmbt_rec(mp, ino, whichfork, bno);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 54/58] xfs_repair: truncate and unmark orphaned metadata inodes
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (52 preceding siblings ...)
  2023-12-31 23:43   ` [PATCH 53/58] xfs_repair: drop all the metadata directory files during pass 4 Darrick J. Wong
@ 2023-12-31 23:43   ` Darrick J. Wong
  2023-12-31 23:43   ` [PATCH 55/58] xfs_repair: do not count metadata directory files when doing quotacheck Darrick J. Wong
                     ` (3 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If an inode claims to be a metadata inode but wasn't linked in either
directory tree, remove the attr fork and reset the data fork if the
contents weren't regular extent mappings before moving the inode to the
lost+found.

We don't ifree the inode, because it's possible that the inode was not
actually a metadata inode but simply got corrupted due to bitflips or
something, and we'd rather let the sysadmin examine what's left of the
file instead of photorec'ing it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)


diff --git a/repair/phase6.c b/repair/phase6.c
index c7cfc371ac2..a99793b4d90 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -1273,6 +1273,53 @@ mk_orphanage(
 	return(ino);
 }
 
+/* Don't let metadata inode contents leak to lost+found. */
+static void
+trunc_metadata_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp = ip->i_mount;
+	int			err;
+
+	err = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (err)
+		do_error(
+	_("space reservation failed (%d), filesystem may be out of space\n"),
+					err);
+
+	libxfs_trans_ijoin(tp, ip, 0);
+	ip->i_diflags2 &= ~XFS_DIFLAG2_METADIR;
+
+	switch (VFS_I(ip)->i_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+		break;
+	case S_IFREG:
+		switch (ip->i_df.if_format) {
+		case XFS_DINODE_FMT_EXTENTS:
+		case XFS_DINODE_FMT_BTREE:
+			break;
+		default:
+			ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+			ip->i_df.if_nextents = 0;
+			break;
+		}
+		break;
+	}
+
+	libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	err = -libxfs_trans_commit(tp);
+	if (err)
+		do_error(
+	_("truncation of metadata inode 0x%llx failed, err=%d\n"),
+				(unsigned long long)ip->i_ino, err);
+}
+
 /*
  * Add a parent pointer back to the orphanage for any file we're moving into
  * the orphanage, being careful not to trip over any existing parent pointer.
@@ -1362,6 +1409,9 @@ mv_orphanage(
 	if (err)
 		do_error(_("%d - couldn't iget disconnected inode\n"), err);
 
+	if (xfs_is_metadir_inode(ino_p))
+		trunc_metadata_inode(ino_p);
+
 	xname.type = libxfs_mode_to_ftype(VFS_I(ino_p)->i_mode);
 
 	if (isa_dir)  {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 55/58] xfs_repair: do not count metadata directory files when doing quotacheck
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (53 preceding siblings ...)
  2023-12-31 23:43   ` [PATCH 54/58] xfs_repair: truncate and unmark orphaned metadata inodes Darrick J. Wong
@ 2023-12-31 23:43   ` Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 56/58] xfs_repair: allow sysadmins to add metadata directories Darrick J. Wong
                     ` (2 subsequent siblings)
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Previously, we stated that files in the metadata directory tree are not
counted in the dquot information.  Fix the offline quotacheck code in
xfs_repair and xfs_check to reflect this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c          |    4 ++++
 repair/quotacheck.c |    5 +++++
 2 files changed, 9 insertions(+)


diff --git a/db/check.c b/db/check.c
index ae527471161..190565074b6 100644
--- a/db/check.c
+++ b/db/check.c
@@ -3018,6 +3018,10 @@ process_inode(
 		default:
 			break;
 		}
+		/* Metadata directory files are not counted in quotas. */
+		if (dip->di_version >= 3 &&
+		    (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADIR)))
+			ic = 0;
 		if (ic)
 			quota_add(&prid, &gid, &uid, 0, bc, ic, rc);
 	}
diff --git a/repair/quotacheck.c b/repair/quotacheck.c
index 4cb38db3ddd..3abcd8ae9a0 100644
--- a/repair/quotacheck.c
+++ b/repair/quotacheck.c
@@ -217,6 +217,10 @@ quotacheck_adjust(
 		return;
 	}
 
+	/* Metadata directory files aren't counted in quota. */
+	if (xfs_is_metadir_inode(ip))
+		goto out_rele;
+
 	/* Count the file's blocks. */
 	if (XFS_IS_REALTIME_INODE(ip))
 		rtblks = qc_count_rtblocks(ip);
@@ -229,6 +233,7 @@ quotacheck_adjust(
 	if (proj_dquots)
 		qc_adjust(proj_dquots, ip->i_projid, blocks, rtblks);
 
+out_rele:
 	libxfs_irele(ip);
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 56/58] xfs_repair: allow sysadmins to add metadata directories
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (54 preceding siblings ...)
  2023-12-31 23:43   ` [PATCH 55/58] xfs_repair: do not count metadata directory files when doing quotacheck Darrick J. Wong
@ 2023-12-31 23:44   ` Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 57/58] mkfs.xfs: enable " Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 58/58] mkfs: add a utility to generate protofiles Darrick J. Wong
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow the sysadmin to use xfs_repair to upgrade an existing filesystem
to support metadata directories.  This will be needed to upgrade
filesystems to support realtime rmap and reflink.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 man/man8/xfs_admin.8 |    8 ++++++
 repair/dino_chunks.c |    6 ++++
 repair/dinode.c      |    5 +++-
 repair/globals.c     |    1 +
 repair/globals.h     |    1 +
 repair/phase2.c      |   69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 repair/phase4.c      |    5 +++-
 repair/protos.h      |    6 ++++
 repair/xfs_repair.c  |   11 ++++++++
 9 files changed, 109 insertions(+), 3 deletions(-)


diff --git a/man/man8/xfs_admin.8 b/man/man8/xfs_admin.8
index 28f28b6dd8f..68b4bf62427 100644
--- a/man/man8/xfs_admin.8
+++ b/man/man8/xfs_admin.8
@@ -184,6 +184,14 @@ This enables much stronger cross-referencing and online repairs of the
 directory tree.
 The filesystem cannot be downgraded after this feature is enabled.
 This upgrade can fail if the filesystem has less than 25% free space remaining.
+.TP 0.4i
+.B metadir
+Create a directory tree of metadata inodes instead of storing them all in the
+superblock.
+This is required for reverse mapping btrees and reflink support on the realtime
+device.
+The filesystem cannot be downgraded after this feature is enabled.
+This upgrade can fail if any AG has less than 5% free space remaining.
 This feature is not upstream yet.
 .RE
 .TP
diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c
index a7f9ea70ca7..d132556d9dc 100644
--- a/repair/dino_chunks.c
+++ b/repair/dino_chunks.c
@@ -960,7 +960,11 @@ process_inode_chunk(
 		}
 
 		if (status)  {
-			if (mp->m_sb.sb_rootino == ino) {
+			if (wipe_pre_metadir_file(ino)) {
+				if (!ino_discovery)
+					do_warn(
+	_("wiping pre-metadir metadata inode %"PRIu64".\n"), ino);
+			} else if (mp->m_sb.sb_rootino == ino) {
 				need_root_inode = 1;
 
 				if (!no_modify)  {
diff --git a/repair/dinode.c b/repair/dinode.c
index 5c9101fa0b0..5a57069c29e 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -2422,6 +2422,9 @@ process_dinode_int(
 	ASSERT(uncertain == 0 || verify_mode != 0);
 	ASSERT(ino_bpp != NULL || verify_mode != 0);
 
+	if (wipe_pre_metadir_file(lino))
+		goto clear_bad_out;
+
 	/*
 	 * This is the only valid point to check the CRC; after this we may have
 	 * made changes which invalidate it, and the CRC is only updated again
@@ -2631,7 +2634,7 @@ _("bad (negative) size %" PRId64 " on inode %" PRIu64 "\n"),
 		if (flags & XFS_DIFLAG_NEWRTBM) {
 			/* must be a rt bitmap inode */
 			if (lino != mp->m_sb.sb_rbmino) {
-				if (!uncertain) {
+				if (!uncertain && !add_metadir) {
 					do_warn(
 	_("inode %" PRIu64 " not rt bitmap\n"),
 						lino);
diff --git a/repair/globals.c b/repair/globals.c
index 22cb096c6a4..e3b2697127f 100644
--- a/repair/globals.c
+++ b/repair/globals.c
@@ -56,6 +56,7 @@ bool	add_finobt;		/* add free inode btrees */
 bool	add_reflink;		/* add reference count btrees */
 bool	add_rmapbt;		/* add reverse mapping btrees */
 bool	add_parent;		/* add parent pointers */
+bool	add_metadir;		/* add metadata directory tree */
 
 /* misc status variables */
 
diff --git a/repair/globals.h b/repair/globals.h
index c3709f11874..1c24e313b89 100644
--- a/repair/globals.h
+++ b/repair/globals.h
@@ -97,6 +97,7 @@ extern bool	add_finobt;		/* add free inode btrees */
 extern bool	add_reflink;		/* add reference count btrees */
 extern bool	add_rmapbt;		/* add reverse mapping btrees */
 extern bool	add_parent;		/* add parent pointers */
+extern bool	add_metadir;		/* add metadata directory tree */
 
 /* misc status variables */
 
diff --git a/repair/phase2.c b/repair/phase2.c
index 5a08cbc31c6..cc7ddad8240 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -14,6 +14,7 @@
 #include "incore.h"
 #include "progress.h"
 #include "scan.h"
+#include "quotacheck.h"
 
 /* workaround craziness in the xlog routines */
 int xlog_recover_do_trans(struct xlog *log, struct xlog_recover *t, int p)
@@ -287,6 +288,70 @@ set_parent(
 	return true;
 }
 
+static xfs_ino_t doomed_rbmino = NULLFSINO;
+static xfs_ino_t doomed_rsumino = NULLFSINO;
+static xfs_ino_t doomed_uquotino = NULLFSINO;
+static xfs_ino_t doomed_gquotino = NULLFSINO;
+static xfs_ino_t doomed_pquotino = NULLFSINO;
+
+bool
+wipe_pre_metadir_file(
+	xfs_ino_t	ino)
+{
+	if (ino == doomed_rbmino ||
+	    ino == doomed_rsumino ||
+	    ino == doomed_uquotino ||
+	    ino == doomed_gquotino ||
+	    ino == doomed_pquotino)
+		return true;
+	return false;
+}
+
+static bool
+set_metadir(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*new_sb)
+{
+	if (xfs_has_metadir(mp)) {
+		printf(_("Filesystem already supports metadata directory trees.\n"));
+		exit(0);
+	}
+
+	if (!xfs_has_crc(mp)) {
+		printf(
+	_("Metadata directory trees only supported on V5 filesystems.\n"));
+		exit(0);
+	}
+
+	printf(_("Adding metadata directory trees to filesystem.\n"));
+	new_sb->sb_features_incompat |= (XFS_SB_FEAT_INCOMPAT_METADIR |
+					 XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+
+	/* Blow out all the old metadata inodes; we'll rebuild in phase6. */
+	new_sb->sb_metadirino = new_sb->sb_rootino + 1;
+	doomed_rbmino = mp->m_sb.sb_rbmino;
+	doomed_rsumino = mp->m_sb.sb_rsumino;
+	doomed_uquotino = mp->m_sb.sb_uquotino;
+	doomed_gquotino = mp->m_sb.sb_gquotino;
+	doomed_pquotino = mp->m_sb.sb_pquotino;
+
+	new_sb->sb_rbmino = NULLFSINO;
+	new_sb->sb_rsumino = NULLFSINO;
+	new_sb->sb_uquotino = NULLFSINO;
+	new_sb->sb_gquotino = NULLFSINO;
+	new_sb->sb_pquotino = NULLFSINO;
+
+	/* Indicate that we need a rebuild. */
+	need_metadir_inode = 1;
+	need_rbmino = 1;
+	need_rsumino = 1;
+	have_uquotino = 0;
+	have_gquotino = 0;
+	have_pquotino = 0;
+	quotacheck_skip();
+	return true;
+}
+
 struct check_state {
 	struct xfs_sb		sb;
 	uint64_t		features;
@@ -475,6 +540,8 @@ need_check_fs_free_space(
 		return true;
 	if (xfs_has_parent(mp) && !(old->features & XFS_FEAT_PARENT))
 		return true;
+	if (xfs_has_metadir(mp) && !(old->features & XFS_FEAT_METADIR))
+		return true;
 	return false;
 }
 
@@ -558,6 +625,8 @@ upgrade_filesystem(
 		dirty |= set_rmapbt(mp, &new_sb);
 	if (add_parent)
 		dirty |= set_parent(mp, &new_sb);
+	if (add_metadir)
+		dirty |= set_metadir(mp, &new_sb);
 	if (!dirty)
 		return;
 
diff --git a/repair/phase4.c b/repair/phase4.c
index e8bd5982147..cfdea1460e5 100644
--- a/repair/phase4.c
+++ b/repair/phase4.c
@@ -272,7 +272,10 @@ phase4(xfs_mount_t *mp)
 	if (xfs_has_metadir(mp) &&
 	    (is_inode_free(irec, 1) || !inode_isadir(irec, 1))) {
 		need_metadir_inode = true;
-		if (no_modify)
+		if (add_metadir)
+			do_warn(
+	_("metadata directory root inode needs to be initialized\n"));
+		else if (no_modify)
 			do_warn(
 	_("metadata directory root inode would be lost\n"));
 		else
diff --git a/repair/protos.h b/repair/protos.h
index e2f39f1d6e8..ce171f3dd87 100644
--- a/repair/protos.h
+++ b/repair/protos.h
@@ -3,6 +3,8 @@
  * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
+#ifndef __XFS_REPAIR_PROTOS_H__
+#define __XFS_REPAIR_PROTOS_H__
 
 void	xfs_init(struct libxfs_init *args);
 
@@ -45,3 +47,7 @@ void	phase7(struct xfs_mount *, int);
 int	verify_set_agheader(struct xfs_mount *, struct xfs_buf *,
 		struct xfs_sb *, struct xfs_agf *, struct xfs_agi *,
 		xfs_agnumber_t);
+
+bool wipe_pre_metadir_file(xfs_ino_t ino);
+
+#endif  /* __XFS_REPAIR_PROTOS_H__ */
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index fe24f66ab98..bab8aa70f7a 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -73,6 +73,7 @@ enum c_opt_nums {
 	CONVERT_REFLINK,
 	CONVERT_RMAPBT,
 	CONVERT_PARENT,
+	CONVERT_METADIR,
 	C_MAX_OPTS,
 };
 
@@ -85,6 +86,7 @@ static char *c_opts[] = {
 	[CONVERT_REFLINK]	= "reflink",
 	[CONVERT_RMAPBT]	= "rmapbt",
 	[CONVERT_PARENT]	= "parent",
+	[CONVERT_METADIR]	= "metadir",
 	[C_MAX_OPTS]		= NULL,
 };
 
@@ -380,6 +382,15 @@ process_args(int argc, char **argv)
 		_("-c parent only supports upgrades\n"));
 					add_parent = true;
 					break;
+				case CONVERT_METADIR:
+					if (!val)
+						do_abort(
+		_("-c metadir requires a parameter\n"));
+					if (strtol(val, NULL, 0) != 1)
+						do_abort(
+		_("-c metadir only supports upgrades\n"));
+					add_metadir = true;
+					break;
 				default:
 					unknown('c', val);
 					break;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 57/58] mkfs.xfs: enable metadata directories
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (55 preceding siblings ...)
  2023-12-31 23:44   ` [PATCH 56/58] xfs_repair: allow sysadmins to add metadata directories Darrick J. Wong
@ 2023-12-31 23:44   ` Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 58/58] mkfs: add a utility to generate protofiles Darrick J. Wong
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable formatting filesystems with metadata directories.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h    |    3 ++-
 man/man8/mkfs.xfs.8.in |   11 +++++++++++
 mkfs/lts_4.19.conf     |    1 +
 mkfs/lts_5.10.conf     |    1 +
 mkfs/lts_5.15.conf     |    1 +
 mkfs/proto.c           |   45 ++++++++++++++++++++++++++++++++++++++++++++-
 mkfs/xfs_mkfs.c        |   24 +++++++++++++++++++++++-
 7 files changed, 83 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 7596b928698..0636ca97622 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -398,7 +398,8 @@ xfs_sb_has_ro_compat_feature(
 		 XFS_SB_FEAT_INCOMPAT_BIGTIME| \
 		 XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
 		 XFS_SB_FEAT_INCOMPAT_NREXT64| \
-		 XFS_SB_FEAT_INCOMPAT_PARENT)
+		 XFS_SB_FEAT_INCOMPAT_PARENT | \
+		 XFS_SB_FEAT_INCOMPAT_METADIR)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
diff --git a/man/man8/mkfs.xfs.8.in b/man/man8/mkfs.xfs.8.in
index 8060d342c2a..587754ff95b 100644
--- a/man/man8/mkfs.xfs.8.in
+++ b/man/man8/mkfs.xfs.8.in
@@ -271,6 +271,17 @@ option set.
 When the option
 .B \-m finobt=0
 is used, the inode btree counter feature is not supported and is disabled.
+.TP
+.BI metadir= value
+This option creates an internal directory tree to store filesystem metadata.
+.IP
+By default,
+.B mkfs.xfs
+will not enable this feature.
+If the option
+.B \-m crc=0
+is used, the metadata directory feature is not supported and is disabled.
+
 .TP
 .BI uuid= value
 Use the given value as the filesystem UUID for the newly created filesystem.
diff --git a/mkfs/lts_4.19.conf b/mkfs/lts_4.19.conf
index 8b2bdd7a347..6ed71c75347 100644
--- a/mkfs/lts_4.19.conf
+++ b/mkfs/lts_4.19.conf
@@ -6,6 +6,7 @@ bigtime=0
 crc=1
 finobt=1
 inobtcount=0
+metadir=0
 reflink=0
 rmapbt=0
 
diff --git a/mkfs/lts_5.10.conf b/mkfs/lts_5.10.conf
index 40189310af2..6eb25d7d4b2 100644
--- a/mkfs/lts_5.10.conf
+++ b/mkfs/lts_5.10.conf
@@ -6,6 +6,7 @@ bigtime=0
 crc=1
 finobt=1
 inobtcount=0
+metadir=0
 reflink=1
 rmapbt=0
 
diff --git a/mkfs/lts_5.15.conf b/mkfs/lts_5.15.conf
index aeecc035567..445719f6013 100644
--- a/mkfs/lts_5.15.conf
+++ b/mkfs/lts_5.15.conf
@@ -6,6 +6,7 @@ bigtime=1
 crc=1
 finobt=1
 inobtcount=1
+metadir=0
 reflink=1
 rmapbt=0
 
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 0103fe54a5d..33d454cffb2 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -17,6 +17,7 @@ static void fail(char *msg, int i);
 static struct xfs_trans * getres(struct xfs_mount *mp, uint blocks);
 static void rsvfile(xfs_mount_t *mp, xfs_inode_t *ip, long long len);
 static char *newregfile(char **pp, int *len);
+static int metadir_create(struct xfs_mount *mp);
 static void rtinit(xfs_mount_t *mp);
 static void rtfreesp_init(struct xfs_mount *mp);
 static long filesize(int fd);
@@ -705,8 +706,15 @@ parseproto(
 		 * RT initialization.  Do this here to ensure that
 		 * the RT inodes get placed after the root inode.
 		 */
-		if (isroot)
+		if (isroot) {
+			error = metadir_create(mp);
+			if (error)
+				fail(
+	_("Creation of the metadata directory inode failed"),
+					error);
+
 			rtinit(mp);
+		}
 		tp = NULL;
 		for (;;) {
 			name = getdirentname(pp);
@@ -744,6 +752,41 @@ parse_proto(
 	parseproto(mp, NULL, fsx, pp, NULL);
 }
 
+/* Create a new metadata root directory. */
+static int
+metadir_create(
+	struct xfs_mount	*mp)
+{
+	struct xfs_imeta_update	upd;
+	struct xfs_inode	*ip = NULL;
+	int			error;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	error = -libxfs_imeta_start_create(mp, &XFS_IMETA_METADIR, &upd);
+	if (error)
+		return error;
+
+	error = -libxfs_imeta_create(&upd, S_IFDIR, &ip);
+	if (error)
+		goto out_cancel;
+
+	error = -libxfs_imeta_commit_update(&upd);
+	if (error)
+		goto out_rele;
+
+	mp->m_metadirip = ip;
+	return 0;
+
+out_cancel:
+	libxfs_imeta_cancel_update(&upd, error);
+out_rele:
+	if (ip)
+		libxfs_irele(ip);
+	return error;
+}
+
 /* Create the realtime bitmap inode. */
 static void
 rtbitmap_create(
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 15be7e0fb60..de4d1b25c26 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -147,6 +147,7 @@ enum {
 	M_REFLINK,
 	M_INOBTCNT,
 	M_BIGTIME,
+	M_METADIR,
 	M_MAX_OPTS,
 };
 
@@ -801,6 +802,7 @@ static struct opt_params mopts = {
 		[M_REFLINK] = "reflink",
 		[M_INOBTCNT] = "inobtcount",
 		[M_BIGTIME] = "bigtime",
+		[M_METADIR] = "metadir",
 		[M_MAX_OPTS] = NULL,
 	},
 	.subopt_params = {
@@ -844,6 +846,12 @@ static struct opt_params mopts = {
 		  .maxval = 1,
 		  .defaultval = 1,
 		},
+		{ .index = M_METADIR,
+		  .conflicts = { { NULL, LAST_CONFLICT } },
+		  .minval = 0,
+		  .maxval = 1,
+		  .defaultval = 1,
+		},
 	},
 };
 
@@ -896,6 +904,7 @@ struct sb_feat_args {
 	bool	reflink;		/* XFS_SB_FEAT_RO_COMPAT_REFLINK */
 	bool	inobtcnt;		/* XFS_SB_FEAT_RO_COMPAT_INOBTCNT */
 	bool	bigtime;		/* XFS_SB_FEAT_INCOMPAT_BIGTIME */
+	bool	metadir;		/* XFS_SB_FEAT_INCOMPAT_METADIR */
 	bool	nodalign;
 	bool	nortalign;
 	bool	nrext64;
@@ -1028,7 +1037,7 @@ usage( void )
 /* blocksize */		[-b size=num]\n\
 /* config file */	[-c options=xxx]\n\
 /* metadata */		[-m crc=0|1,finobt=0|1,uuid=xxx,rmapbt=0|1,reflink=0|1,\n\
-			    inobtcount=0|1,bigtime=0|1]\n\
+			    inobtcount=0|1,bigtime=0|1,metadir=0|1]\n\
 /* data subvol */	[-d agcount=n,agsize=n,file,name=xxx,size=num,\n\
 			    (sunit=value,swidth=value|su=num,sw=num|noalign),\n\
 			    sectsize=num,concurrency=num]\n\
@@ -1845,6 +1854,9 @@ meta_opts_parser(
 	case M_BIGTIME:
 		cli->sb_feat.bigtime = getnum(value, opts, subopt);
 		break;
+	case M_METADIR:
+		cli->sb_feat.metadir = getnum(value, opts, subopt);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2384,6 +2396,13 @@ _("64 bit extent count not supported without CRC support\n"));
 			usage();
 		}
 		cli->sb_feat.nrext64 = false;
+
+		if (cli->sb_feat.metadir) {
+			fprintf(stderr,
+_("metadata directory not supported without CRC support\n"));
+			usage();
+		}
+		cli->sb_feat.metadir = false;
 	}
 
 	if (!cli->sb_feat.finobt) {
@@ -3508,6 +3527,8 @@ sb_set_features(
 		sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_PARENT;
 		sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
 	}
+	if (fp->metadir)
+		sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_METADIR;
 
 	/*
 	 * Sparse inode chunk support has two main inode alignment requirements.
@@ -3957,6 +3978,7 @@ finish_superblock_setup(
 	platform_uuid_copy(&sbp->sb_meta_uuid, &cfg->uuid);
 	sbp->sb_logstart = cfg->logstart;
 	sbp->sb_rootino = sbp->sb_rbmino = sbp->sb_rsumino = NULLFSINO;
+	sbp->sb_metadirino = NULLFSINO;
 	sbp->sb_agcount = (xfs_agnumber_t)cfg->agcount;
 	sbp->sb_rbmblocks = cfg->rtbmblocks;
 	sbp->sb_logblocks = (xfs_extlen_t)cfg->logblocks;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 58/58] mkfs: add a utility to generate protofiles
  2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
                     ` (56 preceding siblings ...)
  2023-12-31 23:44   ` [PATCH 57/58] mkfs.xfs: enable " Darrick J. Wong
@ 2023-12-31 23:44   ` Darrick J. Wong
  57 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new utility to generate mkfs protofiles from a directory tree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 man/man8/xfs_protofile.8 |   33 ++++++++++
 mkfs/Makefile            |   10 +++
 mkfs/xfs_protofile.in    |  152 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 man/man8/xfs_protofile.8
 create mode 100644 mkfs/xfs_protofile.in


diff --git a/man/man8/xfs_protofile.8 b/man/man8/xfs_protofile.8
new file mode 100644
index 00000000000..75090c138f3
--- /dev/null
+++ b/man/man8/xfs_protofile.8
@@ -0,0 +1,33 @@
+.TH xfs_protofile 8
+.SH NAME
+xfs_protofile \- create a protofile for use with mkfs.xfs
+.SH SYNOPSIS
+.B xfs_protofile
+.I path
+[
+.I paths...
+]
+.br
+.B xfs_protofile \-V
+.SH DESCRIPTION
+.B xfs_protofile
+walks a directory tree to generate a protofile.
+The protofile format is specified in the
+.BR mkfs.xfs (8)
+manual page and is derived from 3rd edition Unix.
+.SH OPTIONS
+.TP 1.0i
+.I path
+Create protofile directives to copy this path into the root directory.
+If the path is a directory, protofile directives will be emitted to
+replicate the entire subtree as a subtree of the root directory.
+If the path is a not a directory, protofile directives will be emitted
+to create the file as an entry in the root directory.
+The first path must resolve to a directory.
+
+.SH BUGS
+Filenames cannot contain spaces.
+Extended attributes are not copied into the filesystem.
+
+.PD
+.RE
diff --git a/mkfs/Makefile b/mkfs/Makefile
index a0c168e3815..e42a5618302 100644
--- a/mkfs/Makefile
+++ b/mkfs/Makefile
@@ -6,6 +6,7 @@ TOPDIR = ..
 include $(TOPDIR)/include/builddefs
 
 LTCOMMAND = mkfs.xfs
+XFS_PROTOFILE = xfs_protofile
 
 HFILES =
 CFILES = proto.c xfs_mkfs.c
@@ -22,17 +23,24 @@ LLDLIBS += $(LIBXFS) $(LIBXCMD) $(LIBFROG) $(LIBRT) $(LIBBLKID) \
 	$(LIBUUID) $(LIBINIH) $(LIBURCU) $(LIBPTHREAD)
 LTDEPENDENCIES += $(LIBXFS) $(LIBXCMD) $(LIBFROG)
 LLDFLAGS = -static-libtool-libs
+DIRT = $(XFS_PROTOFILE)
 
-default: depend $(LTCOMMAND) $(CFGFILES)
+default: depend $(LTCOMMAND) $(CFGFILES) $(XFS_PROTOFILE)
 
 include $(BUILDRULES)
 
 install: default
 	$(INSTALL) -m 755 -d $(PKG_ROOT_SBIN_DIR)
 	$(LTINSTALL) -m 755 $(LTCOMMAND) $(PKG_ROOT_SBIN_DIR)
+	$(INSTALL) -m 755 $(XFS_PROTOFILE) $(PKG_ROOT_SBIN_DIR)
 	$(INSTALL) -m 755 -d $(MKFS_CFG_DIR)
 	$(INSTALL) -m 644 $(CFGFILES) $(MKFS_CFG_DIR)
 
 install-dev:
 
+$(XFS_PROTOFILE): $(XFS_PROTOFILE).in
+	@echo "    [SED]    $@"
+	$(Q)$(SED) -e "s|@pkg_version@|$(PKG_VERSION)|g" < $< > $@
+	$(Q)chmod a+x $@
+
 -include .dep
diff --git a/mkfs/xfs_protofile.in b/mkfs/xfs_protofile.in
new file mode 100644
index 00000000000..9aee4336888
--- /dev/null
+++ b/mkfs/xfs_protofile.in
@@ -0,0 +1,152 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2018-2024 Oracle.  All rights reserved.
+#
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Walk a filesystem tree to generate a protofile for mkfs.
+
+import os
+import argparse
+import sys
+import stat
+
+def emit_proto_header():
+	'''Emit the protofile header.'''
+	print('/')
+	print('0 0')
+
+def stat_to_str(statbuf):
+	'''Convert a stat buffer to a proto string.'''
+
+	if stat.S_ISREG(statbuf.st_mode):
+		type = '-'
+	elif stat.S_ISCHR(statbuf.st_mode):
+		type = 'c'
+	elif stat.S_ISBLK(statbuf.st_mode):
+		type = 'b'
+	elif stat.S_ISFIFO(statbuf.st_mode):
+		type = 'p'
+	elif stat.S_ISDIR(statbuf.st_mode):
+		type = 'd'
+	elif stat.S_ISLNK(statbuf.st_mode):
+		type = 'l'
+
+	if statbuf.st_mode & stat.S_ISUID:
+		suid = 'u'
+	else:
+		suid = '-'
+
+	if statbuf.st_mode & stat.S_ISGID:
+		sgid = 'g'
+	else:
+		sgid = '-'
+
+	perms = stat.S_IMODE(statbuf.st_mode)
+
+	return '%s%s%s%o %d %d' % (type, suid, sgid, perms, statbuf.st_uid, \
+			statbuf.st_gid)
+
+def stat_to_extra(statbuf, fullpath):
+	'''Compute the extras column for a protofile.'''
+
+	if stat.S_ISREG(statbuf.st_mode):
+		return ' %s' % fullpath
+	elif stat.S_ISCHR(statbuf.st_mode) or stat.S_ISBLK(statbuf.st_mode):
+		return ' %d %d' % (statbuf.st_rdev, statbuf.st_rdev)
+	elif stat.S_ISLNK(statbuf.st_mode):
+		return ' %s' % os.readlink(fullpath)
+	return ''
+
+def max_fname_len(s1):
+	'''Return the length of the longest string in s1.'''
+	ret = 0
+	for s in s1:
+		if len(s) > ret:
+			ret = len(s)
+	return ret
+
+def walk_tree(path, depth):
+	'''Walk the directory tree rooted by path.'''
+	dirs = []
+	files = []
+
+	for fname in os.listdir(path):
+		fullpath = os.path.join(path, fname)
+		sb = os.lstat(fullpath)
+
+		if stat.S_ISDIR(sb.st_mode):
+			dirs.append(fname)
+			continue
+		elif stat.S_ISSOCK(sb.st_mode):
+			continue
+		else:
+			files.append(fname)
+
+	for fname in files:
+		if ' ' in fname:
+			raise ValueError( \
+				f'{fname}: Spaces not allowed in file names.')
+	for fname in dirs:
+		if ' ' in fname:
+			raise Exception( \
+				f'{fname}: Spaces not allowed in file names.')
+
+	fname_width = max_fname_len(files)
+	for fname in files:
+		fullpath = os.path.join(path, fname)
+		sb = os.lstat(fullpath)
+		extra = stat_to_extra(sb, fullpath)
+		print('%*s%-*s %s%s' % (depth, ' ', fname_width, fname, \
+				stat_to_str(sb), extra))
+
+	for fname in dirs:
+		fullpath = os.path.join(path, fname)
+		sb = os.lstat(fullpath)
+		extra = stat_to_extra(sb, fullpath)
+		print('%*s%s %s' % (depth, ' ', fname, \
+				stat_to_str(sb)))
+		walk_tree(fullpath, depth + 1)
+
+	if depth > 1:
+		print('%*s$' % (depth - 1, ' '))
+
+def main():
+	parser = argparse.ArgumentParser( \
+			description = "Generate mkfs.xfs protofile for a directory tree.")
+	parser.add_argument('paths', metavar = 'paths', type = str, \
+			nargs = '*', help = 'Directory paths to walk.')
+	parser.add_argument("-V", help = "Report version and exit.", \
+			action = "store_true")
+	args = parser.parse_args()
+
+	if args.V:
+		print("xfs_protofile version @pkg_version@")
+		sys.exit(0)
+
+	emit_proto_header()
+	if len(args.paths) == 0:
+		print('d--755 0 0')
+		print('$')
+	else:
+		# Copy the first argument's stat to the rootdir
+		statbuf = os.stat(args.paths[0])
+		if not stat.S_ISDIR(statbuf.st_mode):
+			raise NotADirectoryError(path)
+		print(stat_to_str(statbuf))
+
+		# All files under each path go in the root dir, recursively
+		for path in args.paths:
+			print(': Descending path %s' % path)
+			try:
+				walk_tree(path, 1)
+			except Exception as e:
+				print(e, file = sys.stderr)
+				return 1
+
+		print('$')
+	return 0
+
+if __name__ == '__main__':
+	sys.exit(main())


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/8] xfs_db: support passing the realtime device to the debugger
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
@ 2023-12-31 23:44   ` Darrick J. Wong
  2023-12-31 23:45   ` [PATCH 2/8] xfs_db: report the realtime device when associated with each io cursor Darrick J. Wong
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new -R flag so that sysadmins can pass the realtime device to
the xfs debugger.  Since we can now have superblocks on the rt device,
we need this to be able to inspect/dump/etc.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/init.c         |    7 +++++--
 db/io.c           |   28 +++++++++++++++++++++++-----
 db/io.h           |    2 ++
 db/xfs_admin.sh   |    4 +++-
 man/man8/xfs_db.8 |   13 +++++++++++++
 5 files changed, 46 insertions(+), 8 deletions(-)


diff --git a/db/init.c b/db/init.c
index cea25ae52bd..17fb094296c 100644
--- a/db/init.c
+++ b/db/init.c
@@ -33,7 +33,7 @@ static void
 usage(void)
 {
 	fprintf(stderr, _(
-		"Usage: %s [-ifFrxV] [-p prog] [-l logdev] [-c cmd]... device\n"
+		"Usage: %s [-ifFrxV] [-p prog] [-l logdev] [-R rtdev] [-c cmd]... device\n"
 		), progname);
 	exit(1);
 }
@@ -54,7 +54,7 @@ init(
 	textdomain(PACKAGE);
 
 	progname = basename(argv[0]);
-	while ((c = getopt(argc, argv, "c:fFip:rxVl:")) != EOF) {
+	while ((c = getopt(argc, argv, "c:fFip:rR:xVl:")) != EOF) {
 		switch (c) {
 		case 'c':
 			cmdline = xrealloc(cmdline, (ncmdline+1)*sizeof(char*));
@@ -75,6 +75,9 @@ init(
 		case 'r':
 			x.flags = LIBXFS_ISREADONLY;
 			break;
+		case 'R':
+			x.rt.name = optarg;
+			break;
 		case 'l':
 			x.log.name = optarg;
 			break;
diff --git a/db/io.c b/db/io.c
index 590dd1f82f7..61befb43437 100644
--- a/db/io.c
+++ b/db/io.c
@@ -458,6 +458,7 @@ ring_add(void)
 static void
 write_cur_buf(void)
 {
+	struct xfs_buftarg	*btp = iocur_top->bp->b_target;
 	int ret;
 
 	ret = -libxfs_bwrite(iocur_top->bp);
@@ -465,7 +466,7 @@ write_cur_buf(void)
 		dbprintf(_("write error: %s\n"), strerror(ret));
 
 	/* re-read buffer from disk */
-	ret = -libxfs_readbufr(mp->m_ddev_targp, iocur_top->bb, iocur_top->bp,
+	ret = -libxfs_readbufr(btp, iocur_top->bb, iocur_top->bp,
 			      iocur_top->blen, 0);
 	if (ret != 0)
 		dbprintf(_("read error: %s\n"), strerror(ret));
@@ -474,6 +475,7 @@ write_cur_buf(void)
 static void
 write_cur_bbs(void)
 {
+	struct xfs_buftarg	*btp = iocur_top->bp->b_target;
 	int ret;
 
 	ret = -libxfs_bwrite(iocur_top->bp);
@@ -482,7 +484,7 @@ write_cur_bbs(void)
 
 
 	/* re-read buffer from disk */
-	ret = -libxfs_readbufr_map(mp->m_ddev_targp, iocur_top->bp, 0);
+	ret = -libxfs_readbufr_map(btp, iocur_top->bp, 0);
 	if (ret != 0)
 		dbprintf(_("read error: %s\n"), strerror(ret));
 }
@@ -541,9 +543,9 @@ static void
 __set_cur(
 	struct xfs_buftarg	*btargp,
 	const typ_t		*type,
-	xfs_daddr_t		 blknum,
-	int			 len,
-	int			 ring_flag,
+	xfs_daddr_t		blknum,
+	int			len,
+	int			ring_flag,
 	bbmap_t			*bbmap)
 {
 	struct xfs_buf		*bp;
@@ -647,6 +649,22 @@ set_log_cur(
 	__set_cur(mp->m_logdev_targp, type, blknum, len, ring_flag, bbmap);
 }
 
+int
+set_rt_cur(
+	const typ_t	*type,
+	xfs_daddr_t	blknum,
+	int		len,
+	int		ring_flag,
+	bbmap_t		*bbmap)
+{
+	if (!mp->m_rtdev_targp->bt_bdev) {
+		printf(_("realtime device not loaded, use -R.\n"));
+		return ENODEV;
+	}
+
+	__set_cur(mp->m_rtdev_targp, type, blknum, len, ring_flag, bbmap);
+	return 0;
+}
 
 void
 set_iocur_type(
diff --git a/db/io.h b/db/io.h
index f48b67b47a2..bb5065f06c0 100644
--- a/db/io.h
+++ b/db/io.h
@@ -51,6 +51,8 @@ extern void	set_cur(const struct typ *type, xfs_daddr_t blknum,
 			int len, int ring_add, bbmap_t *bbmap);
 extern void	set_log_cur(const struct typ *type, xfs_daddr_t blknum,
 			int len, int ring_add, bbmap_t *bbmap);
+extern int	set_rt_cur(const struct typ *type, xfs_daddr_t blknum,
+			int len, int ring_add, bbmap_t *bbmap);
 extern void     ring_add(void);
 extern void	set_iocur_type(const struct typ *type);
 extern void	xfs_dummy_verify(struct xfs_buf *bp);
diff --git a/db/xfs_admin.sh b/db/xfs_admin.sh
index cc650c42550..52a658ba4a5 100755
--- a/db/xfs_admin.sh
+++ b/db/xfs_admin.sh
@@ -8,6 +8,7 @@ status=0
 require_offline=""
 require_online=""
 DB_OPTS=""
+DB_DEV_OPTS=""
 REPAIR_OPTS=""
 IO_OPTS=""
 REPAIR_DEV_OPTS=""
@@ -42,6 +43,7 @@ do
 		require_offline=1
 		;;
 	r)	REPAIR_DEV_OPTS=" -r '$OPTARG'"
+		DB_DEV_OPTS=" -R '$OPTARG'"
 		require_offline=1
 		;;
 	u)	DB_OPTS=$DB_OPTS" -r -c uuid"
@@ -89,7 +91,7 @@ case $# in
 
 		if [ -n "$DB_OPTS" ]
 		then
-			eval xfs_db -x -p xfs_admin $LOG_OPTS $DB_OPTS "$1"
+			eval xfs_db -x -p xfs_admin $LOG_OPTS $DB_DEV_OPTS $DB_OPTS "$1"
 			status=$?
 		fi
 		if [ -n "$REPAIR_OPTS" ]
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index b1712a92f76..f5e3064d923 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -14,6 +14,9 @@ xfs_db \- debug an XFS filesystem
 .B \-l
 .I logdev
 ] [
+.B \-R
+.I rtdev
+] [
 .B \-p
 .I progname
 ]
@@ -80,6 +83,16 @@ Set the program name to
 for prompts and some error messages, the default value is
 .BR xfs_db .
 .TP
+.B -R
+.I rtdev
+Specifies the device where the realtime data resides.
+This is only relevant for filesystems that have a realtime section.
+See the
+.BR mkfs.xfs "(8) " \-r
+option, and refer to
+.BR xfs (5)
+for a detailed description of the XFS realtime section.
+.TP
 .B -r
 Open
 .I device


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/8] xfs_db: report the realtime device when associated with each io cursor
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 1/8] xfs_db: support passing the realtime device to the debugger Darrick J. Wong
@ 2023-12-31 23:45   ` Darrick J. Wong
  2023-12-31 23:45   ` [PATCH 3/8] xfs_db: make the daddr command target the realtime device Darrick J. Wong
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:45 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

When db is reporting on an io cursor and the cursor points to the
realtime device, print that fact.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/block.c |    2 ++
 db/io.c    |   11 +++++++++++
 db/io.h    |    1 +
 3 files changed, 14 insertions(+)


diff --git a/db/block.c b/db/block.c
index d730c779671..b2b5edf9385 100644
--- a/db/block.c
+++ b/db/block.c
@@ -132,6 +132,8 @@ daddr_f(
 			dbprintf(_("datadev daddr is %lld\n"), daddr);
 		else if (iocur_is_extlogdev(iocur_top))
 			dbprintf(_("logdev daddr is %lld\n"), daddr);
+		else if (iocur_is_rtdev(iocur_top))
+			dbprintf(_("rtdev daddr is %lld\n"), daddr);
 		else
 			dbprintf(_("current daddr is %lld\n"), daddr);
 
diff --git a/db/io.c b/db/io.c
index 61befb43437..580d3401586 100644
--- a/db/io.c
+++ b/db/io.c
@@ -159,6 +159,15 @@ iocur_is_extlogdev(const struct iocur *ioc)
 	return bp->b_target == bp->b_mount->m_logdev_targp;
 }
 
+bool
+iocur_is_rtdev(const struct iocur *ioc)
+{
+	if (!ioc->bp)
+		return false;
+
+	return ioc->bp->b_target == ioc->bp->b_mount->m_rtdev_targp;
+}
+
 void
 print_iocur(
 	char	*tag,
@@ -171,6 +180,8 @@ print_iocur(
 		block_unit = "fsbno";
 	else if (iocur_is_extlogdev(ioc))
 		block_unit = "logbno";
+	else if (iocur_is_rtdev(ioc))
+		block_unit = "rtbno";
 
 	dbprintf("%s\n", tag);
 	dbprintf(_("\tbyte offset %lld, length %d\n"), ioc->off, ioc->len);
diff --git a/db/io.h b/db/io.h
index bb5065f06c0..8eab4cd9c9a 100644
--- a/db/io.h
+++ b/db/io.h
@@ -60,6 +60,7 @@ extern void	xfs_verify_recalc_crc(struct xfs_buf *bp);
 
 bool iocur_is_ddev(const struct iocur *ioc);
 bool iocur_is_extlogdev(const struct iocur *ioc);
+bool iocur_is_rtdev(const struct iocur *ioc);
 
 /*
  * returns -1 for unchecked, 0 for bad and 1 for good


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 3/8] xfs_db: make the daddr command target the realtime device
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
  2023-12-31 23:44   ` [PATCH 1/8] xfs_db: support passing the realtime device to the debugger Darrick J. Wong
  2023-12-31 23:45   ` [PATCH 2/8] xfs_db: report the realtime device when associated with each io cursor Darrick J. Wong
@ 2023-12-31 23:45   ` Darrick J. Wong
  2023-12-31 23:45   ` [PATCH 4/8] xfs_db: access realtime file blocks Darrick J. Wong
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:45 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make it so that users can issue the command "daddr -r XXX" to select
disk block XXX on the realtime device.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/block.c        |   43 ++++++++++++++++++++++++++++++++++++++-----
 man/man8/xfs_db.8 |    6 +++++-
 2 files changed, 43 insertions(+), 6 deletions(-)


diff --git a/db/block.c b/db/block.c
index b2b5edf9385..d064fbed5aa 100644
--- a/db/block.c
+++ b/db/block.c
@@ -31,7 +31,7 @@ static const cmdinfo_t	ablock_cmd =
 	{ "ablock", NULL, ablock_f, 1, 1, 1, N_("filoff"),
 	  N_("set address to file offset (attr fork)"), ablock_help };
 static const cmdinfo_t	daddr_cmd =
-	{ "daddr", NULL, daddr_f, 0, 1, 1, N_("[d]"),
+	{ "daddr", NULL, daddr_f, 0, -1, 1, N_("[d]"),
 	  N_("set address to daddr value"), daddr_help };
 static const cmdinfo_t	dblock_cmd =
 	{ "dblock", NULL, dblock_f, 1, 1, 1, N_("filoff"),
@@ -117,6 +117,11 @@ daddr_help(void)
 ));
 }
 
+enum daddr_target {
+	DT_DATA,
+	DT_RT,
+};
+
 static int
 daddr_f(
 	int		argc,
@@ -124,8 +129,23 @@ daddr_f(
 {
 	int64_t		d;
 	char		*p;
+	int		c;
+	xfs_rfsblock_t	max_daddrs = mp->m_sb.sb_dblocks;
+	enum daddr_target tgt = DT_DATA;
 
-	if (argc == 1) {
+	while ((c = getopt(argc, argv, "r")) != -1) {
+		switch (c) {
+		case 'r':
+			tgt = DT_RT;
+			max_daddrs = mp->m_sb.sb_rblocks;
+			break;
+		default:
+			daddr_help();
+			return 0;
+		}
+	}
+
+	if (optind == argc) {
 		xfs_daddr_t	daddr = iocur_top->off >> BBSHIFT;
 
 		if (iocur_is_ddev(iocur_top))
@@ -139,14 +159,27 @@ daddr_f(
 
 		return 0;
 	}
-	d = (int64_t)strtoull(argv[1], &p, 0);
+
+	if (optind != argc - 1) {
+		daddr_help();
+		return 0;
+	}
+
+	d = (int64_t)strtoull(argv[optind], &p, 0);
 	if (*p != '\0' ||
-	    d >= mp->m_sb.sb_dblocks << (mp->m_sb.sb_blocklog - BBSHIFT)) {
+	    d >= max_daddrs << (mp->m_sb.sb_blocklog - BBSHIFT)) {
 		dbprintf(_("bad daddr %s\n"), argv[1]);
 		return 0;
 	}
 	ASSERT(typtab[TYP_DATA].typnm == TYP_DATA);
-	set_cur(&typtab[TYP_DATA], d, 1, DB_RING_ADD, NULL);
+	switch (tgt) {
+	case DT_DATA:
+		set_cur(&typtab[TYP_DATA], d, 1, DB_RING_ADD, NULL);
+		break;
+	case DT_RT:
+		set_rt_cur(&typtab[TYP_DATA], d, 1, DB_RING_ADD, NULL);
+		break;
+	}
 	return 0;
 }
 
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index f5e3064d923..dfd44900d15 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -590,7 +590,7 @@ Recalculate the current structure's correct CRC value, and write it to disk.
 Validate and display the current value and state of the structure's CRC.
 .RE
 .TP
-.BI "daddr [" d ]
+.BI "daddr [" -r "] [" d ]
 Set current address to the daddr (512 byte block) given by
 .IR d .
 If no value for
@@ -599,6 +599,10 @@ is given, the current address is printed, expressed as a daddr.
 The type is set to
 .B data
 (uninterpreted).
+
+If an address and the
+.B \-r
+option are specified, the current address is set to the realtime device.
 .TP
 .BI dblock " filoff"
 Set current address to the offset


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 4/8] xfs_db: access realtime file blocks
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                     ` (2 preceding siblings ...)
  2023-12-31 23:45   ` [PATCH 3/8] xfs_db: make the daddr command target the realtime device Darrick J. Wong
@ 2023-12-31 23:45   ` Darrick J. Wong
  2023-12-31 23:46   ` [PATCH 5/8] xfs_db: access arbitrary realtime blocks and extents Darrick J. Wong
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:45 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we have the ability to point the io cursor at the realtime
device, let's make it so that the "dblock" command can walk the contents
of realtime files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/block.c |   17 +++++++++++++++--
 db/faddr.c |    4 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)


diff --git a/db/block.c b/db/block.c
index d064fbed5aa..ae8744685b0 100644
--- a/db/block.c
+++ b/db/block.c
@@ -195,6 +195,13 @@ dblock_help(void)
 ));
 }
 
+static inline bool
+is_rtfile(
+	struct xfs_dinode	*dip)
+{
+	return dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
+}
+
 static int
 dblock_f(
 	int		argc,
@@ -234,8 +241,14 @@ dblock_f(
 	ASSERT(typtab[type].typnm == type);
 	if (nex > 1)
 		make_bbmap(&bbmap, nex, bmp);
-	set_cur(&typtab[type], (int64_t)XFS_FSB_TO_DADDR(mp, dfsbno),
-		nb * blkbb, DB_RING_ADD, nex > 1 ? &bbmap : NULL);
+	if (is_rtfile(iocur_top->data))
+		set_rt_cur(&typtab[type], (int64_t)XFS_FSB_TO_DADDR(mp, dfsbno),
+				nb * blkbb, DB_RING_ADD,
+				nex > 1 ? &bbmap : NULL);
+	else
+		set_cur(&typtab[type], (int64_t)XFS_FSB_TO_DADDR(mp, dfsbno),
+				nb * blkbb, DB_RING_ADD,
+				nex > 1 ? &bbmap : NULL);
 	free(bmp);
 	return 0;
 }
diff --git a/db/faddr.c b/db/faddr.c
index ec4aae68bb5..fd65b86b5e9 100644
--- a/db/faddr.c
+++ b/db/faddr.c
@@ -323,7 +323,9 @@ fa_drtbno(
 		dbprintf(_("null block number, cannot set new addr\n"));
 		return;
 	}
-	/* need set_cur to understand rt subvolume */
+
+	set_rt_cur(&typtab[next], (int64_t)XFS_FSB_TO_BB(mp, bno), blkbb,
+			DB_RING_ADD, NULL);
 }
 
 /*ARGSUSED*/


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 5/8] xfs_db: access arbitrary realtime blocks and extents
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                     ` (3 preceding siblings ...)
  2023-12-31 23:45   ` [PATCH 4/8] xfs_db: access realtime file blocks Darrick J. Wong
@ 2023-12-31 23:46   ` Darrick J. Wong
  2023-12-31 23:46   ` [PATCH 6/8] xfs_db: enable conversion of rt space units Darrick J. Wong
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:46 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add two commands to xfs_db so that we can point ourselves at any
arbitrary realtime block or extent.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/block.c        |  105 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man8/xfs_db.8 |   20 ++++++++++
 2 files changed, 125 insertions(+)


diff --git a/db/block.c b/db/block.c
index ae8744685b0..fff7f7d6f52 100644
--- a/db/block.c
+++ b/db/block.c
@@ -25,6 +25,10 @@ static int	dblock_f(int argc, char **argv);
 static void     dblock_help(void);
 static int	fsblock_f(int argc, char **argv);
 static void     fsblock_help(void);
+static int	rtblock_f(int argc, char **argv);
+static void	rtblock_help(void);
+static int	rtextent_f(int argc, char **argv);
+static void	rtextent_help(void);
 static void	print_rawdata(void *data, int len);
 
 static const cmdinfo_t	ablock_cmd =
@@ -39,6 +43,12 @@ static const cmdinfo_t	dblock_cmd =
 static const cmdinfo_t	fsblock_cmd =
 	{ "fsblock", "fsb", fsblock_f, 0, 1, 1, N_("[fsb]"),
 	  N_("set address to fsblock value"), fsblock_help };
+static const cmdinfo_t	rtblock_cmd =
+	{ "rtblock", "rtbno", rtblock_f, 0, 1, 1, N_("[rtbno]"),
+	  N_("set address to rtblock value"), rtblock_help };
+static const cmdinfo_t	rtextent_cmd =
+	{ "rtextent", "rtx", rtextent_f, 0, 1, 1, N_("[rtxno]"),
+	  N_("set address to rtextent value"), rtextent_help };
 
 static void
 ablock_help(void)
@@ -104,6 +114,8 @@ block_init(void)
 	add_command(&daddr_cmd);
 	add_command(&dblock_cmd);
 	add_command(&fsblock_cmd);
+	add_command(&rtblock_cmd);
+	add_command(&rtextent_cmd);
 }
 
 static void
@@ -301,6 +313,99 @@ fsblock_f(
 	return 0;
 }
 
+static void
+rtblock_help(void)
+{
+	dbprintf(_(
+"\n Example:\n"
+"\n"
+" 'rtblock 1023' - sets the file position to the 1023rd block on the realtime\n"
+" volume. The filesystem block size is specified in the superblock and set\n"
+" during mkfs time.\n\n"
+));
+}
+
+static int
+rtblock_f(
+	int		argc,
+	char		**argv)
+{
+	xfs_rtblock_t	rtbno;
+	char		*p;
+
+	if (argc == 1) {
+		if (!iocur_is_rtdev(iocur_top)) {
+			dbprintf(_("cursor does not point to rt device\n"));
+			return 0;
+		}
+		dbprintf(_("current rtblock is %lld\n"),
+			XFS_BB_TO_FSB(mp, iocur_top->off >> BBSHIFT));
+		return 0;
+	}
+	rtbno = strtoull(argv[1], &p, 0);
+	if (*p != '\0') {
+		dbprintf(_("bad rtblock %s\n"), argv[1]);
+		return 0;
+	}
+	if (rtbno >= mp->m_sb.sb_rblocks) {
+		dbprintf(_("bad rtblock %s\n"), argv[1]);
+		return 0;
+	}
+	ASSERT(typtab[TYP_DATA].typnm == TYP_DATA);
+	set_rt_cur(&typtab[TYP_DATA], XFS_FSB_TO_BB(mp, rtbno), blkbb,
+			DB_RING_ADD, NULL);
+	return 0;
+}
+
+static void
+rtextent_help(void)
+{
+	dbprintf(_(
+"\n Example:\n"
+"\n"
+" 'rtextent 10' - sets the file position to the 10th extent on the realtime\n"
+" volume. The realtime extent size is specified in the superblock and set\n"
+" during mkfs or growfs time.\n\n"
+));
+}
+
+static int
+rtextent_f(
+	int		argc,
+	char		**argv)
+{
+	xfs_rtblock_t	rtbno;
+	xfs_rtxnum_t	rtx;
+	char		*p;
+
+	if (argc == 1) {
+		if (!iocur_is_rtdev(iocur_top)) {
+			dbprintf(_("cursor does not point to rt device\n"));
+			return 0;
+		}
+
+		rtbno = XFS_BB_TO_FSB(mp, iocur_top->off >> BBSHIFT);
+		dbprintf(_("current rtextent is %lld\n"),
+				xfs_rtb_to_rtx(mp, rtbno));
+		return 0;
+	}
+	rtx = strtoull(argv[1], &p, 0);
+	if (*p != '\0') {
+		dbprintf(_("bad rtextent %s\n"), argv[1]);
+		return 0;
+	}
+	if (rtx >= mp->m_sb.sb_rextents) {
+		dbprintf(_("bad rtextent %s\n"), argv[1]);
+		return 0;
+	}
+
+	rtbno = xfs_rtx_to_rtb(mp, rtx);
+	ASSERT(typtab[TYP_DATA].typnm == TYP_DATA);
+	set_rt_cur(&typtab[TYP_DATA], XFS_FSB_TO_BB(mp, rtbno),
+			mp->m_sb.sb_rextsize * blkbb, DB_RING_ADD, NULL);
+	return 0;
+}
+
 void
 print_block(
 	const field_t	*fields,
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index dfd44900d15..6fa72acb313 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1026,6 +1026,26 @@ command.
 Exit
 .BR xfs_db .
 .TP
+.BI "rtblock [" rtbno ]
+Set current address to the rtblock value given by
+.IR rtbno .
+If no value for
+.I rtbno
+is given the current address is printed, expressed as an rtbno.
+The type is set to
+.B data
+(uninterpreted).
+.TP
+.BI "rtextent [" rtxno ]
+Set current address to the rtextent value given by
+.IR rtextent .
+If no value for
+.I rtextent
+is given the current address is printed, expressed as an rtextent.
+The type is set to
+.B data
+(uninterpreted).
+.TP
 .BI "ring [" index ]
 Show position ring (if no
 .I index


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/8] xfs_db: enable conversion of rt space units
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-31 23:46   ` [PATCH 5/8] xfs_db: access arbitrary realtime blocks and extents Darrick J. Wong
@ 2023-12-31 23:46   ` Darrick J. Wong
  2023-12-31 23:46   ` [PATCH 7/8] xfs_db: convert rtbitmap geometry Darrick J. Wong
  2023-12-31 23:46   ` [PATCH 8/8] xfs_db: convert rtsummary geometry Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:46 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the xfs_db convert function about rt extents, rt block numbers,
and how to compute offsets within the rt bitmap and summary files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c      |  232 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 man/man8/xfs_db.8 |   51 ++++++++++++
 2 files changed, 260 insertions(+), 23 deletions(-)


diff --git a/db/convert.c b/db/convert.c
index e1466057031..811bac00f71 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -26,6 +26,10 @@
 	 agino_to_bytes(XFS_INO_TO_AGINO(mp, (x))))
 #define	inoidx_to_bytes(x)	\
 	((uint64_t)(x) << mp->m_sb.sb_inodelog)
+#define rtblock_to_bytes(x)	\
+	((uint64_t)(x) << mp->m_sb.sb_blocklog)
+#define rtx_to_rtblock(x)	\
+	((uint64_t)(x) * mp->m_sb.sb_rextsize)
 
 typedef enum {
 	CT_NONE = -1,
@@ -40,11 +44,12 @@ typedef enum {
 	CT_INO,			/* xfs_ino_t */
 	CT_INOIDX,		/* index of inode in fsblock */
 	CT_INOOFF,		/* byte offset in inode */
+	CT_RTBLOCK,		/* realtime block */
+	CT_RTX,			/* realtime extent */
 	NCTS
 } ctype_t;
 
 typedef struct ctydesc {
-	ctype_t		ctype;
 	int		allowed;
 	const char	**names;
 } ctydesc_t;
@@ -61,12 +66,16 @@ typedef union {
 	xfs_ino_t	ino;
 	int		inoidx;
 	int		inooff;
+	xfs_rtblock_t	rtblock;
+	xfs_rtblock_t	rtx;
 } cval_t;
 
 static uint64_t		bytevalue(ctype_t ctype, cval_t *val);
+static int		rtconvert_f(int argc, char **argv);
 static int		convert_f(int argc, char **argv);
 static int		getvalue(char *s, ctype_t ctype, cval_t *val);
-static ctype_t		lookupcty(char *ctyname);
+static ctype_t		lookupcty(const struct ctydesc *descs,
+				  const char *ctyname);
 
 static const char	*agblock_names[] = { "agblock", "agbno", NULL };
 static const char	*agino_names[] = { "agino", "aginode", NULL };
@@ -74,6 +83,8 @@ static const char	*agnumber_names[] = { "agnumber", "agno", NULL };
 static const char	*bboff_names[] = { "bboff", "daddroff", NULL };
 static const char	*blkoff_names[] = { "blkoff", "fsboff", "agboff",
 					    NULL };
+static const char	*rtblkoff_names[] = { "blkoff", "rtboff",
+					    NULL };
 static const char	*byte_names[] = { "byte", "fsbyte", NULL };
 static const char	*daddr_names[] = { "daddr", "bb", NULL };
 static const char	*fsblock_names[] = { "fsblock", "fsb", "fsbno", NULL };
@@ -81,30 +92,91 @@ static const char	*ino_names[] = { "ino", "inode", NULL };
 static const char	*inoidx_names[] = { "inoidx", "offset", NULL };
 static const char	*inooff_names[] = { "inooff", "inodeoff", NULL };
 
+static const char	*rtblock_names[] = { "rtblock", "rtb", "rtbno", NULL };
+static const char	*rtx_names[] = { "rtx", "rtextent", NULL };
+
 static const ctydesc_t	ctydescs[NCTS] = {
-	{ CT_AGBLOCK, M(AGNUMBER)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
-	  agblock_names },
-	{ CT_AGINO, M(AGNUMBER)|M(INOOFF), agino_names },
-	{ CT_AGNUMBER,
-	  M(AGBLOCK)|M(AGINO)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
-	  agnumber_names },
-	{ CT_BBOFF, M(AGBLOCK)|M(AGNUMBER)|M(DADDR)|M(FSBLOCK), bboff_names },
-	{ CT_BLKOFF, M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK), blkoff_names },
-	{ CT_BYTE, 0, byte_names },
-	{ CT_DADDR, M(BBOFF), daddr_names },
-	{ CT_FSBLOCK, M(BBOFF)|M(BLKOFF)|M(INOIDX), fsblock_names },
-	{ CT_INO, M(INOOFF), ino_names },
-	{ CT_INOIDX, M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK)|M(INOOFF),
-	  inoidx_names },
-	{ CT_INOOFF,
-	  M(AGBLOCK)|M(AGINO)|M(AGNUMBER)|M(FSBLOCK)|M(INO)|M(INOIDX),
-	  inooff_names },
+	[CT_AGBLOCK] = {
+		.allowed = M(AGNUMBER)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
+		.names   = agblock_names,
+	},
+	[CT_AGINO] = {
+		.allowed = M(AGNUMBER)|M(INOOFF),
+		.names   = agino_names,
+	},
+	[CT_AGNUMBER] = {
+		.allowed = M(AGBLOCK)|M(AGINO)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
+		.names   = agnumber_names,
+	},
+	[CT_BBOFF] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(DADDR)|M(FSBLOCK),
+		.names   = bboff_names,
+	},
+	[CT_BLKOFF] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK),
+		.names   = blkoff_names,
+	},
+	[CT_BYTE] = {
+		.allowed = 0,
+		.names   = byte_names,
+	},
+	[CT_DADDR] = {
+		.allowed = M(BBOFF),
+		.names   = daddr_names,
+	},
+	[CT_FSBLOCK] = {
+		.allowed = M(BBOFF)|M(BLKOFF)|M(INOIDX),
+		.names   = fsblock_names,
+	},
+	[CT_INO] = {
+		.allowed = M(INOOFF),
+		.names   = ino_names,
+	},
+	[CT_INOIDX] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK)|M(INOOFF),
+		.names   = inoidx_names,
+	},
+	[CT_INOOFF] = {
+		.allowed = M(AGBLOCK)|M(AGINO)|M(AGNUMBER)|M(FSBLOCK)|M(INO)|M(INOIDX),
+		.names   = inooff_names,
+	},
+};
+
+static const ctydesc_t	ctydescs_rt[NCTS] = {
+	[CT_BBOFF] = {
+		.allowed = M(DADDR)|M(RTBLOCK),
+		.names   = bboff_names,
+	},
+	[CT_BLKOFF] = {
+		.allowed = M(RTBLOCK),
+		.names   = rtblkoff_names,
+	},
+	[CT_BYTE] = {
+		.allowed = 0,
+		.names   = byte_names,
+	},
+	[CT_DADDR] = {
+		.allowed = M(BBOFF),
+		.names   = daddr_names,
+	},
+	[CT_RTBLOCK] = {
+		.allowed = M(BBOFF)|M(BLKOFF),
+		.names   = rtblock_names,
+	},
+	[CT_RTX] = {
+		.allowed = M(BBOFF)|M(BLKOFF),
+		.names   = rtx_names,
+	},
 };
 
 static const cmdinfo_t	convert_cmd =
 	{ "convert", NULL, convert_f, 3, 9, 0, "type num [type num]... type",
 	  "convert from one address form to another", NULL };
 
+static const cmdinfo_t	rtconvert_cmd =
+	{ "rtconvert", NULL, rtconvert_f, 3, 9, 0, "type num [type num]... type",
+	  "convert from one realtime address form to another", NULL };
+
 static uint64_t
 bytevalue(ctype_t ctype, cval_t *val)
 {
@@ -131,6 +203,10 @@ bytevalue(ctype_t ctype, cval_t *val)
 		return inoidx_to_bytes(val->inoidx);
 	case CT_INOOFF:
 		return (uint64_t)val->inooff;
+	case CT_RTBLOCK:
+		return rtblock_to_bytes(val->rtblock);
+	case CT_RTX:
+		return rtblock_to_bytes(rtx_to_rtblock(val->rtx));
 	case CT_NONE:
 	case NCTS:
 		break;
@@ -159,13 +235,13 @@ convert_f(int argc, char **argv)
 			 "arguments\n"), argc);
 		return 0;
 	}
-	if ((wtype = lookupcty(argv[argc - 1])) == CT_NONE) {
+	if ((wtype = lookupcty(ctydescs, argv[argc - 1])) == CT_NONE) {
 		dbprintf(_("unknown conversion type %s\n"), argv[argc - 1]);
 		return 0;
 	}
 
 	for (i = mask = conmask = 0; i < (argc - 1) / 2; i++) {
-		c = lookupcty(argv[i * 2]);
+		c = lookupcty(ctydescs, argv[i * 2]);
 		if (c == CT_NONE) {
 			dbprintf(_("unknown conversion type %s\n"), argv[i * 2]);
 			return 0;
@@ -230,6 +306,107 @@ convert_f(int argc, char **argv)
 	case CT_INOOFF:
 		v &= mp->m_sb.sb_inodesize - 1;
 		break;
+	case CT_RTBLOCK:
+	case CT_RTX:
+		/* shouldn't get here */
+		ASSERT(0);
+		break;
+	case CT_NONE:
+	case NCTS:
+		/* NOTREACHED */
+		break;
+	}
+	dbprintf("0x%llx (%llu)\n", v, v);
+	return 0;
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	return daddr >> mp->m_blkbb_log;
+}
+
+static int
+rtconvert_f(int argc, char **argv)
+{
+	ctype_t		c;
+	int		conmask;
+	cval_t		cvals[NCTS] = {};
+	int		i;
+	int		mask;
+	uint64_t	v;
+	ctype_t		wtype;
+
+	/* move past the "rtconvert" command */
+	argc--;
+	argv++;
+
+	if ((argc % 2) != 1) {
+		dbprintf(_("bad argument count %d to rtconvert, expected 3,5,7,9 "
+			 "arguments\n"), argc);
+		return 0;
+	}
+	if ((wtype = lookupcty(ctydescs_rt, argv[argc - 1])) == CT_NONE) {
+		dbprintf(_("unknown conversion type %s\n"), argv[argc - 1]);
+		return 0;
+	}
+
+	for (i = mask = conmask = 0; i < (argc - 1) / 2; i++) {
+		c = lookupcty(ctydescs_rt, argv[i * 2]);
+		if (c == CT_NONE) {
+			dbprintf(_("unknown conversion type %s\n"), argv[i * 2]);
+			return 0;
+		}
+		if (c == wtype) {
+			dbprintf(_("result type same as argument\n"));
+			return 0;
+		}
+		if (conmask & (1 << c)) {
+			dbprintf(_("conflicting conversion type %s\n"),
+				argv[i * 2]);
+			return 0;
+		}
+		if (!getvalue(argv[i * 2 + 1], c, &cvals[c]))
+			return 0;
+		mask |= 1 << c;
+		conmask |= ~ctydescs_rt[c].allowed;
+	}
+	v = 0;
+	for (c = (ctype_t)0; c < NCTS; c++) {
+		if (!(mask & (1 << c)))
+			continue;
+		v += bytevalue(c, &cvals[c]);
+	}
+	switch (wtype) {
+	case CT_BBOFF:
+		v &= BBMASK;
+		break;
+	case CT_BLKOFF:
+		v &= mp->m_blockmask;
+		break;
+	case CT_BYTE:
+		break;
+	case CT_DADDR:
+		v >>= BBSHIFT;
+		break;
+	case CT_RTBLOCK:
+		v = xfs_daddr_to_rtb(mp, v >> BBSHIFT);
+		break;
+	case CT_RTX:
+		v = xfs_daddr_to_rtb(mp, v >> BBSHIFT) / mp->m_sb.sb_rextsize;
+		break;
+	case CT_AGBLOCK:
+	case CT_AGINO:
+	case CT_AGNUMBER:
+	case CT_FSBLOCK:
+	case CT_INO:
+	case CT_INOIDX:
+	case CT_INOOFF:
+		/* shouldn't get here */
+		ASSERT(0);
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
@@ -243,6 +420,7 @@ void
 convert_init(void)
 {
 	add_command(&convert_cmd);
+	add_command(&rtconvert_cmd);
 }
 
 static int
@@ -290,6 +468,12 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 	case CT_INOOFF:
 		val->inooff = (int)v;
 		break;
+	case CT_RTBLOCK:
+		val->rtblock = (xfs_rtblock_t)v;
+		break;
+	case CT_RTX:
+		val->rtx = (xfs_rtblock_t)v;
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
@@ -299,13 +483,15 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 }
 
 static ctype_t
-lookupcty(char *ctyname)
+lookupcty(
+	const struct ctydesc	*descs,
+	const char		*ctyname)
 {
 	ctype_t		cty;
 	const char	**name;
 
 	for (cty = (ctype_t)0; cty < NCTS; cty++) {
-		for (name = ctydescs[cty].names; *name; name++) {
+		for (name = descs[cty].names; name && *name; name++) {
 			if (strcmp(ctyname, *name) == 0)
 				return cty;
 		}
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 6fa72acb313..dc0bbfc9ac9 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1052,6 +1052,57 @@ Show position ring (if no
 argument is given), or move to a specific entry in the position ring given by
 .IR index .
 .TP
+.BI "rtconvert " "type number" " [" "type number" "] ... " type
+Convert from one address form to another for realtime section addresses.
+The known
+.IR type s,
+with alternate names, are:
+.RS 1.0i
+.PD 0
+.HP
+.B bboff
+or
+.B daddroff
+(byte offset in a
+.BR daddr )
+.HP
+.B blkoff
+or
+.B fsboff or
+.B rtboff
+(byte offset in a
+.B rtblock
+or
+.BR rtextent )
+.HP
+.B byte
+or
+.B fsbyte
+(byte address in filesystem)
+.HP
+.B daddr
+or
+.B bb
+(disk address, 512-byte blocks)
+.HP
+.B rtblock
+or
+.B rtb
+or
+.B rtbno
+(realtime filesystem block, see the
+.B fsblock
+command)
+.HP
+.B rtx
+or
+.B rtextent
+(realtime extent)
+.PD
+.RE
+.IP
+Only conversions that "make sense" are allowed.
+.TP
 .BI "sb [" agno ]
 Set current address to SB header in allocation group
 .IR agno .


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 7/8] xfs_db: convert rtbitmap geometry
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 23:46   ` [PATCH 6/8] xfs_db: enable conversion of rt space units Darrick J. Wong
@ 2023-12-31 23:46   ` Darrick J. Wong
  2023-12-31 23:46   ` [PATCH 8/8] xfs_db: convert rtsummary geometry Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:46 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the rtconvert command to be able to convert realtime blocks and
extents to locations within the rt bitmap.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c      |   40 ++++++++++++++++++++++++++++++++++++++++
 man/man8/xfs_db.8 |   10 ++++++++++
 2 files changed, 50 insertions(+)


diff --git a/db/convert.c b/db/convert.c
index 811bac00f71..35e71f8a442 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -30,6 +30,10 @@
 	((uint64_t)(x) << mp->m_sb.sb_blocklog)
 #define rtx_to_rtblock(x)	\
 	((uint64_t)(x) * mp->m_sb.sb_rextsize)
+#define rbmblock_to_bytes(x)	\
+	rtblock_to_bytes(rtx_to_rtblock(xfs_rbmblock_to_rtx(mp, (uint64_t)x)))
+#define rbmword_to_bytes(x)	\
+	rtblock_to_bytes(rtx_to_rtblock((uint64_t)(x) << XFS_NBWORDLOG))
 
 typedef enum {
 	CT_NONE = -1,
@@ -46,6 +50,8 @@ typedef enum {
 	CT_INOOFF,		/* byte offset in inode */
 	CT_RTBLOCK,		/* realtime block */
 	CT_RTX,			/* realtime extent */
+	CT_RBMBLOCK,		/* block within rt bitmap */
+	CT_RBMWORD,		/* word within rt bitmap */
 	NCTS
 } ctype_t;
 
@@ -68,6 +74,8 @@ typedef union {
 	int		inooff;
 	xfs_rtblock_t	rtblock;
 	xfs_rtblock_t	rtx;
+	xfs_fileoff_t	rbmblock;
+	unsigned int	rbmword;
 } cval_t;
 
 static uint64_t		bytevalue(ctype_t ctype, cval_t *val);
@@ -94,6 +102,8 @@ static const char	*inooff_names[] = { "inooff", "inodeoff", NULL };
 
 static const char	*rtblock_names[] = { "rtblock", "rtb", "rtbno", NULL };
 static const char	*rtx_names[] = { "rtx", "rtextent", NULL };
+static const char	*rbmblock_names[] = { "rbmblock", "rbmb", NULL };
+static const char	*rbmword_names[] = { "rbmword", "rbmw", NULL };
 
 static const ctydesc_t	ctydescs[NCTS] = {
 	[CT_AGBLOCK] = {
@@ -167,6 +177,14 @@ static const ctydesc_t	ctydescs_rt[NCTS] = {
 		.allowed = M(BBOFF)|M(BLKOFF),
 		.names   = rtx_names,
 	},
+	[CT_RBMBLOCK] = {
+		.allowed = M(RBMWORD),
+		.names   = rbmblock_names,
+	},
+	[CT_RBMWORD] = {
+		.allowed = M(RBMBLOCK),
+		.names   = rbmword_names,
+	},
 };
 
 static const cmdinfo_t	convert_cmd =
@@ -207,6 +225,10 @@ bytevalue(ctype_t ctype, cval_t *val)
 		return rtblock_to_bytes(val->rtblock);
 	case CT_RTX:
 		return rtblock_to_bytes(rtx_to_rtblock(val->rtx));
+	case CT_RBMBLOCK:
+		return rbmblock_to_bytes(val->rbmblock);
+	case CT_RBMWORD:
+		return rbmword_to_bytes(val->rbmword);
 	case CT_NONE:
 	case NCTS:
 		break;
@@ -308,6 +330,8 @@ convert_f(int argc, char **argv)
 		break;
 	case CT_RTBLOCK:
 	case CT_RTX:
+	case CT_RBMBLOCK:
+	case CT_RBMWORD:
 		/* shouldn't get here */
 		ASSERT(0);
 		break;
@@ -397,6 +421,16 @@ rtconvert_f(int argc, char **argv)
 	case CT_RTX:
 		v = xfs_daddr_to_rtb(mp, v >> BBSHIFT) / mp->m_sb.sb_rextsize;
 		break;
+	case CT_RBMBLOCK:
+		v = xfs_rtx_to_rbmblock(mp,
+				xfs_rtb_to_rtx(mp,
+					xfs_daddr_to_rtb(mp, v >> BBSHIFT)));
+		break;
+	case CT_RBMWORD:
+		v = xfs_rtx_to_rbmword(mp,
+				xfs_rtb_to_rtx(mp,
+					xfs_daddr_to_rtb(mp, v >> BBSHIFT)));
+		break;
 	case CT_AGBLOCK:
 	case CT_AGINO:
 	case CT_AGNUMBER:
@@ -474,6 +508,12 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 	case CT_RTX:
 		val->rtx = (xfs_rtblock_t)v;
 		break;
+	case CT_RBMBLOCK:
+		val->rbmblock = (xfs_fileoff_t)v;
+		break;
+	case CT_RBMWORD:
+		val->rbmword = (unsigned int)v;
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index dc0bbfc9ac9..a1ea21162b5 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1098,6 +1098,16 @@ command)
 or
 .B rtextent
 (realtime extent)
+.HP
+.B rbmblock
+or
+.B rbmb
+(realtime bitmap block)
+.HP
+.B rbmword
+or
+.B rbmw
+(32-bit word within a realtime bitmap block)
 .PD
 .RE
 .IP


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 8/8] xfs_db: convert rtsummary geometry
  2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 23:46   ` [PATCH 7/8] xfs_db: convert rtbitmap geometry Darrick J. Wong
@ 2023-12-31 23:46   ` Darrick J. Wong
  7 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:46 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the rtconvert command to be able to convert realtime blocks and
extents to locations within the rt summary.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c      |  153 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 man/man8/xfs_db.8 |   29 ++++++++++
 2 files changed, 174 insertions(+), 8 deletions(-)


diff --git a/db/convert.c b/db/convert.c
index 35e71f8a442..048c1766f81 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -52,6 +52,9 @@ typedef enum {
 	CT_RTX,			/* realtime extent */
 	CT_RBMBLOCK,		/* block within rt bitmap */
 	CT_RBMWORD,		/* word within rt bitmap */
+	CT_RSUMBLOCK,		/* block within rt summary */
+	CT_RSUMLOG,		/* log level for rtsummary computations */
+	CT_RSUMINFO,		/* info word within rt summary */
 	NCTS
 } ctype_t;
 
@@ -76,6 +79,7 @@ typedef union {
 	xfs_rtblock_t	rtx;
 	xfs_fileoff_t	rbmblock;
 	unsigned int	rbmword;
+	xfs_fileoff_t	rsumblock;
 } cval_t;
 
 static uint64_t		bytevalue(ctype_t ctype, cval_t *val);
@@ -104,6 +108,12 @@ static const char	*rtblock_names[] = { "rtblock", "rtb", "rtbno", NULL };
 static const char	*rtx_names[] = { "rtx", "rtextent", NULL };
 static const char	*rbmblock_names[] = { "rbmblock", "rbmb", NULL };
 static const char	*rbmword_names[] = { "rbmword", "rbmw", NULL };
+static const char	*rsumblock_names[] = { "rsumblock", "rsmb", NULL };
+static const char	*rsumlog_names[] = { "rsumlog", "rsml", NULL };
+static const char	*rsumword_names[] = { "rsuminfo", "rsmi", NULL };
+
+static int		rsuminfo;
+static int		rsumlog;
 
 static const ctydesc_t	ctydescs[NCTS] = {
 	[CT_AGBLOCK] = {
@@ -154,37 +164,50 @@ static const ctydesc_t	ctydescs[NCTS] = {
 
 static const ctydesc_t	ctydescs_rt[NCTS] = {
 	[CT_BBOFF] = {
-		.allowed = M(DADDR)|M(RTBLOCK),
+		.allowed = M(DADDR)|M(RTBLOCK)|M(RSUMLOG),
 		.names   = bboff_names,
 	},
 	[CT_BLKOFF] = {
-		.allowed = M(RTBLOCK),
+		.allowed = M(RTBLOCK)|M(RSUMLOG),
 		.names   = rtblkoff_names,
 	},
 	[CT_BYTE] = {
-		.allowed = 0,
+		.allowed = 0|M(RSUMLOG),
 		.names   = byte_names,
 	},
 	[CT_DADDR] = {
-		.allowed = M(BBOFF),
+		.allowed = M(BBOFF)|M(RSUMLOG),
 		.names   = daddr_names,
 	},
 	[CT_RTBLOCK] = {
-		.allowed = M(BBOFF)|M(BLKOFF),
+		.allowed = M(BBOFF)|M(BLKOFF)|M(RSUMLOG),
 		.names   = rtblock_names,
 	},
 	[CT_RTX] = {
-		.allowed = M(BBOFF)|M(BLKOFF),
+		.allowed = M(BBOFF)|M(BLKOFF)|M(RSUMLOG),
 		.names   = rtx_names,
 	},
 	[CT_RBMBLOCK] = {
-		.allowed = M(RBMWORD),
+		.allowed = M(RBMWORD)|M(RSUMLOG),
 		.names   = rbmblock_names,
 	},
 	[CT_RBMWORD] = {
-		.allowed = M(RBMBLOCK),
+		.allowed = M(RBMBLOCK)|M(RSUMLOG),
 		.names   = rbmword_names,
 	},
+	/* must be specified in order rsumlog -> rsuminfo -> rsumblock */
+	[CT_RSUMBLOCK] = {
+		.allowed = 0,
+		.names   = rsumblock_names,
+	},
+	[CT_RSUMLOG] = {
+		.allowed = M(RSUMINFO)|M(RSUMBLOCK),
+		.names   = rsumlog_names,
+	},
+	[CT_RSUMINFO] = {
+		.allowed = M(RSUMBLOCK),
+		.names   = rsumword_names,
+	},
 };
 
 static const cmdinfo_t	convert_cmd =
@@ -195,6 +218,39 @@ static const cmdinfo_t	rtconvert_cmd =
 	{ "rtconvert", NULL, rtconvert_f, 3, 9, 0, "type num [type num]... type",
 	  "convert from one realtime address form to another", NULL };
 
+static inline uint64_t
+rsumblock_to_bytes(
+	xfs_fileoff_t	rsumblock)
+{
+	/*
+	 * We compute the rt summary file block with this formula:
+	 *   sumoffs = (log2len * sb_rbmblocks) + rbmblock;
+	 *   sumblock = sumoffs / blockwsize;
+	 *
+	 * Hence the return value is the inverse of this:
+	 *   sumoffs = (rsumblock * blockwsize) + rsuminfo;
+	 *   rbmblock = sumoffs % (log2len * sb_rbmblocks);
+	 */
+	xfs_rtsumoff_t	sumoff;
+	xfs_fileoff_t	rbmblock;
+
+	if (rsumlog < 0) {
+		dbprintf(_("need to set rsumlog\n"));
+		return 0;
+	}
+	if (rsuminfo < 0) {
+		dbprintf(_("need to set rsuminfo\n"));
+		return 0;
+	}
+
+	sumoff = rsuminfo + (rsumblock * mp->m_blockwsize);
+	if (rsumlog)
+		rbmblock = sumoff % (rsumlog * mp->m_sb.sb_rbmblocks);
+	else
+		rbmblock = sumoff;
+	return rbmblock_to_bytes(rbmblock);
+}
+
 static uint64_t
 bytevalue(ctype_t ctype, cval_t *val)
 {
@@ -229,6 +285,16 @@ bytevalue(ctype_t ctype, cval_t *val)
 		return rbmblock_to_bytes(val->rbmblock);
 	case CT_RBMWORD:
 		return rbmword_to_bytes(val->rbmword);
+	case CT_RSUMBLOCK:
+		return rsumblock_to_bytes(val->rbmblock);
+	case CT_RSUMLOG:
+	case CT_RSUMINFO:
+		/*
+		 * These have to specified before rsumblock, and are stored in
+		 * global variables.  Hence they do not adjust the disk address
+		 * value.
+		 */
+		return 0;
 	case CT_NONE:
 	case NCTS:
 		break;
@@ -332,6 +398,9 @@ convert_f(int argc, char **argv)
 	case CT_RTX:
 	case CT_RBMBLOCK:
 	case CT_RBMWORD:
+	case CT_RSUMBLOCK:
+	case CT_RSUMLOG:
+	case CT_RSUMINFO:
 		/* shouldn't get here */
 		ASSERT(0);
 		break;
@@ -352,6 +421,52 @@ xfs_daddr_to_rtb(
 	return daddr >> mp->m_blkbb_log;
 }
 
+static inline uint64_t
+rt_daddr_to_rsumblock(
+	struct xfs_mount	*mp,
+	uint64_t		input)
+{
+	xfs_rtblock_t		rtbno;
+	xfs_rtxnum_t		rtx;
+	xfs_fileoff_t		rbmblock;
+	xfs_rtsumoff_t		rsumoff;
+
+	if (rsumlog < 0) {
+		dbprintf(_("need to set rsumlog\n"));
+		return 0;
+	}
+
+	rtbno = xfs_daddr_to_rtb(mp, input >> BBSHIFT);
+	rtx = xfs_rtb_to_rtx(mp, rtbno);
+	rbmblock = xfs_rtx_to_rbmblock(mp, rtx);
+	rsumoff = xfs_rtsumoffs(mp, rsumlog, rbmblock);
+
+	return xfs_rtsumoffs_to_block(mp, rsumoff);
+}
+
+static inline uint64_t
+rt_daddr_to_rsuminfo(
+	struct xfs_mount	*mp,
+	uint64_t		input)
+{
+	xfs_rtblock_t		rtbno;
+	xfs_rtxnum_t		rtx;
+	xfs_fileoff_t		rbmblock;
+	xfs_rtsumoff_t		rsumoff;
+
+	if (rsumlog < 0) {
+		dbprintf(_("need to set rsumlog\n"));
+		return 0;
+	}
+
+	rtbno = xfs_daddr_to_rtb(mp, input >> BBSHIFT);
+	rtx = xfs_rtb_to_rtx(mp, rtbno);
+	rbmblock = xfs_rtx_to_rbmblock(mp, rtx);
+	rsumoff = xfs_rtsumoffs(mp, rsumlog, rbmblock);
+
+	return xfs_rtsumoffs_to_infoword(mp, rsumoff);
+}
+
 static int
 rtconvert_f(int argc, char **argv)
 {
@@ -363,6 +478,9 @@ rtconvert_f(int argc, char **argv)
 	uint64_t	v;
 	ctype_t		wtype;
 
+	rsumlog = -1;
+	rsuminfo = -1;
+
 	/* move past the "rtconvert" command */
 	argc--;
 	argv++;
@@ -431,6 +549,16 @@ rtconvert_f(int argc, char **argv)
 				xfs_rtb_to_rtx(mp,
 					xfs_daddr_to_rtb(mp, v >> BBSHIFT)));
 		break;
+	case CT_RSUMBLOCK:
+		v = rt_daddr_to_rsumblock(mp, v);
+		break;
+	case CT_RSUMLOG:
+		dbprintf(_("cannot convert to rsumlog\n"));
+		return 0;
+		break;
+	case CT_RSUMINFO:
+		v = rt_daddr_to_rsuminfo(mp, v);
+		break;
 	case CT_AGBLOCK:
 	case CT_AGINO:
 	case CT_AGNUMBER:
@@ -514,6 +642,15 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 	case CT_RBMWORD:
 		val->rbmword = (unsigned int)v;
 		break;
+	case CT_RSUMBLOCK:
+		val->rsumblock = (xfs_fileoff_t)v;
+		break;
+	case CT_RSUMLOG:
+		rsumlog = (unsigned int)v;
+		break;
+	case CT_RSUMINFO:
+		rsuminfo = (unsigned int)v;
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index a1ea21162b5..91e9e0ed9a5 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1108,10 +1108,39 @@ or
 or
 .B rbmw
 (32-bit word within a realtime bitmap block)
+.HP
+.B rsumblock
+or
+.B rsmb
+(realtime summary file block)
+.HP
+.B rsuminfo
+or
+.B rsmi
+(32-bit counter within a realtime summary block)
+.HP
+.B rsumlog
+or
+.B rsml
+(log2len parameter used for summary file offset computations)
 .PD
 .RE
 .IP
 Only conversions that "make sense" are allowed.
+
+Realtime summary file location conversions have the following rules:
+Each info word in the rt summary file counts the number of free extents of a
+given log2(length) that start in a given rt bitmap block.
+
+To compute summary file location information for a given rt bitmap block, a
+log2(extent length) must be specified as the last type/number pair before the
+conversion type, and the type must be
+.BR rsumlog .
+
+To compute the rt bitmap block from summary file location, the type/number pairs
+must be specified exactly in the order
+.BR rsumlog ", " rsuminfo ", " rsumblock .
+
 .TP
 .BI "sb [" agno ]
 Set current address to SB header in allocation group


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/1] xfs_db: allow setting current address to log blocks
  2023-12-31 19:52 ` [PATCHSET v2.0 04/17] xfs_metadump: support external devices Darrick J. Wong
@ 2023-12-31 23:47   ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:47 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add commands so that users can target blocks on an external log device.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/block.c        |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 man/man8/xfs_db.8 |   17 +++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)


diff --git a/db/block.c b/db/block.c
index fff7f7d6f52..f234fcb4edc 100644
--- a/db/block.c
+++ b/db/block.c
@@ -29,6 +29,8 @@ static int	rtblock_f(int argc, char **argv);
 static void	rtblock_help(void);
 static int	rtextent_f(int argc, char **argv);
 static void	rtextent_help(void);
+static int	logblock_f(int argc, char **argv);
+static void	logblock_help(void);
 static void	print_rawdata(void *data, int len);
 
 static const cmdinfo_t	ablock_cmd =
@@ -49,6 +51,9 @@ static const cmdinfo_t	rtblock_cmd =
 static const cmdinfo_t	rtextent_cmd =
 	{ "rtextent", "rtx", rtextent_f, 0, 1, 1, N_("[rtxno]"),
 	  N_("set address to rtextent value"), rtextent_help };
+static const cmdinfo_t	logblock_cmd =
+	{ "logblock", "lsb", logblock_f, 0, 1, 1, N_("[logbno]"),
+	  N_("set address to logblock value"), logblock_help };
 
 static void
 ablock_help(void)
@@ -116,6 +121,7 @@ block_init(void)
 	add_command(&fsblock_cmd);
 	add_command(&rtblock_cmd);
 	add_command(&rtextent_cmd);
+	add_command(&logblock_cmd);
 }
 
 static void
@@ -132,6 +138,7 @@ daddr_help(void)
 enum daddr_target {
 	DT_DATA,
 	DT_RT,
+	DT_LOG,
 };
 
 static int
@@ -145,18 +152,27 @@ daddr_f(
 	xfs_rfsblock_t	max_daddrs = mp->m_sb.sb_dblocks;
 	enum daddr_target tgt = DT_DATA;
 
-	while ((c = getopt(argc, argv, "r")) != -1) {
+	while ((c = getopt(argc, argv, "rl")) != -1) {
 		switch (c) {
 		case 'r':
 			tgt = DT_RT;
 			max_daddrs = mp->m_sb.sb_rblocks;
 			break;
+		case 'l':
+			tgt = DT_LOG;
+			max_daddrs = mp->m_sb.sb_logblocks;
+			break;
 		default:
 			daddr_help();
 			return 0;
 		}
 	}
 
+	if (tgt == DT_LOG && mp->m_sb.sb_logstart > 0) {
+		dbprintf(_("filesystem has internal log\n"));
+		return 0;
+	}
+
 	if (optind == argc) {
 		xfs_daddr_t	daddr = iocur_top->off >> BBSHIFT;
 
@@ -191,6 +207,9 @@ daddr_f(
 	case DT_RT:
 		set_rt_cur(&typtab[TYP_DATA], d, 1, DB_RING_ADD, NULL);
 		break;
+	case DT_LOG:
+		set_log_cur(&typtab[TYP_DATA], d, 1, DB_RING_ADD, NULL);
+		break;
 	}
 	return 0;
 }
@@ -406,6 +425,88 @@ rtextent_f(
 	return 0;
 }
 
+static void
+logblock_help(void)
+{
+	dbprintf(_(
+"\n Example:\n"
+"\n"
+" 'logblock 1023' - sets the file position to the 1023rd log block.\n"
+" The external log device or the block offset within the internal log will be\n"
+" chosen as appropriate.\n"
+));
+}
+
+static int
+logblock_f(
+	int		argc,
+	char		**argv)
+{
+	xfs_fsblock_t	logblock;
+	char		*p;
+
+	if (argc == 1) {
+		if (mp->m_sb.sb_logstart > 0 && iocur_is_ddev(iocur_top)) {
+			logblock = XFS_DADDR_TO_FSB(mp,
+						iocur_top->off >> BBSHIFT);
+
+			if (logblock < mp->m_sb.sb_logstart ||
+			    logblock >= mp->m_sb.sb_logstart +
+					mp->m_sb.sb_logblocks) {
+				dbprintf(
+ _("current address not within internal log\n"));
+				return 0;
+			}
+
+			dbprintf(_("current logblock is %lld\n"),
+					logblock - mp->m_sb.sb_logstart);
+			return 0;
+		}
+
+		if (mp->m_sb.sb_logstart == 0 &&
+		    iocur_is_extlogdev(iocur_top)) {
+			logblock = XFS_BB_TO_FSB(mp,
+						iocur_top->off >> BBSHIFT);
+
+			if (logblock >= mp->m_sb.sb_logblocks) {
+				dbprintf(
+ _("current address not within external log\n"));
+				return 0;
+			}
+
+			dbprintf(_("current logblock is %lld\n"), logblock);
+			return 0;
+		}
+
+		dbprintf(_("current address does not point to log\n"));
+		return 0;
+	}
+
+	logblock = strtoull(argv[1], &p, 0);
+	if (*p != '\0') {
+		dbprintf(_("bad logblock %s\n"), argv[1]);
+		return 0;
+	}
+
+	if (logblock >= mp->m_sb.sb_logblocks) {
+		dbprintf(_("bad logblock %s\n"), argv[1]);
+		return 0;
+	}
+
+	ASSERT(typtab[TYP_DATA].typnm == TYP_DATA);
+
+	if (mp->m_sb.sb_logstart) {
+		logblock += mp->m_sb.sb_logstart;
+		set_cur(&typtab[TYP_DATA], XFS_FSB_TO_DADDR(mp, logblock),
+				blkbb, DB_RING_ADD, NULL);
+	} else {
+		set_log_cur(&typtab[TYP_DATA], XFS_FSB_TO_BB(mp, logblock),
+				blkbb, DB_RING_ADD, NULL);
+	}
+
+	return 0;
+}
+
 void
 print_block(
 	const field_t	*fields,
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 91e9e0ed9a5..15a30c11ae5 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -600,6 +600,9 @@ The type is set to
 .B data
 (uninterpreted).
 
+If an address and the
+.B \-l
+option are specified, the current address is set to the external log device.
 If an address and the
 .B \-r
 option are specified, the current address is set to the realtime device.
@@ -925,6 +928,20 @@ Start logging output to
 .IR filename ,
 stop logging, or print the current logging status.
 .TP
+.BI "logblock [" logbno ]
+Set current address to the log block value given by
+.IR logbno .
+If no value for
+.I logbno
+is given the current address is printed, expressed as an fsb.
+The type is set to
+.B data
+(uninterpreted).
+If the filesystem has an external log, then the address will be within the log
+device.
+If the filesystem has an internal log, then the address will be within the
+internal log.
+.TP
 .BI "logformat [\-c " cycle "] [\-s " sunit "]"
 Reformats the log to the specified log cycle and log stripe unit.
 This has the effect of clearing the log destructively.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 1/2] xfs: refactor realtime inode locking
  2023-12-31 19:52 ` [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking Darrick J. Wong
@ 2023-12-31 23:47   ` Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 2/2] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:47 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create helper functions to deal with locking realtime metadata inodes.
This enables us to maintain correct locking order once we start adding
the realtime rmap and refcount btree inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_priv.h  |    3 +++
 libxfs/xfs_bmap.c     |    7 ++----
 libxfs/xfs_rtbitmap.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtbitmap.h |   17 +++++++++++++++
 4 files changed, 79 insertions(+), 5 deletions(-)


diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index f4614ee9631..178330aafa1 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -174,9 +174,12 @@ enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC };
 	(unlikely(expr) ? XFS_WARN_CORRUPT((mp), (expr)) : false)
 
 #define XFS_ERRLEVEL_LOW		1
+#define XFS_ILOCK_SHARED		0
 #define XFS_ILOCK_EXCL			0
 #define XFS_IOLOCK_SHARED		0
 #define XFS_IOLOCK_EXCL			0
+#define XFS_ILOCK_RTSUM			0
+#define XFS_ILOCK_RTBITMAP		0
 #define XFS_STATS_INC(mp, count)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_DEC(mp, count, x)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_ADD(mp, count, x)	do { (mp) = (mp); } while (0)
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 0dfb37073a7..c69cb5c66df 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -5370,12 +5370,9 @@ __xfs_bunmapi(
 
 	if (isrt) {
 		/*
-		 * Synchronize by locking the bitmap inode.
+		 * Synchronize by locking the realtime bitmap.
 		 */
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
-		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
-		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_rtbitmap_lock(tp, mp);
 	}
 
 	extno = 0;
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index b4da1b07c73..72d9d0f0ec9 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -1192,3 +1192,60 @@ xfs_rtsummary_wordcount(
 	blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
 	return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
 }
+
+/*
+ * Lock both realtime free space metadata inodes for a freespace update.  If a
+ * transaction is given, the inodes will be joined to the transaction and the
+ * ILOCKs will be released on transaction commit.
+ */
+void
+xfs_rtbitmap_lock(
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp)
+{
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	if (tp)
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	if (tp)
+		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+}
+
+/* Unlock both realtime free space metadata inodes after a freespace update. */
+void
+xfs_rtbitmap_unlock(
+	struct xfs_mount	*mp)
+{
+	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+}
+
+/*
+ * Lock the realtime free space metadata inodes for a freespace scan.  Callers
+ * must walk metadata blocks in order of increasing file offset.
+ */
+void
+xfs_rtbitmap_lock_shared(
+	struct xfs_mount	*mp,
+	unsigned int		rbmlock_flags)
+{
+	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+
+	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+}
+
+/* Unlock the realtime free space metadata inodes after a freespace scan. */
+void
+xfs_rtbitmap_unlock_shared(
+	struct xfs_mount	*mp,
+	unsigned int		rbmlock_flags)
+{
+	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+
+	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index 1c84b52de3d..6ac17f0195e 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -374,6 +374,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
 		unsigned int rsumlevels, xfs_extlen_t rbmblocks);
 unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
 		unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+
+void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp);
+void xfs_rtbitmap_unlock(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RBMLOCK_BITMAP	(1U << 0)
+/* Lock the rt summary inode in shared mode */
+#define XFS_RBMLOCK_SUMMARY	(1U << 1)
+
+void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
+		unsigned int rbmlock_flags);
+void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
+		unsigned int rbmlock_flags);
 #else /* CONFIG_XFS_RT */
 # define xfs_rtfree_extent(t,b,l)			(-ENOSYS)
 # define xfs_rtfree_blocks(t,rb,rl)			(-ENOSYS)
@@ -394,6 +407,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
 # define xfs_rtbitmap_wordcount(mp, r)			(0)
 # define xfs_rtsummary_blockcount(mp, l, b)		(0)
 # define xfs_rtsummary_wordcount(mp, l, b)		(0)
+# define xfs_rtbitmap_lock(tp, mp)		do { } while (0)
+# define xfs_rtbitmap_unlock(mp)		do { } while (0)
+# define xfs_rtbitmap_lock_shared(mp, lf)	do { } while (0)
+# define xfs_rtbitmap_unlock_shared(mp, lf)	do { } while (0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __XFS_RTBITMAP_H__ */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 2/2] xfs: remove XFS_ILOCK_RT*
  2023-12-31 19:52 ` [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking Darrick J. Wong
  2023-12-31 23:47   ` [PATCH 1/2] xfs: refactor realtime " Darrick J. Wong
@ 2023-12-31 23:47   ` Darrick J. Wong
  1 sibling, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:47 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that we've centralized the realtime metadata locking routines, get
rid of the ILOCK subclasses since we now use explicit lockdep classes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_priv.h  |    2 --
 libxfs/xfs_rtbitmap.c |   16 ++++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)


diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 178330aafa1..57b8edc54a5 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -178,8 +178,6 @@ enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC };
 #define XFS_ILOCK_EXCL			0
 #define XFS_IOLOCK_SHARED		0
 #define XFS_IOLOCK_EXCL			0
-#define XFS_ILOCK_RTSUM			0
-#define XFS_ILOCK_RTBITMAP		0
 #define XFS_STATS_INC(mp, count)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_DEC(mp, count, x)	do { (mp) = (mp); } while (0)
 #define XFS_STATS_ADD(mp, count, x)	do { (mp) = (mp); } while (0)
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index 72d9d0f0ec9..69c70c89c96 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -1203,11 +1203,11 @@ xfs_rtbitmap_lock(
 	struct xfs_trans	*tp,
 	struct xfs_mount	*mp)
 {
-	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
 	if (tp)
 		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 
-	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
 	if (tp)
 		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 }
@@ -1217,8 +1217,8 @@ void
 xfs_rtbitmap_unlock(
 	struct xfs_mount	*mp)
 {
-	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
-	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+	xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL);
+	xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
 }
 
 /*
@@ -1231,10 +1231,10 @@ xfs_rtbitmap_lock_shared(
 	unsigned int		rbmlock_flags)
 {
 	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
 
 	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
-		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED);
 }
 
 /* Unlock the realtime free space metadata inodes after a freespace scan. */
@@ -1244,8 +1244,8 @@ xfs_rtbitmap_unlock_shared(
 	unsigned int		rbmlock_flags)
 {
 	if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
-		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+		xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED);
 
 	if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
-		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 01/52] xfs: create incore realtime group structures
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (4 preceding siblings ...)
  2023-12-27 13:01   ` [PATCH 52/52] mkfs: format realtime groups Darrick J. Wong
@ 2023-12-31 23:47   ` Darrick J. Wong
  2023-12-31 23:48   ` [PATCH 02/52] xfs: define the format of rt groups Darrick J. Wong
                     ` (45 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:47 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an incore object that will contain information about a realtime
allocation group.  This will eventually enable us to shard the realtime
section in a similar manner to how we shard the data section.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h      |    8 +
 include/xfs_trace.h      |    7 +
 libxfs/Makefile          |    2 
 libxfs/init.c            |   21 ++++
 libxfs/libxfs_api_defs.h |    2 
 libxfs/xfs_format.h      |    8 +
 libxfs/xfs_rtgroup.c     |  249 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h     |  132 ++++++++++++++++++++++++
 libxfs/xfs_sb.c          |    5 +
 libxfs/xfs_types.h       |    4 +
 10 files changed, 438 insertions(+)
 create mode 100644 libxfs/xfs_rtgroup.c
 create mode 100644 libxfs/xfs_rtgroup.h


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 8952869d89a..ec0956c539f 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -79,6 +79,7 @@ typedef struct xfs_mount {
 	uint8_t			m_sectbb_log;	/* sectorlog - BBSHIFT */
 	uint8_t			m_agno_log;	/* log #ag's */
 	int8_t			m_rtxblklog;	/* log2 of rextsize, if possible */
+	int8_t			m_rgblklog;	/* log2 of rt group sz if possible */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
@@ -99,9 +100,11 @@ typedef struct xfs_mount {
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
 	struct radix_tree_root	m_perag_tree;
+	struct radix_tree_root	m_rtgroup_tree;
 	uint64_t		m_features;	/* active filesystem features */
 	uint64_t		m_low_space[XFS_LOWSP_MAX];
 	uint64_t		m_rtxblkmask;	/* rt extent block mask */
+	uint64_t		m_rgblkmask;	/* rt group block mask */
 	unsigned long		m_opstate;	/* dynamic state flags */
 	bool			m_finobt_nores; /* no per-AG finobt resv. */
 	uint			m_qflags;	/* quota status flags */
@@ -138,6 +141,7 @@ typedef struct xfs_mount {
 	 */
 	atomic64_t		m_allocbt_blks;
 	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	spinlock_t		m_rtgroup_lock;	/* lock for m_rtgroup_tree */
 
 } xfs_mount_t;
 
@@ -177,6 +181,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
 #define XFS_FEAT_METADIR	(1ULL << 27)	/* metadata directory tree */
+#define XFS_FEAT_RTGROUPS	(1ULL << 28)	/* realtime groups */
 
 #define __XFS_HAS_FEAT(name, NAME) \
 static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
@@ -222,6 +227,7 @@ __XFS_HAS_FEAT(bigtime, BIGTIME)
 __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(metadir, METADIR)
+__XFS_HAS_FEAT(rtgroups, RTGROUPS)
 
 /* Kernel mount features that we don't support */
 #define __XFS_UNSUPP_FEAT(name) \
@@ -242,6 +248,7 @@ __XFS_UNSUPP_FEAT(grpid)
 #define XFS_OPSTATE_DEBUGGER		1	/* is this the debugger? */
 #define XFS_OPSTATE_REPORT_CORRUPTION	2	/* report buffer corruption? */
 #define XFS_OPSTATE_PERAG_DATA_LOADED	3	/* per-AG data initialized? */
+#define XFS_OPSTATE_RTGROUP_DATA_LOADED	4	/* rtgroup data initialized? */
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -267,6 +274,7 @@ __XFS_IS_OPSTATE(inode32, INODE32)
 __XFS_IS_OPSTATE(debugger, DEBUGGER)
 __XFS_IS_OPSTATE(reporting_corruption, REPORT_CORRUPTION)
 __XFS_IS_OPSTATE(perag_data_loaded, PERAG_DATA_LOADED)
+__XFS_IS_OPSTATE(rtgroup_data_loaded, RTGROUP_DATA_LOADED)
 
 #define __XFS_UNSUPP_OPSTATE(name) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index 2cca9394b70..b3240213364 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -345,6 +345,13 @@
 #define trace_xfs_rmap_map_error(...)		((void) 0)
 #define trace_xfs_rmap_delete_error(...)	((void) 0)
 
+/* set c = c to avoid unused var warnings */
+#define trace_xfs_rtgroup_get(a,b)		((a) = (a))
+#define trace_xfs_rtgroup_hold(a,b)		((a) = (a))
+#define trace_xfs_rtgroup_put(a,b)		((a) = (a))
+#define trace_xfs_rtgroup_grab(a,b)		((a) = (a))
+#define trace_xfs_rtgroup_rele(a,b)		((a) = (a))
+
 #define trace_xfs_swapext_defer(...)		((void) 0)
 #define trace_xfs_swapext_delta_nextents(...)	((void) 0)
 #define trace_xfs_swapext_delta_nextents_step(...)	((void) 0)
diff --git a/libxfs/Makefile b/libxfs/Makefile
index 258ce473200..a37a5263199 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -62,6 +62,7 @@ HFILES = \
 	xfs_rmap.h \
 	xfs_rmap_btree.h \
 	xfs_rtbitmap.h \
+	xfs_rtgroup.h \
 	xfs_sb.h \
 	xfs_shared.h \
 	xfs_swapext.h \
@@ -119,6 +120,7 @@ CFILES = cache.c \
 	xfs_rmap.c \
 	xfs_rmap_btree.c \
 	xfs_rtbitmap.c \
+	xfs_rtgroup.c \
 	xfs_sb.c \
 	xfs_swapext.c \
 	xfs_symlink_remote.c \
diff --git a/libxfs/init.c b/libxfs/init.c
index 67f9e1fe99b..f1ba28a3ca3 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -30,6 +30,7 @@
 #include "xfs_ondisk.h"
 
 #include "libxfs.h"		/* for now */
+#include "xfs_rtgroup.h"
 
 #ifndef HAVE_LIBURCU_ATOMIC64
 pthread_mutex_t	atomic64_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -738,7 +739,9 @@ libxfs_mount(
 {
 	struct xfs_buf		*bp;
 	struct xfs_sb		*sbp;
+	struct xfs_rtgroup	*rtg;
 	xfs_daddr_t		d;
+	xfs_rgnumber_t		rgno;
 	int			error;
 
 	mp->m_features = xfs_sb_version_to_features(sb);
@@ -752,9 +755,11 @@ libxfs_mount(
 	xfs_set_inode32(mp);
 	mp->m_sb = *sb;
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_KERNEL);
+	INIT_RADIX_TREE(&mp->m_rtgroup_tree, GFP_KERNEL);
 	sbp = &mp->m_sb;
 	spin_lock_init(&mp->m_sb_lock);
 	spin_lock_init(&mp->m_agirotor_lock);
+	spin_lock_init(&mp->m_rtgroup_lock);
 
 	xfs_sb_mount_common(mp, sb);
 
@@ -884,6 +889,20 @@ libxfs_mount(
 
 	libxfs_mountfs_imeta(mp);
 
+	error = libxfs_initialize_rtgroups(mp, sbp->sb_rgcount);
+	if (error) {
+		fprintf(stderr, _("%s: rtgroup init failed\n"),
+			progname);
+		exit(1);
+	}
+
+	for_each_rtgroup(mp, rgno, rtg) {
+		rtg->rtg_blockcount = xfs_rtgroup_block_count(mp,
+							      rtg->rtg_rgno);
+	}
+
+	xfs_set_rtgroup_data_loaded(mp);
+
 	return mp;
 out_da:
 	xfs_da_unmount(mp);
@@ -1017,6 +1036,8 @@ libxfs_umount(
 	 * Only try to free the per-AG structures if we set them up in the
 	 * first place.
 	 */
+	if (xfs_is_rtgroup_data_loaded(mp))
+		xfs_free_rtgroups(mp);
 	if (xfs_is_perag_data_loaded(mp))
 		libxfs_free_perag(mp);
 
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 5f2250ac5e2..ce255ec3a87 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -154,6 +154,7 @@
 #define xfs_free_extent			libxfs_free_extent
 #define xfs_free_extent_later		libxfs_free_extent_later
 #define xfs_free_perag			libxfs_free_perag
+#define xfs_free_rtgroups		libxfs_free_rtgroups
 #define xfs_fs_geometry			libxfs_fs_geometry
 #define xfs_get_projid			libxfs_get_projid
 #define xfs_get_initial_prid		libxfs_get_initial_prid
@@ -197,6 +198,7 @@
 
 #define xfs_initialize_perag		libxfs_initialize_perag
 #define xfs_initialize_perag_data	libxfs_initialize_perag_data
+#define xfs_initialize_rtgroups		libxfs_initialize_rtgroups
 #define xfs_init_local_fork		libxfs_init_local_fork
 
 #define xfs_inobt_maxrecs		libxfs_inobt_maxrecs
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 0636ca97622..3bd93c01bf4 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -184,6 +184,14 @@ typedef struct xfs_sb {
 	 */
 	xfs_ino_t	sb_metadirino;
 
+	/*
+	 * Realtime group geometry information.  On disk these fields live in
+	 * the rsumino slot, but we cache them separately in the in-core super
+	 * for easy access.
+	 */
+	xfs_rgblock_t	sb_rgblocks;	/* size of a realtime group */
+	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
new file mode 100644
index 00000000000..02c0c592c91
--- /dev/null
+++ b/libxfs/xfs_rtgroup.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+
+/*
+ * Passive reference counting access wrappers to the rtgroup structures.  If
+ * the rtgroup structure is to be freed, the freeing code is responsible for
+ * cleaning up objects with passive references before freeing the structure.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_get(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = radix_tree_lookup(&mp->m_rtgroup_tree, rgno);
+	if (rtg) {
+		trace_xfs_rtgroup_get(rtg, _RET_IP_);
+		ASSERT(atomic_read(&rtg->rtg_ref) >= 0);
+		atomic_inc(&rtg->rtg_ref);
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+/* Get a passive reference to the given rtgroup. */
+struct xfs_rtgroup *
+xfs_rtgroup_hold(
+	struct xfs_rtgroup	*rtg)
+{
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0 ||
+	       atomic_read(&rtg->rtg_active_ref) > 0);
+
+	trace_xfs_rtgroup_hold(rtg, _RET_IP_);
+	atomic_inc(&rtg->rtg_ref);
+	return rtg;
+}
+
+void
+xfs_rtgroup_put(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_put(rtg, _RET_IP_);
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0);
+	atomic_dec(&rtg->rtg_ref);
+}
+
+/*
+ * Active references for rtgroup structures. This is for short term access to
+ * the rtgroup structures for walking trees or accessing state. If an rtgroup
+ * is being shrunk or is offline, then this will fail to find that group and
+ * return NULL instead.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_grab(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = radix_tree_lookup(&mp->m_rtgroup_tree, agno);
+	if (rtg) {
+		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
+		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
+			rtg = NULL;
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+void
+xfs_rtgroup_rele(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
+	if (atomic_dec_and_test(&rtg->rtg_active_ref))
+		wake_up(&rtg->rtg_active_wq);
+}
+
+int
+xfs_initialize_rtgroups(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgcount)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		index;
+	xfs_rgnumber_t		first_initialised = NULLRGNUMBER;
+	int			error;
+
+	if (!xfs_has_rtgroups(mp))
+		return 0;
+
+	/*
+	 * Walk the current rtgroup tree so we don't try to initialise rt
+	 * groups that already exist (growfs case). Allocate and insert all the
+	 * rtgroups we don't find ready for initialisation.
+	 */
+	for (index = 0; index < rgcount; index++) {
+		rtg = xfs_rtgroup_get(mp, index);
+		if (rtg) {
+			xfs_rtgroup_put(rtg);
+			continue;
+		}
+
+		rtg = kmem_zalloc(sizeof(struct xfs_rtgroup), KM_MAYFAIL);
+		if (!rtg) {
+			error = -ENOMEM;
+			goto out_unwind_new_rtgs;
+		}
+		rtg->rtg_rgno = index;
+		rtg->rtg_mount = mp;
+
+		error = radix_tree_preload(GFP_NOFS);
+		if (error)
+			goto out_free_rtg;
+
+		spin_lock(&mp->m_rtgroup_lock);
+		if (radix_tree_insert(&mp->m_rtgroup_tree, index, rtg)) {
+			WARN_ON_ONCE(1);
+			spin_unlock(&mp->m_rtgroup_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_free_rtg;
+		}
+		spin_unlock(&mp->m_rtgroup_lock);
+		radix_tree_preload_end();
+
+#ifdef __KERNEL__
+		/* Place kernel structure only init below this point. */
+		spin_lock_init(&rtg->rtg_state_lock);
+		init_waitqueue_head(&rtg->rtg_active_wq);
+#endif /* __KERNEL__ */
+
+		/* Active ref owned by mount indicates rtgroup is online. */
+		atomic_set(&rtg->rtg_active_ref, 1);
+
+		/* first new rtg is fully initialized */
+		if (first_initialised == NULLRGNUMBER)
+			first_initialised = index;
+	}
+
+	return 0;
+
+out_free_rtg:
+	kmem_free(rtg);
+out_unwind_new_rtgs:
+	/* unwind any prior newly initialized rtgs */
+	for (index = first_initialised; index < rgcount; index++) {
+		rtg = radix_tree_delete(&mp->m_rtgroup_tree, index);
+		if (!rtg)
+			break;
+		kmem_free(rtg);
+	}
+	return error;
+}
+
+STATIC void
+__xfs_free_rtgroups(
+	struct rcu_head		*head)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rtg = container_of(head, struct xfs_rtgroup, rcu_head);
+	kmem_free(rtg);
+}
+
+/*
+ * Free up the rtgroup resources associated with the mount structure.
+ */
+void
+xfs_free_rtgroups(
+	struct xfs_mount	*mp)
+{
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
+
+	if (!xfs_has_rtgroups(mp))
+		return;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		spin_lock(&mp->m_rtgroup_lock);
+		rtg = radix_tree_delete(&mp->m_rtgroup_tree, rgno);
+		spin_unlock(&mp->m_rtgroup_lock);
+		ASSERT(rtg);
+		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+
+		/* drop the mount's active reference */
+		xfs_rtgroup_rele(rtg);
+		XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_active_ref) != 0);
+
+		call_rcu(&rtg->rcu_head, __xfs_free_rtgroups);
+	}
+}
+
+/* Find the size of the rtgroup, in blocks. */
+static xfs_rgblock_t
+__xfs_rtgroup_block_count(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgnumber_t		rgcount,
+	xfs_rfsblock_t		rblocks)
+{
+	ASSERT(rgno < rgcount);
+
+	if (rgno < rgcount - 1)
+		return mp->m_sb.sb_rgblocks;
+	return xfs_rtb_rounddown_rtx(mp,
+			rblocks - (rgno * mp->m_sb.sb_rgblocks));
+}
+
+/* Compute the number of blocks in this realtime group. */
+xfs_rgblock_t
+xfs_rtgroup_block_count(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	return __xfs_rtgroup_block_count(mp, rgno, mp->m_sb.sb_rgcount,
+			mp->m_sb.sb_rblocks);
+}
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
new file mode 100644
index 00000000000..2f0a670217c
--- /dev/null
+++ b/libxfs/xfs_rtgroup.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_RTGROUP_H
+#define __LIBXFS_RTGROUP_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Realtime group incore structure, similar to the per-AG structure.
+ */
+struct xfs_rtgroup {
+	struct xfs_mount	*rtg_mount;
+	xfs_rgnumber_t		rtg_rgno;
+	atomic_t		rtg_ref;	/* passive reference count */
+	atomic_t		rtg_active_ref;	/* active reference count */
+	wait_queue_head_t	rtg_active_wq;/* woken active_ref falls to zero */
+
+	/* for rcu-safe freeing */
+	struct rcu_head		rcu_head;
+
+	/* Number of blocks in this group */
+	xfs_rgblock_t		rtg_blockcount;
+
+#ifdef __KERNEL__
+	/* -- kernel only structures below this line -- */
+	spinlock_t		rtg_state_lock;
+#endif /* __KERNEL__ */
+};
+
+#ifdef CONFIG_XFS_RT
+/* Passive rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_get(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+struct xfs_rtgroup *xfs_rtgroup_hold(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_put(struct xfs_rtgroup *rtg);
+
+/* Active rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_grab(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_rele(struct xfs_rtgroup *rtg);
+
+int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t rgcount);
+void xfs_free_rtgroups(struct xfs_mount *mp);
+#else
+static inline struct xfs_rtgroup *
+xfs_rtgroup_get(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	return NULL;
+}
+static inline struct xfs_rtgroup *
+xfs_rtgroup_hold(struct xfs_rtgroup *rtg)
+{
+	ASSERT(rtg == NULL);
+	return NULL;
+}
+# define xfs_rtgroup_grab			xfs_rtgroup_get
+# define xfs_rtgroup_put(rtg)			((void)0)
+# define xfs_rtgroup_rele(rtg)			((void)0)
+# define xfs_initialize_rtgroups(mp, rgcount)	(0)
+# define xfs_free_rtgroups(mp)			((void)0)
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * rt group iteration APIs
+ */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgnumber_t		*rgno,
+	xfs_rgnumber_t		end_rgno)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	*rgno = rtg->rtg_rgno + 1;
+	xfs_rtgroup_rele(rtg);
+	if (*rgno > end_rgno)
+		return NULL;
+	return xfs_rtgroup_grab(mp, *rgno);
+}
+
+#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \
+	for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \
+		(rtg) != NULL; \
+		(rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno)))
+
+#define for_each_rtgroup_from(mp, rgno, rtg) \
+	for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg))
+
+
+#define for_each_rtgroup(mp, rgno, rtg) \
+	(rgno) = 0; \
+	for_each_rtgroup_from((mp), (rgno), (rtg))
+
+static inline bool
+xfs_verify_rgbno(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno)
+{
+	if (rgbno >= rtg->rtg_blockcount)
+		return false;
+	if (rgbno < rtg->rtg_mount->m_sb.sb_rextsize)
+		return false;
+	return true;
+}
+
+static inline bool
+xfs_verify_rgbext(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno,
+	xfs_rgblock_t		len)
+{
+	if (rgbno + len <= rgbno)
+		return false;
+
+	if (!xfs_verify_rgbno(rtg, rgbno))
+		return false;
+
+	return xfs_verify_rgbno(rtg, rgbno + len - 1);
+}
+
+#ifdef CONFIG_XFS_RT
+xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
+		xfs_rgnumber_t rgno);
+#else
+# define xfs_rtgroup_block_count(mp, rgno)	(0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __LIBXFS_RTGROUP_H */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 719da346d36..f3a5af60e2c 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -699,6 +699,9 @@ __xfs_sb_from_disk(
 		to->sb_gquotino = NULLFSINO;
 		to->sb_pquotino = NULLFSINO;
 	}
+
+	to->sb_rgcount = 0;
+	to->sb_rgblocks = 0;
 }
 
 void
@@ -1013,6 +1016,8 @@ xfs_sb_mount_common(
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+	mp->m_rgblklog = 0;
+	mp->m_rgblkmask = 0;
 
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/libxfs/xfs_types.h b/libxfs/xfs_types.h
index 5556615a2ff..195471c4385 100644
--- a/libxfs/xfs_types.h
+++ b/libxfs/xfs_types.h
@@ -9,10 +9,12 @@
 typedef uint32_t	prid_t;		/* project ID */
 
 typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
 typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
 typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
 typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
 typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
+typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
 typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
 typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
 typedef int64_t		xfs_fsize_t;	/* bytes in a file */
@@ -54,7 +56,9 @@ typedef void *		xfs_failaddr_t;
 #define	NULLRTEXTNO	((xfs_rtxnum_t)-1)
 
 #define	NULLAGBLOCK	((xfs_agblock_t)-1)
+#define NULLRGBLOCK	((xfs_rgblock_t)-1)
 #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
 
 #define NULLCOMMITLSN	((xfs_lsn_t)-1)
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 02/52] xfs: define the format of rt groups
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (5 preceding siblings ...)
  2023-12-31 23:47   ` [PATCH 01/52] xfs: create incore realtime group structures Darrick J. Wong
@ 2023-12-31 23:48   ` Darrick J. Wong
  2023-12-31 23:48   ` [PATCH 03/52] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
                     ` (44 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:48 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define the ondisk format of realtime group metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c         |    8 ----
 include/libxfs.h     |    2 +
 libfrog/util.c       |   14 +++++++
 libfrog/util.h       |    2 +
 libxfs/libxfs_priv.h |   19 ++++++++++
 libxfs/xfs_format.h  |   62 ++++++++++++++++++++++++++++++++-
 libxfs/xfs_ondisk.h  |    1 +
 libxfs/xfs_rtgroup.c |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h |   83 ++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_sb.c      |   86 +++++++++++++++++++++++++++++++++++++++++++--
 libxfs/xfs_shared.h  |    1 +
 mkfs/xfs_mkfs.c      |    1 +
 repair/sb.c          |    1 +
 13 files changed, 362 insertions(+), 13 deletions(-)


diff --git a/db/convert.c b/db/convert.c
index 048c1766f81..7c0b1edc235 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -413,14 +413,6 @@ convert_f(int argc, char **argv)
 	return 0;
 }
 
-static inline xfs_rtblock_t
-xfs_daddr_to_rtb(
-	struct xfs_mount	*mp,
-	xfs_daddr_t		daddr)
-{
-	return daddr >> mp->m_blkbb_log;
-}
-
 static inline uint64_t
 rt_daddr_to_rsumblock(
 	struct xfs_mount	*mp,
diff --git a/include/libxfs.h b/include/libxfs.h
index 7fa061d5e1b..8d2e321b914 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -91,6 +91,8 @@ struct iomap;
 #include "xfs_parent.h"
 #include "xfs_imeta.h"
 #include "imeta_utils.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/libfrog/util.c b/libfrog/util.c
index 8fb10cf82f5..46047571a55 100644
--- a/libfrog/util.c
+++ b/libfrog/util.c
@@ -22,3 +22,17 @@ log2_roundup(unsigned int i)
 	}
 	return rval;
 }
+
+void *
+memchr_inv(const void *start, int c, size_t bytes)
+{
+	const unsigned char	*p = start;
+
+	while (bytes > 0) {
+		if (*p != (unsigned char)c)
+			return (void *)p;
+		bytes--;
+	}
+
+	return NULL;
+}
diff --git a/libfrog/util.h b/libfrog/util.h
index 1b97881bf16..ac2f331c93e 100644
--- a/libfrog/util.h
+++ b/libfrog/util.h
@@ -8,4 +8,6 @@
 
 unsigned int	log2_roundup(unsigned int i);
 
+void *memchr_inv(const void *start, int c, size_t bytes);
+
 #endif /* __LIBFROG_UTIL_H__ */
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 57b8edc54a5..4e4a51637e6 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -52,6 +52,7 @@
 #include "libfrog/radix-tree.h"
 #include "libfrog/bitmask.h"
 #include "libfrog/div64.h"
+#include "libfrog/util.h"
 #include "atomic.h"
 #include "spinlock.h"
 #include "linux-err.h"
@@ -391,6 +392,24 @@ static inline unsigned long long mask64_if_power2(unsigned long b)
 	return is_power_of_2(b) ? b - 1 : 0;
 }
 
+/* If @b is a power of 2, return log2(b).  Else return zero. */
+static inline unsigned int log2_if_power(unsigned long b)
+{
+	unsigned long	mask = 1;
+	unsigned int	i;
+	unsigned int	ret = 1;
+
+	if (!is_power_of_2(b))
+	       return 0;
+
+	for (i = 0; i < NBBY * sizeof(unsigned long); i++, mask <<= 1) {
+		if (b & mask)
+			ret = i;
+	}
+
+	return ret;
+}
+
 /* buffer management */
 #define XBF_TRYLOCK			0
 #define XBF_UNMAPPED			0
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 3bd93c01bf4..8debe925716 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -216,7 +216,17 @@ struct xfs_dsb {
 	 * pointers are no longer used.
 	 */
 	__be64		sb_rbmino;
-	__be64		sb_rsumino;	/* summary inode for rt bitmap */
+	/*
+	 * rtgroups requires metadir, so we reuse the rsumino space to hold
+	 * the rg block count and shift values.
+	 */
+	union {
+		__be64	sb_rsumino;	/* summary inode for rt bitmap */
+		struct {
+			__be32	sb_rgcount;	/* # of realtime groups */
+			__be32	sb_rgblocks;	/* rtblocks per group */
+		};
+	};
 	__be32		sb_rextsize;	/* realtime extent size, blocks */
 	__be32		sb_agblocks;	/* size of an allocation group */
 	__be32		sb_agcount;	/* number of allocation groups */
@@ -398,6 +408,7 @@ xfs_sb_has_ro_compat_feature(
 #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4)	/* needs xfs_repair */
 #define XFS_SB_FEAT_INCOMPAT_NREXT64	(1 << 5)	/* large extent counters */
 #define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 6)	/* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_RTGROUPS	(1U << 30)	/* realtime groups */
 #define XFS_SB_FEAT_INCOMPAT_METADIR	(1U << 31)	/* metadata dir tree */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
 		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
@@ -752,6 +763,55 @@ union xfs_suminfo_raw {
 	__u32		old;
 };
 
+/*
+ * Realtime allocation groups break the rt section into multiple pieces that
+ * could be locked independently.  Realtime block group numbers are 32-bit
+ * quantities.  Block numbers within a group are also 32-bit quantities, but
+ * the upper bit must never be set.
+ */
+#define XFS_MAX_RGBLOCKS	((xfs_rgblock_t)(1U << 31) - 1)
+#define XFS_MAX_RGNUMBER	((xfs_rgnumber_t)(-1U))
+
+#define XFS_RTSB_MAGIC	0x58524750	/* 'XRGP' */
+
+/*
+ * Realtime superblock - on disk version.  Must be padded to 64 bit alignment.
+ * The first block of each realtime group contains this superblock; this is
+ * how we avoid having file data extents cross a group boundary.
+ */
+struct xfs_rtsb {
+	__be32		rsb_magicnum;	/* magic number == XFS_RTSB_MAGIC */
+	__be32		rsb_blocksize;	/* logical block size, bytes */
+	__be64		rsb_rblocks;	/* number of realtime blocks */
+
+	__be64		rsb_rextents;	/* number of realtime extents */
+	__be64		rsb_lsn;	/* last write sequence */
+
+	__be32		rsb_rgcount;	/* # of realtime groups */
+	unsigned char	rsb_fname[XFSLABEL_MAX]; /* rt volume name */
+
+	uuid_t		rsb_uuid;	/* user-visible file system unique id */
+
+	__be32		rsb_rextsize;	/* realtime extent size, blocks */
+	__be32		rsb_rbmblocks;	/* number of rt bitmap blocks */
+
+	__be32		rsb_rgblocks;	/* rt blocks per group */
+	__u8		rsb_blocklog;	/* log2 of sb_blocksize */
+	__u8		rsb_sectlog;	/* log2 of sb_sectsize */
+	__u8		rsb_rextslog;	/* log2 of sb_rextents */
+	__u8		rsb_pad;
+
+	__le32		rsb_crc;	/* superblock crc */
+	__le32		rsb_pad2;
+
+	uuid_t		rsb_meta_uuid;	/* metadata file system unique id */
+
+	/* must be padded to 64 bit alignment */
+};
+
+#define XFS_RTSB_CRC_OFF	offsetof(struct xfs_rtsb, rsb_crc)
+#define XFS_RTSB_DADDR		((xfs_daddr_t)0) /* daddr in rt section */
+
 /*
  * XFS Timestamps
  * ==============
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 832d96f0f3c..65219d4cf99 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -53,6 +53,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,			4);
 	XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,			4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb,			104);
 
 	/* dir/attr trees */
 	XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr,	80);
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 02c0c592c91..c72ef884909 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -247,3 +247,98 @@ xfs_rtgroup_block_count(
 	return __xfs_rtgroup_block_count(mp, rgno, mp->m_sb.sb_rgcount,
 			mp->m_sb.sb_rblocks);
 }
+
+static xfs_failaddr_t
+xfs_rtsb_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_rtsb		*rsb = bp->b_addr;
+
+	if (!xfs_verify_magic(bp, rsb->rsb_magicnum))
+		return __this_address;
+	if (be32_to_cpu(rsb->rsb_blocksize) != mp->m_sb.sb_blocksize)
+		return __this_address;
+	if (be64_to_cpu(rsb->rsb_rblocks) != mp->m_sb.sb_rblocks)
+		return __this_address;
+
+	if (be64_to_cpu(rsb->rsb_rextents) != mp->m_sb.sb_rextents)
+		return __this_address;
+
+	if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid))
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rgcount) != mp->m_sb.sb_rgcount)
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rextsize) != mp->m_sb.sb_rextsize)
+		return __this_address;
+	if (be32_to_cpu(rsb->rsb_rbmblocks) != mp->m_sb.sb_rbmblocks)
+		return __this_address;
+
+	if (be32_to_cpu(rsb->rsb_rgblocks) != mp->m_sb.sb_rgblocks)
+		return __this_address;
+	if (rsb->rsb_blocklog != mp->m_sb.sb_blocklog)
+		return __this_address;
+	if (rsb->rsb_sectlog != mp->m_sb.sb_sectlog)
+		return __this_address;
+	if (rsb->rsb_rextslog != mp->m_sb.sb_rextslog)
+		return __this_address;
+	if (rsb->rsb_pad)
+		return __this_address;
+
+	if (rsb->rsb_pad2)
+		return __this_address;
+
+	if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+		return __this_address;
+
+	/* Everything to the end of the fs block must be zero */
+	if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb)))
+		return __this_address;
+
+	return NULL;
+}
+
+static void
+xfs_rtsb_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF))
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	else {
+		fa = xfs_rtsb_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	}
+}
+
+static void
+xfs_rtsb_write_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_rtsb		*rsb = bp->b_addr;
+	struct xfs_buf_log_item	*bip = bp->b_log_item;
+	xfs_failaddr_t		fa;
+
+	fa = xfs_rtsb_verify(bp);
+	if (fa) {
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+
+	if (bip)
+		rsb->rsb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_rtsb_buf_ops = {
+	.name = "xfs_rtsb",
+	.magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) },
+	.verify_read = xfs_rtsb_read_verify,
+	.verify_write = xfs_rtsb_write_verify,
+	.verify_struct = xfs_rtsb_verify,
+};
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 2f0a670217c..924c8c95acb 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -122,6 +122,89 @@ xfs_verify_rgbext(
 	return xfs_verify_rgbno(rtg, rgbno + len - 1);
 }
 
+static inline xfs_rtblock_t
+xfs_rgbno_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno)
+{
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0)
+		return ((xfs_rtblock_t)rgno << mp->m_rgblklog) | rgbno;
+
+	return ((xfs_rtblock_t)rgno * mp->m_sb.sb_rgblocks) + rgbno;
+}
+
+static inline xfs_rgnumber_t
+xfs_rtb_to_rgno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0)
+		return rtbno >> mp->m_rgblklog;
+
+	return div_u64(rtbno, mp->m_sb.sb_rgblocks);
+}
+
+static inline xfs_rgblock_t
+xfs_rtb_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno,
+	xfs_rgnumber_t		*rgno)
+{
+	uint32_t		rem;
+
+	ASSERT(xfs_has_rtgroups(mp));
+
+	if (mp->m_rgblklog >= 0) {
+		*rgno = rtbno >> mp->m_rgblklog;
+		return rtbno & mp->m_rgblkmask;
+	}
+
+	*rgno = div_u64_rem(rtbno, mp->m_sb.sb_rgblocks, &rem);
+	return rem;
+}
+
+static inline xfs_daddr_t
+xfs_rtb_to_daddr(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	return rtbno << mp->m_blkbb_log;
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	return daddr >> mp->m_blkbb_log;
+}
+
+static inline xfs_rgnumber_t
+xfs_daddr_to_rgno(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	xfs_rtblock_t		rtb = daddr >> mp->m_blkbb_log;
+
+	return xfs_rtb_to_rgno(mp, rtb);
+}
+
+static inline xfs_rgblock_t
+xfs_daddr_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	xfs_rtblock_t		rtb = daddr >> mp->m_blkbb_log;
+	xfs_rgnumber_t		rgno;
+
+	return xfs_rtb_to_rgbno(mp, rtb, &rgno);
+}
+
 #ifdef CONFIG_XFS_RT
 xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 		xfs_rgnumber_t rgno);
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index f3a5af60e2c..95cb070aab5 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -178,6 +178,8 @@ xfs_sb_version_to_features(
 		features |= XFS_FEAT_PARENT;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
 		features |= XFS_FEAT_METADIR;
+	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS)
+		features |= XFS_FEAT_RTGROUPS;
 
 	return features;
 }
@@ -305,6 +307,64 @@ xfs_validate_sb_write(
 	return 0;
 }
 
+static int
+xfs_validate_sb_rtgroups(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*sbp)
+{
+	uint64_t		groups;
+
+	if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+		xfs_warn(mp,
+"Realtime groups require metadata directory tree.");
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks > XFS_MAX_RGBLOCKS) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be less than %u.",
+			 sbp->sb_rgblocks, XFS_MAX_RGBLOCKS);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rextsize == 0) {
+		xfs_warn(mp,
+"Realtime extent size must not be zero.");
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks % sbp->sb_rextsize != 0) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be an even multiple of extent size (%u).",
+			 sbp->sb_rgblocks, sbp->sb_rextsize);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgblocks < (sbp->sb_rextsize << 1)) {
+		xfs_warn(mp,
+"Realtime group size (%u) must be greater than 1 rt extent.",
+			 sbp->sb_rgblocks);
+		return -EINVAL;
+	}
+
+	if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) {
+		xfs_warn(mp,
+"Realtime groups (%u) must be less than %u.",
+			 sbp->sb_rgcount, XFS_MAX_RGNUMBER);
+		return -EINVAL;
+	}
+
+	groups = howmany_64(sbp->sb_rblocks, sbp->sb_rgblocks);
+	if (groups != sbp->sb_rgcount) {
+		xfs_warn(mp,
+"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.",
+			sbp->sb_rgcount, groups);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /* Check the validity of the SB. */
 STATIC int
 xfs_validate_sb_common(
@@ -316,6 +376,7 @@ xfs_validate_sb_common(
 	uint32_t		agcount = 0;
 	uint32_t		rem;
 	bool			has_dalign;
+	int			error;
 
 	if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
 		xfs_warn(mp,
@@ -365,6 +426,12 @@ xfs_validate_sb_common(
 				return -EINVAL;
 			}
 		}
+
+		if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+			error = xfs_validate_sb_rtgroups(mp, sbp);
+			if (error)
+				return error;
+		}
 	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
 				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
 			xfs_notice(mp,
@@ -700,8 +767,13 @@ __xfs_sb_from_disk(
 		to->sb_pquotino = NULLFSINO;
 	}
 
-	to->sb_rgcount = 0;
-	to->sb_rgblocks = 0;
+	if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+		to->sb_rgcount = be32_to_cpu(from->sb_rgcount);
+		to->sb_rgblocks = be32_to_cpu(from->sb_rgblocks);
+	} else {
+		to->sb_rgcount = 0;
+		to->sb_rgblocks = 0;
+	}
 }
 
 void
@@ -861,6 +933,12 @@ xfs_sb_to_disk(
 		to->sb_gquotino = cpu_to_be64(NULLFSINO);
 		to->sb_pquotino = cpu_to_be64(NULLFSINO);
 	}
+
+	if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+		/* must come after setting to_rsumino */
+		to->sb_rgcount = cpu_to_be32(from->sb_rgcount);
+		to->sb_rgblocks = cpu_to_be32(from->sb_rgblocks);
+	}
 }
 
 /*
@@ -1016,8 +1094,8 @@ xfs_sb_mount_common(
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
-	mp->m_rgblklog = 0;
-	mp->m_rgblkmask = 0;
+	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
+	mp->m_rgblkmask = mask64_if_power2(sbp->sb_rgblocks);
 
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index 2cecebe0188..f76d2789e1c 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index de4d1b25c26..8d2e832d126 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
+#include <stddef.h>
 #include "libfrog/util.h"
 #include "libxfs.h"
 #include <ctype.h>
diff --git a/repair/sb.c b/repair/sb.c
index 5f2a08136d7..8292a0f3c3e 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
+#include <stddef.h>
 #include "libfrog/util.h"
 #include "libxfs.h"
 #include "libxcmd.h"


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 03/52] xfs: reduce rt summary file levels for rtgroups filesystems
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (6 preceding siblings ...)
  2023-12-31 23:48   ` [PATCH 02/52] xfs: define the format of rt groups Darrick J. Wong
@ 2023-12-31 23:48   ` Darrick J. Wong
  2023-12-31 23:48   ` [PATCH 04/52] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
                     ` (43 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:48 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The rt summary file is supposed to be large enough to track the number
of log2(rtextentcount) free space extents that start in a given rt
bitmap block.  Prior to rt groups, there could be a single 2^52 block
free extent, which implies a summary file with 53 levels.

However, each rtgroup uses its first rt extent to hold a superblock,
so there can't be any free extents longer than the length of a group.
Groups are limited to 2^32-1 blocks, which means that the longest
freespace will be counted in level 31.  Hence we only need 32 levels.

Adjust the rextslog computation to create smaller rt summary files for
rtgroups filesystems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h   |    2 +-
 libxfs/xfs_rtbitmap.c |   11 +++++++++++
 libxfs/xfs_rtbitmap.h |    4 ++--
 libxfs/xfs_sb.c       |    2 +-
 mkfs/xfs_mkfs.c       |    2 +-
 repair/sb.c           |    3 ++-
 6 files changed, 18 insertions(+), 6 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 8debe925716..43e66740e2a 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -313,7 +313,7 @@ struct xfs_dsb {
 
 #define	XFS_SB_VERSION_NUM(sbp)	((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
 
-static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp)
 {
 	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
 }
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index 69c70c89c96..c2970b78ce8 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -1142,10 +1142,21 @@ xfs_rtbitmap_blockcount(
  */
 uint8_t
 xfs_compute_rextslog(
+	const struct xfs_sb	*sbp,
 	xfs_rtbxlen_t		rtextents)
 {
 	if (!rtextents)
 		return 0;
+
+	/*
+	 * Realtime groups are never larger than 2^32 extents and are never
+	 * fully free, so we can use highbit32 on the number of rtextents per
+	 * group.
+	 */
+	if (xfs_sb_is_v5(sbp) &&
+	    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
+		return xfs_highbit32(sbp->sb_rgblocks / sbp->sb_rextsize);
+
 	return xfs_highbit64(rtextents);
 }
 
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index 6ac17f0195e..3de0ec2d241 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -351,7 +351,7 @@ xfs_rtfree_extent(
 int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
 		xfs_filblks_t rtlen);
 
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+uint8_t xfs_compute_rextslog(const struct xfs_sb *sbp, xfs_rtbxlen_t rtextents);
 
 /* Do we support an rt volume having this number of rtextents? */
 static inline bool
@@ -396,7 +396,7 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
 # define xfs_rtsummary_read_buf(a,b)			(-ENOSYS)
 # define xfs_rtbuf_cache_relse(a)			(0)
 # define xfs_rtalloc_extent_is_free(m,t,s,l,i)		(-ENOSYS)
-# define xfs_compute_rextslog(rtx)			(0)
+# define xfs_compute_rextslog(sbp, rtx)			(0)
 # define xfs_validate_rtextents(rtx)			(false)
 static inline xfs_filblks_t
 xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 95cb070aab5..b5e4367d4ca 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -581,7 +581,7 @@ xfs_validate_sb_common(
 
 		if (!xfs_validate_rtextents(rexts) ||
 		    sbp->sb_rextents != rexts ||
-		    sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
+		    sbp->sb_rextslog != xfs_compute_rextslog(sbp, rexts) ||
 		    sbp->sb_rbmblocks != rbmblocks) {
 			xfs_notice(mp,
 				"realtime geometry sanity check failed");
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 8d2e832d126..2330ebebfae 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3983,7 +3983,7 @@ finish_superblock_setup(
 	sbp->sb_agcount = (xfs_agnumber_t)cfg->agcount;
 	sbp->sb_rbmblocks = cfg->rtbmblocks;
 	sbp->sb_logblocks = (xfs_extlen_t)cfg->logblocks;
-	sbp->sb_rextslog = libxfs_compute_rextslog(cfg->rtextents);
+	sbp->sb_rextslog = libxfs_compute_rextslog(sbp, cfg->rtextents);
 	sbp->sb_imax_pct = cfg->imaxpct;
 	sbp->sb_icount = 0;
 	sbp->sb_ifree = 0;
diff --git a/repair/sb.c b/repair/sb.c
index 8292a0f3c3e..32602d84f3c 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -482,7 +482,8 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
 		if (sb->sb_rextents == 0)
 			return XR_BAD_RT_GEO_DATA;
 
-		if (sb->sb_rextslog != libxfs_compute_rextslog(sb->sb_rextents))
+		if (sb->sb_rextslog != libxfs_compute_rextslog(sb,
+							sb->sb_rextents))
 			return(XR_BAD_RT_GEO_DATA);
 
 		if (sb->sb_rbmblocks != (xfs_extlen_t) howmany(sb->sb_rextents,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 04/52] xfs: update primary realtime super every time we update the primary fs super
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (7 preceding siblings ...)
  2023-12-31 23:48   ` [PATCH 03/52] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
@ 2023-12-31 23:48   ` Darrick J. Wong
  2023-12-31 23:48   ` [PATCH 05/52] xfs: write secondary realtime superblocks to disk Darrick J. Wong
                     ` (42 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:48 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Every time we update parts of the primary filesystem superblock that are
echoed in the primary rt super, we should update that primary realtime
super.  Avoid an ondisk log format change by using ordered buffers to
write the primary rt super.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_trans.h      |    1 +
 libxfs/libxfs_api_defs.h |    1 +
 libxfs/libxfs_io.h       |    1 +
 libxfs/rdwr.c            |   17 +++++++++++
 libxfs/trans.c           |   29 ++++++++++++++++++
 libxfs/xfs_rtgroup.c     |   74 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h     |    6 ++++
 libxfs/xfs_sb.c          |   13 ++++++++
 8 files changed, 142 insertions(+)


diff --git a/include/xfs_trans.h b/include/xfs_trans.h
index 183163e81a5..630bc85ce37 100644
--- a/include/xfs_trans.h
+++ b/include/xfs_trans.h
@@ -108,6 +108,7 @@ int	libxfs_trans_reserve_more(struct xfs_trans *tp, uint blocks,
 void xfs_defer_cancel(struct xfs_trans *);
 
 struct xfs_buf *libxfs_trans_getsb(struct xfs_trans *);
+struct xfs_buf *libxfs_trans_getrtsb(struct xfs_trans *tp);
 
 void	libxfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void	libxfs_trans_log_inode (struct xfs_trans *, struct xfs_inode *,
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index ce255ec3a87..fff0e6d0f60 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -309,6 +309,7 @@
 #define xfs_trans_dirty_buf		libxfs_trans_dirty_buf
 #define xfs_trans_get_buf		libxfs_trans_get_buf
 #define xfs_trans_get_buf_map		libxfs_trans_get_buf_map
+#define xfs_trans_getrtsb		libxfs_trans_getrtsb
 #define xfs_trans_getsb			libxfs_trans_getsb
 #define xfs_trans_ichgtime		libxfs_trans_ichgtime
 #define xfs_trans_ijoin			libxfs_trans_ijoin
diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
index 8a99955ee73..f118cb5e836 100644
--- a/libxfs/libxfs_io.h
+++ b/libxfs/libxfs_io.h
@@ -189,6 +189,7 @@ libxfs_buf_read(
 
 int libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
 struct xfs_buf *libxfs_getsb(struct xfs_mount *mp);
+struct xfs_buf *libxfs_getrtsb(struct xfs_mount *mp);
 extern void	libxfs_bcache_purge(struct xfs_mount *mp);
 extern void	libxfs_bcache_free(void);
 extern void	libxfs_bcache_flush(struct xfs_mount *mp);
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index ecf10f7946b..17abced06c9 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -164,6 +164,23 @@ libxfs_getsb(
 	return bp;
 }
 
+struct xfs_buf *
+libxfs_getrtsb(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	if (!mp->m_rtdev_targp->bt_bdev)
+		return NULL;
+
+	error = libxfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
+			XFS_FSB_TO_BB(mp, 1), 0, &bp, &xfs_rtsb_buf_ops);
+	if (error)
+		return NULL;
+	return bp;
+}
+
 struct kmem_cache			*xfs_buf_cache;
 
 static struct cache_mru		xfs_buf_freelist =
diff --git a/libxfs/trans.c b/libxfs/trans.c
index 8d969400984..aab9923d9ad 100644
--- a/libxfs/trans.c
+++ b/libxfs/trans.c
@@ -512,6 +512,35 @@ libxfs_trans_getsb(
 	return bp;
 }
 
+struct xfs_buf *
+libxfs_trans_getrtsb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp;
+	struct xfs_buf_log_item	*bip;
+	int			len = XFS_FSS_TO_BB(mp, 1);
+	DEFINE_SINGLE_BUF_MAP(map, XFS_SB_DADDR, len);
+
+	bp = xfs_trans_buf_item_match(tp, mp->m_rtdev, &map, 1);
+	if (bp != NULL) {
+		ASSERT(bp->b_transp == tp);
+		bip = bp->b_log_item;
+		ASSERT(bip != NULL);
+		bip->bli_recur++;
+		trace_xfs_trans_getsb_recur(bip);
+		return bp;
+	}
+
+	bp = libxfs_getrtsb(mp);
+	if (bp == NULL)
+		return NULL;
+
+	_libxfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_getsb(bp->b_log_item);
+	return bp;
+}
+
 int
 libxfs_trans_read_buf_map(
 	struct xfs_mount	*mp,
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index c72ef884909..26b2d3e03e7 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -342,3 +342,77 @@ const struct xfs_buf_ops xfs_rtsb_buf_ops = {
 	.verify_write = xfs_rtsb_write_verify,
 	.verify_struct = xfs_rtsb_verify,
 };
+
+/* Update a realtime superblock from the primary fs super */
+void
+xfs_rtgroup_update_super(
+	struct xfs_buf		*rtsb_bp,
+	const struct xfs_buf	*sb_bp)
+{
+	const struct xfs_dsb	*dsb = sb_bp->b_addr;
+	struct xfs_rtsb		*rsb = rtsb_bp->b_addr;
+	const uuid_t		*meta_uuid;
+
+	rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+	rsb->rsb_blocksize = dsb->sb_blocksize;
+	rsb->rsb_rblocks = dsb->sb_rblocks;
+
+	rsb->rsb_rextents = dsb->sb_rextents;
+	rsb->rsb_lsn = 0;
+
+	memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid));
+
+	rsb->rsb_rgcount = dsb->sb_rgcount;
+	memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX);
+
+	rsb->rsb_rextsize = dsb->sb_rextsize;
+	rsb->rsb_rbmblocks = dsb->sb_rbmblocks;
+
+	rsb->rsb_rgblocks = dsb->sb_rgblocks;
+	rsb->rsb_blocklog = dsb->sb_blocklog;
+	rsb->rsb_sectlog = dsb->sb_sectlog;
+	rsb->rsb_rextslog = dsb->sb_rextslog;
+	rsb->rsb_pad = 0;
+	rsb->rsb_pad2 = 0;
+
+	/*
+	 * The metadata uuid is the fs uuid if the metauuid feature is not
+	 * enabled.
+	 */
+	if (dsb->sb_features_incompat &
+				cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID))
+		meta_uuid = &dsb->sb_meta_uuid;
+	else
+		meta_uuid = &dsb->sb_uuid;
+	memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid));
+}
+
+/*
+ * Update the primary realtime superblock from a filesystem superblock and
+ * log it to the given transaction.
+ */
+void
+xfs_rtgroup_log_super(
+	struct xfs_trans	*tp,
+	const struct xfs_buf	*sb_bp)
+{
+	struct xfs_buf		*rtsb_bp;
+
+	if (!xfs_has_rtgroups(tp->t_mountp))
+		return;
+
+	rtsb_bp = xfs_trans_getrtsb(tp);
+	if (!rtsb_bp) {
+		/*
+		 * It's possible for the rtgroups feature to be enabled but
+		 * there is no incore rt superblock buffer if the rt geometry
+		 * was specified at mkfs time but the rt section has not yet
+		 * been attached.  In this case, rblocks must be zero.
+		 */
+		ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0);
+		return;
+	}
+
+	xfs_rtgroup_update_super(rtsb_bp, sb_bp);
+	xfs_trans_ordered_buf(tp, rtsb_bp);
+}
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 924c8c95acb..83bbd43f727 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -208,8 +208,14 @@ xfs_daddr_to_rgbno(
 #ifdef CONFIG_XFS_RT
 xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 		xfs_rgnumber_t rgno);
+
+void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
+		const struct xfs_buf *sb_bp);
+void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
+# define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
+# define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index b5e4367d4ca..2de270171a4 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -25,6 +25,7 @@
 #include "xfs_ag.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_swapext.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1159,6 +1160,8 @@ xfs_log_sb(
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+	xfs_rtgroup_log_super(tp, bp);
 }
 
 /*
@@ -1275,6 +1278,7 @@ xfs_sync_sb_buf(
 {
 	struct xfs_trans	*tp;
 	struct xfs_buf		*bp;
+	struct xfs_buf		*rtsb_bp = NULL;
 	int			error;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
@@ -1284,6 +1288,11 @@ xfs_sync_sb_buf(
 	bp = xfs_trans_getsb(tp);
 	xfs_log_sb(tp);
 	xfs_trans_bhold(tp, bp);
+	if (xfs_has_rtgroups(mp)) {
+		rtsb_bp = xfs_trans_getrtsb(tp);
+		if (rtsb_bp)
+			xfs_trans_bhold(tp, rtsb_bp);
+	}
 	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp);
 	if (error)
@@ -1292,7 +1301,11 @@ xfs_sync_sb_buf(
 	 * write out the sb buffer to get the changes to disk
 	 */
 	error = xfs_bwrite(bp);
+	if (!error && rtsb_bp)
+		error = xfs_bwrite(rtsb_bp);
 out:
+	if (rtsb_bp)
+		xfs_buf_relse(rtsb_bp);
 	xfs_buf_relse(bp);
 	return error;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 05/52] xfs: write secondary realtime superblocks to disk
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (8 preceding siblings ...)
  2023-12-31 23:48   ` [PATCH 04/52] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
@ 2023-12-31 23:48   ` Darrick J. Wong
  2023-12-31 23:49   ` [PATCH 06/52] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
                     ` (41 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:48 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create some library functions to make it easy to update all the
secondary realtime superblocks on disk; this will be used by growfs,
xfs_db, mkfs, and xfs_repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtgroup.c |  117 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h |    2 +
 2 files changed, 119 insertions(+)


diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 26b2d3e03e7..8bfaa8d06f8 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -416,3 +416,120 @@ xfs_rtgroup_log_super(
 	xfs_rtgroup_update_super(rtsb_bp, sb_bp);
 	xfs_trans_ordered_buf(tp, rtsb_bp);
 }
+
+/* Initialize a secondary realtime superblock. */
+static int
+xfs_rtgroup_init_secondary_super(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_buf		*bp;
+	struct xfs_rtsb		*rsb;
+	xfs_rtblock_t		rtbno;
+	int			error;
+
+	ASSERT(rgno != 0);
+
+	error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
+			0, &bp);
+	if (error)
+		return error;
+
+	rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+	bp->b_maps[0].bm_bn = xfs_rtb_to_daddr(mp, rtbno);
+	bp->b_ops = &xfs_rtsb_buf_ops;
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+
+	rsb = bp->b_addr;
+	rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+	rsb->rsb_blocksize = cpu_to_be32(mp->m_sb.sb_blocksize);
+	rsb->rsb_rblocks = cpu_to_be64(mp->m_sb.sb_rblocks);
+
+	rsb->rsb_rextents = cpu_to_be64(mp->m_sb.sb_rextents);
+
+	memcpy(&rsb->rsb_uuid, &mp->m_sb.sb_uuid, sizeof(rsb->rsb_uuid));
+
+	rsb->rsb_rgcount = cpu_to_be32(mp->m_sb.sb_rgcount);
+	memcpy(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX);
+
+	rsb->rsb_rextsize = cpu_to_be32(mp->m_sb.sb_rextsize);
+	rsb->rsb_rbmblocks = cpu_to_be32(mp->m_sb.sb_rbmblocks);
+
+	rsb->rsb_rgblocks = cpu_to_be32(mp->m_sb.sb_rgblocks);
+	rsb->rsb_blocklog = mp->m_sb.sb_blocklog;
+	rsb->rsb_sectlog = mp->m_sb.sb_sectlog;
+	rsb->rsb_rextslog = mp->m_sb.sb_rextslog;
+
+	memcpy(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid,
+			sizeof(rsb->rsb_meta_uuid));
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Update all the realtime superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The rt super buffers do not need to be kept them in memory once they are
+ * written so we mark them as a one-shot buffer.
+ */
+int
+xfs_rtgroup_update_secondary_sbs(
+	struct xfs_mount	*mp)
+{
+	LIST_HEAD		(buffer_list);
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		start_rgno = 1;
+	int			saved_error = 0;
+	int			error = 0;
+
+	for_each_rtgroup_from(mp, start_rgno, rtg) {
+		struct xfs_buf		*bp;
+
+		error = xfs_rtgroup_init_secondary_super(mp, rtg->rtg_rgno,
+				&bp);
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue.  If we break early, we'll leave more superblocks
+		 * un-updated than updated.
+		 */
+		if (error) {
+			xfs_warn(mp,
+		"error allocating secondary superblock for rt group %d",
+				rtg->rtg_rgno);
+			if (!saved_error)
+				saved_error = error;
+			continue;
+		}
+
+		xfs_buf_oneshot(bp);
+		xfs_buf_delwri_queue(bp, &buffer_list);
+		xfs_buf_relse(bp);
+
+		/* don't hold too many buffers at once */
+		if (rtg->rtg_rgno % 16)
+			continue;
+
+		error = xfs_buf_delwri_submit(&buffer_list);
+		if (error) {
+			xfs_warn(mp,
+	"write error %d updating a secondary superblock near rt group %u",
+				error, rtg->rtg_rgno);
+			if (!saved_error)
+				saved_error = error;
+			continue;
+		}
+	}
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error) {
+		xfs_warn(mp,
+	"write error %d updating a secondary superblock near rt group %u",
+			error, start_rgno);
+	}
+
+	return saved_error ? saved_error : error;
+}
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 83bbd43f727..2d0422c6712 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -212,10 +212,12 @@ xfs_rgblock_t xfs_rtgroup_block_count(struct xfs_mount *mp,
 void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
+int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
+# define xfs_rtgroup_update_secondary_sbs(mp)	(0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 06/52] xfs: grow the realtime section when realtime groups are enabled
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (9 preceding siblings ...)
  2023-12-31 23:48   ` [PATCH 05/52] xfs: write secondary realtime superblocks to disk Darrick J. Wong
@ 2023-12-31 23:49   ` Darrick J. Wong
  2023-12-31 23:49   ` [PATCH 07/52] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
                     ` (40 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:49 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable growing the rt section when realtime groups are enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_shared.h |    1 +
 1 file changed, 1 insertion(+)


diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index f76d2789e1c..65691af0488 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -111,6 +111,7 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_TRANS_SB_RBLOCKS		0x00000800
 #define	XFS_TRANS_SB_REXTENTS		0x00001000
 #define	XFS_TRANS_SB_REXTSLOG		0x00002000
+#define XFS_TRANS_SB_RGCOUNT		0x00004000
 
 /*
  * Here we centralize the specification of XFS meta-data buffer reference count


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 07/52] xfs: export realtime group geometry via XFS_FSOP_GEOM
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (10 preceding siblings ...)
  2023-12-31 23:49   ` [PATCH 06/52] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
@ 2023-12-31 23:49   ` Darrick J. Wong
  2023-12-31 23:49   ` [PATCH 08/52] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
                     ` (39 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:49 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Export the realtime geometry information so that userspace can query it.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h |    4 +++-
 libxfs/xfs_sb.c |    5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index a0efcbde5ae..bae9ef924bf 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -186,7 +186,9 @@ struct xfs_fsop_geom {
 	__u32		logsunit;	/* log stripe unit, bytes	*/
 	uint32_t	sick;		/* o: unhealthy fs & rt metadata */
 	uint32_t	checked;	/* o: checked fs & rt metadata	*/
-	__u64		reserved[17];	/* reserved space		*/
+	__u32		rgblocks;	/* rtblocks in a realtime group	*/
+	__u32		rgcount;	/* number of realtime groups	*/
+	__u64		reserved[16];	/* reserved space		*/
 };
 
 #define XFS_FSOP_GEOM_SICK_COUNTERS	(1 << 0)  /* summary counters */
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 2de270171a4..3bd56edab87 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -1409,6 +1409,11 @@ xfs_fs_geometry(
 		return;
 
 	geo->version = XFS_FSOP_GEOM_VERSION_V5;
+
+	if (xfs_has_rtgroups(mp)) {
+		geo->rgcount = sbp->sb_rgcount;
+		geo->rgblocks = sbp->sb_rgblocks;
+	}
 }
 
 /* Read a secondary superblock. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 08/52] xfs: check that rtblock extents do not overlap with the rt group metadata
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (11 preceding siblings ...)
  2023-12-31 23:49   ` [PATCH 07/52] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
@ 2023-12-31 23:49   ` Darrick J. Wong
  2023-12-31 23:49   ` [PATCH 09/52] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
                     ` (38 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:49 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

The ondisk format specifies that the start of each realtime group must
have a superblock so that rt space mappings never cross an rtgroup
boundary.  Check that rt block pointers obey this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_types.c |   46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)


diff --git a/libxfs/xfs_types.c b/libxfs/xfs_types.c
index f5eab8839e3..6488cda24e8 100644
--- a/libxfs/xfs_types.c
+++ b/libxfs/xfs_types.c
@@ -13,6 +13,8 @@
 #include "xfs_mount.h"
 #include "xfs_ag.h"
 #include "xfs_imeta.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
 
 
 /*
@@ -133,6 +135,26 @@ xfs_verify_dir_ino(
 	return xfs_verify_ino(mp, ino);
 }
 
+/*
+ * Verify that an rtgroup block number pointer neither points outside the
+ * rtgroup nor points at static metadata.
+ */
+static inline bool
+xfs_verify_rgno_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno)
+{
+	xfs_rgblock_t		eorg;
+
+	eorg = xfs_rtgroup_block_count(mp, rgno);
+	if (rgbno >= eorg)
+		return false;
+	if (rgbno < mp->m_sb.sb_rextsize)
+		return false;
+	return true;
+}
+
 /*
  * Verify that an realtime block number pointer doesn't point off the
  * end of the realtime device.
@@ -142,7 +164,20 @@ xfs_verify_rtbno(
 	struct xfs_mount	*mp,
 	xfs_rtblock_t		rtbno)
 {
-	return rtbno < mp->m_sb.sb_rblocks;
+	xfs_rgnumber_t		rgno;
+	xfs_rgblock_t		rgbno;
+
+	if (rtbno >= mp->m_sb.sb_rblocks)
+		return false;
+
+	if (!xfs_has_rtgroups(mp))
+		return true;
+
+	rgbno = xfs_rtb_to_rgbno(mp, rtbno, &rgno);
+	if (rgno >= mp->m_sb.sb_rgcount)
+		return false;
+
+	return xfs_verify_rgno_rgbno(mp, rgno, rgbno);
 }
 
 /* Verify that a realtime device extent is fully contained inside the volume. */
@@ -158,7 +193,14 @@ xfs_verify_rtbext(
 	if (!xfs_verify_rtbno(mp, rtbno))
 		return false;
 
-	return xfs_verify_rtbno(mp, rtbno + len - 1);
+	if (!xfs_verify_rtbno(mp, rtbno + len - 1))
+		return false;
+
+	if (xfs_has_rtgroups(mp) &&
+	    xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1))
+		return false;
+
+	return true;
 }
 
 /* Calculate the range of valid icount values. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 09/52] xfs: add frextents to the lazysbcounters when rtgroups enabled
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (12 preceding siblings ...)
  2023-12-31 23:49   ` [PATCH 08/52] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
@ 2023-12-31 23:49   ` Darrick J. Wong
  2023-12-31 23:50   ` [PATCH 10/52] xfs: record rt group superblock errors in the health system Darrick J. Wong
                     ` (37 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:49 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the free rt extent count a part of the lazy sb counters when the
realtime groups feature is enabled.  This is possible because the patch
to recompute frextents from the rtbitmap during log recovery predates
the code adding rtgroup support, hence we know that the value will
always be correct during runtime.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h |    1 +
 libxfs/xfs_sb.c     |    5 +++++
 2 files changed, 6 insertions(+)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index ec0956c539f..184e7a38fb3 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -39,6 +39,7 @@ typedef struct xfs_mount {
 #define m_icount	m_sb.sb_icount
 #define m_ifree		m_sb.sb_ifree
 #define m_fdblocks	m_sb.sb_fdblocks
+#define m_frextents	m_sb.sb_frextents
 	spinlock_t		m_sb_lock;
 
 	/*
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 3bd56edab87..0ded5220161 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -1148,6 +1148,9 @@ xfs_log_sb(
 	 * sb counters, despite having a percpu counter. It is always kept
 	 * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
 	 * and hence we don't need have to update it here.
+	 *
+	 * sb_frextents was added to the lazy sb counters when the rt groups
+	 * feature was introduced.
 	 */
 	if (xfs_has_lazysbcount(mp)) {
 		mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1156,6 +1159,8 @@ xfs_log_sb(
 				mp->m_sb.sb_icount);
 		mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
 	}
+	if (xfs_has_rtgroups(mp))
+		mp->m_sb.sb_frextents = percpu_counter_sum(&mp->m_frextents);
 
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 10/52] xfs: record rt group superblock errors in the health system
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (13 preceding siblings ...)
  2023-12-31 23:49   ` [PATCH 09/52] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
@ 2023-12-31 23:50   ` Darrick J. Wong
  2023-12-31 23:50   ` [PATCH 11/52] xfs: define locking primitives for realtime groups Darrick J. Wong
                     ` (36 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:50 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Record the state of per-rtgroup metadata sickness in the rtgroup
structure for later reporting.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_health.h  |   28 +++++++++++++++++++++++++++-
 libxfs/xfs_rtgroup.h |    8 ++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index 1816c67351a..f5449a804c6 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -52,6 +52,7 @@ struct xfs_inode;
 struct xfs_fsop_geom;
 struct xfs_btree_cur;
 struct xfs_da_args;
+struct xfs_rtgroup;
 
 /* Observable health issues for metadata spanning the entire filesystem. */
 #define XFS_SICK_FS_COUNTERS	(1 << 0)  /* summary counters */
@@ -66,6 +67,7 @@ struct xfs_da_args;
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
 #define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
+#define XFS_SICK_RT_SUPER	(1 << 2)  /* rt group superblock */
 
 /* Observable health issues for AG metadata. */
 #define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
@@ -110,7 +112,8 @@ struct xfs_da_args;
 				 XFS_SICK_FS_METAPATH)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
-				 XFS_SICK_RT_SUMMARY)
+				 XFS_SICK_RT_SUMMARY | \
+				 XFS_SICK_RT_SUPER)
 
 #define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
 				 XFS_SICK_AG_AGF | \
@@ -192,6 +195,14 @@ void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
 void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
 		unsigned int *checked);
 
+void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		unsigned int mask);
+void xfs_rtgroup_mark_sick(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_mark_checked(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_mark_healthy(struct xfs_rtgroup *rtg, unsigned int mask);
+void xfs_rtgroup_measure_sickness(struct xfs_rtgroup *rtg, unsigned int *sick,
+		unsigned int *checked);
+
 void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
 		unsigned int mask);
 void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
@@ -241,6 +252,15 @@ xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
 	return sick & mask;
 }
 
+static inline bool
+xfs_rtgroup_has_sickness(struct xfs_rtgroup *rtg, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
+	return sick & mask;
+}
+
 static inline bool
 xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
 {
@@ -262,6 +282,12 @@ xfs_rt_is_healthy(struct xfs_mount *mp)
 	return !xfs_rt_has_sickness(mp, -1U);
 }
 
+static inline bool
+xfs_rtgroup_is_healthy(struct xfs_rtgroup *rtg)
+{
+	return !xfs_rtgroup_has_sickness(rtg, -1U);
+}
+
 static inline bool
 xfs_ag_is_healthy(struct xfs_perag *pag)
 {
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 2d0422c6712..c3f4f644ea5 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -25,6 +25,14 @@ struct xfs_rtgroup {
 	/* Number of blocks in this group */
 	xfs_rgblock_t		rtg_blockcount;
 
+	/*
+	 * Bitsets of per-rtgroup metadata that have been checked and/or are
+	 * sick.  Callers should hold rtg_state_lock before accessing this
+	 * field.
+	 */
+	uint16_t		rtg_checked;
+	uint16_t		rtg_sick;
+
 #ifdef __KERNEL__
 	/* -- kernel only structures below this line -- */
 	spinlock_t		rtg_state_lock;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 11/52] xfs: define locking primitives for realtime groups
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (14 preceding siblings ...)
  2023-12-31 23:50   ` [PATCH 10/52] xfs: record rt group superblock errors in the health system Darrick J. Wong
@ 2023-12-31 23:50   ` Darrick J. Wong
  2023-12-31 23:50   ` [PATCH 12/52] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
                     ` (35 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:50 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Define helper functions to lock all metadata inodes related to a
realtime group.  There's not much to look at now, but this will become
important when we add per-rtgroup metadata files and online fsck code
for them.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtgroup.c |   33 +++++++++++++++++++++++++++++++++
 libxfs/xfs_rtgroup.h |   14 ++++++++++++++
 2 files changed, 47 insertions(+)


diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 8bfaa8d06f8..7003ac5c567 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -533,3 +533,36 @@ xfs_rtgroup_update_secondary_sbs(
 
 	return saved_error ? saved_error : error;
 }
+
+/* Lock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_lock(
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
+		xfs_rtbitmap_lock(tp, rtg->rtg_mount);
+	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
+		xfs_rtbitmap_lock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+}
+
+/* Unlock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_unlock(
+	struct xfs_rtgroup	*rtg,
+	unsigned int		rtglock_flags)
+{
+	ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+	if (rtglock_flags & XFS_RTGLOCK_BITMAP)
+		xfs_rtbitmap_unlock(rtg->rtg_mount);
+	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
+		xfs_rtbitmap_unlock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
+}
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index c3f4f644ea5..70968cf700f 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -221,11 +221,25 @@ void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in exclusive mode */
+#define XFS_RTGLOCK_BITMAP		(1U << 0)
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RTGLOCK_BITMAP_SHARED	(1U << 1)
+
+#define XFS_RTGLOCK_ALL_FLAGS	(XFS_RTGLOCK_BITMAP | \
+				 XFS_RTGLOCK_BITMAP_SHARED)
+
+void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		unsigned int rtglock_flags);
+void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
+# define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
+# define xfs_rtgroup_unlock(rtg, gf)		((void)0)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 12/52] xfs: export the geometry of realtime groups to userspace
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (15 preceding siblings ...)
  2023-12-31 23:50   ` [PATCH 11/52] xfs: define locking primitives for realtime groups Darrick J. Wong
@ 2023-12-31 23:50   ` Darrick J. Wong
  2023-12-31 23:50   ` [PATCH 13/52] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
                     ` (34 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:50 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create an ioctl so that the kernel can report the status of realtime
groups to userspace.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/util.c                         |    7 ++
 libxfs/xfs_fs.h                       |    1 
 libxfs/xfs_fs_staging.h               |   17 ++++++
 libxfs/xfs_health.h                   |    2 +
 libxfs/xfs_rtgroup.c                  |   14 +++++
 libxfs/xfs_rtgroup.h                  |    4 +
 man/man2/ioctl_xfs_rtgroup_geometry.2 |   99 +++++++++++++++++++++++++++++++++
 7 files changed, 144 insertions(+)
 create mode 100644 man/man2/ioctl_xfs_rtgroup_geometry.2


diff --git a/libxfs/util.c b/libxfs/util.c
index a3f3ad29933..76e49b8637c 100644
--- a/libxfs/util.c
+++ b/libxfs/util.c
@@ -446,6 +446,13 @@ xfs_fs_mark_healthy(
 }
 
 void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { }
+void
+xfs_rtgroup_geom_health(
+	struct xfs_rtgroup		*rtg,
+	struct xfs_rtgroup_geometry	*rgeo)
+{
+	/* empty */
+}
 void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask) { }
 void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
 		unsigned int mask) { }
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index bae9ef924bf..c5bf53c6a43 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -852,6 +852,7 @@ struct xfs_scrub_metadata {
 /*	XFS_IOC_SCRUBV_METADATA -- staging 60	   */
 #define XFS_IOC_AG_GEOMETRY	_IOWR('X', 61, struct xfs_ag_geometry)
 /*	XFS_IOC_GETPARENTS ---- staging 62         */
+/*	XFS_IOC_RTGROUP_GEOMETRY - staging 63	   */
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h
index 69d29f213af..1f573314877 100644
--- a/libxfs/xfs_fs_staging.h
+++ b/libxfs/xfs_fs_staging.h
@@ -202,4 +202,21 @@ static inline size_t sizeof_xfs_scrub_vec(unsigned int nr)
 
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 60, struct xfs_scrub_vec_head)
 
+/*
+ * Output for XFS_IOC_RTGROUP_GEOMETRY
+ */
+struct xfs_rtgroup_geometry {
+	__u32 rg_number;	/* i/o: rtgroup number */
+	__u32 rg_length;	/* o: length in blocks */
+	__u32 rg_sick;		/* o: sick things in ag */
+	__u32 rg_checked;	/* o: checked metadata in ag */
+	__u32 rg_flags;		/* i/o: flags for this ag */
+	__u32 rg_pad;		/* o: zero */
+	__u64 rg_reserved[13];	/* o: zero */
+};
+#define XFS_RTGROUP_GEOM_SICK_SUPER	(1 << 0)  /* superblock */
+#define XFS_RTGROUP_GEOM_SICK_BITMAP	(1 << 1)  /* rtbitmap for this group */
+
+#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 63, struct xfs_rtgroup_geometry)
+
 #endif /* __XFS_FS_STAGING_H__ */
diff --git a/libxfs/xfs_health.h b/libxfs/xfs_health.h
index f5449a804c6..1e9938a417b 100644
--- a/libxfs/xfs_health.h
+++ b/libxfs/xfs_health.h
@@ -302,6 +302,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip)
 
 void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
 void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg,
+		struct xfs_rtgroup_geometry *rgeo);
 void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
 
 #define xfs_metadata_is_sick(error) \
diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index 7003ac5c567..c503a39b0fc 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -566,3 +566,17 @@ xfs_rtgroup_unlock(
 	else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)
 		xfs_rtbitmap_unlock_shared(rtg->rtg_mount, XFS_RBMLOCK_BITMAP);
 }
+
+/* Retrieve rt group geometry. */
+int
+xfs_rtgroup_get_geometry(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_rtgroup_geometry *rgeo)
+{
+	/* Fill out form. */
+	memset(rgeo, 0, sizeof(*rgeo));
+	rgeo->rg_number = rtg->rtg_rgno;
+	rgeo->rg_length = rtg->rtg_blockcount;
+	xfs_rtgroup_geom_health(rtg, rgeo);
+	return 0;
+}
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index 70968cf700f..e6d60425faa 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -233,6 +233,9 @@ int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
 void xfs_rtgroup_lock(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
 		unsigned int rtglock_flags);
 void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+
+int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
+		struct xfs_rtgroup_geometry *rgeo);
 #else
 # define xfs_rtgroup_block_count(mp, rgno)	(0)
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
@@ -240,6 +243,7 @@ void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
 # define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
 # define xfs_rtgroup_unlock(rtg, gf)		((void)0)
+# define xfs_rtgroup_get_geometry(rtg, rgeo)	(-EOPNOTSUPP)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __LIBXFS_RTGROUP_H */
diff --git a/man/man2/ioctl_xfs_rtgroup_geometry.2 b/man/man2/ioctl_xfs_rtgroup_geometry.2
new file mode 100644
index 00000000000..ccd931d1e17
--- /dev/null
+++ b/man/man2/ioctl_xfs_rtgroup_geometry.2
@@ -0,0 +1,99 @@
+.\" Copyright (c) 2022-2024 Oracle.  All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\" %%%LICENSE_END
+.TH IOCTL-XFS-RTGROUP-GEOMETRY 2 2022-08-18 "XFS"
+.SH NAME
+ioctl_xfs_rtgroup_geometry \- query XFS realtime group geometry information
+.SH SYNOPSIS
+.br
+.B #include <xfs/xfs_fs.h>
+.br
+.B #include <xfs/xfs_fs_staging.h>
+.PP
+.BI "int ioctl(int " fd ", XFS_IOC_RTGROUP_GEOMETRY, struct xfs_rtgroup_geometry *" arg );
+.SH DESCRIPTION
+This XFS ioctl retrieves the geometry information for a given realtime group.
+The geometry information is conveyed in a structure of the following form:
+.PP
+.in +4n
+.nf
+struct xfs_rtgroup_geometry {
+	__u32  rg_number;
+	__u32  rg_length;
+	__u32  rg_sick;
+	__u32  rg_checked;
+	__u32  rg_flags;
+	__u32  rg_pad;
+	__u64  rg_reserved[13];
+};
+.fi
+.in
+.TP
+.I rg_number
+The caller must set this field to the index of the realtime group that the
+caller wishes to learn about.
+.TP
+.I rg_length
+The length of the realtime group is returned in this field, in units of
+filesystem blocks.
+.I rg_flags
+The caller can set this field to change the operational behavior of the ioctl.
+Currently no flags are defined, so this field must be zero.
+.TP
+.IR rg_reserved " and " rg_pad
+All reserved fields will be set to zero on return.
+.PP
+The fields
+.IR rg_sick " and " rg_checked
+indicate the relative health of various realtime group metadata:
+.IP \[bu] 2
+If a given sick flag is set in
+.IR rg_sick ,
+then that piece of metadata has been observed to be damaged.
+The same bit will be set in
+.IR rg_checked .
+.IP \[bu]
+If a given sick flag is set in
+.I rg_checked
+and is not set in
+.IR rg_sick ,
+then that piece of metadata has been checked and is not faulty.
+.IP \[bu]
+If a given sick flag is not set in
+.IR rg_checked ,
+then no conclusion can be made.
+.PP
+The following flags apply to these fields:
+.RS 0.4i
+.TP
+.B XFS_RTGROUP_GEOM_SICK_SUPER
+Realtime group superblock.
+.TP
+.B XFS_RTGROUP_GEOM_SICK_BITMAP
+Realtime bitmap for this group.
+.RE
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B EFSBADCRC
+Metadata checksum validation failed while performing the query.
+.TP
+.B EFSCORRUPTED
+Metadata corruption was encountered while performing the query.
+.TP
+.B EINVAL
+The specified realtime group number is not valid for this filesystem.
+.TP
+.B EIO
+An I/O error was encountered while performing the query.
+.SH CONFORMING TO
+This API is specific to XFS filesystem on the Linux kernel.
+.SH SEE ALSO
+.BR ioctl (2)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 13/52] xfs: add block headers to realtime bitmap blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (16 preceding siblings ...)
  2023-12-31 23:50   ` [PATCH 12/52] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
@ 2023-12-31 23:50   ` Darrick J. Wong
  2023-12-31 23:51   ` [PATCH 14/52] xfs: encode the rtbitmap in little endian format Darrick J. Wong
                     ` (33 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:50 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Upgrade rtbitmap blocks to have self describing metadata like most every
other thing in XFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h   |    3 +
 libxfs/xfs_format.h   |   14 ++++++
 libxfs/xfs_ondisk.h   |    1 
 libxfs/xfs_rtbitmap.c |  106 +++++++++++++++++++++++++++++++++++++++++++++----
 libxfs/xfs_rtbitmap.h |   33 +++++++++++++++
 libxfs/xfs_sb.c       |   18 ++++++--
 libxfs/xfs_shared.h   |    1 
 7 files changed, 161 insertions(+), 15 deletions(-)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index 184e7a38fb3..ecee4ccc0b7 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -83,7 +83,8 @@ typedef struct xfs_mount {
 	int8_t			m_rgblklog;	/* log2 of rt group sz if possible */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
-	uint			m_blockwmask;	/* blockwsize-1 */
+	/* number of rt extents per rt bitmap block if rtgroups enabled */
+	unsigned int		m_rtx_per_rbmblock;
 	uint			m_alloc_mxr[2];	/* XFS_ALLOC_BLOCK_MAXRECS */
 	uint			m_alloc_mnr[2];	/* XFS_ALLOC_BLOCK_MINRECS */
 	uint			m_bmap_dmxr[2];	/* XFS_BMAP_BLOCK_DMAXRECS */
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 43e66740e2a..7178bd463c1 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1300,6 +1300,20 @@ static inline bool xfs_dinode_has_large_extent_counts(
 /*
  * RT bit manipulation macros.
  */
+#define XFS_RTBITMAP_MAGIC	0x424D505A	/* BMPZ */
+
+struct xfs_rtbuf_blkinfo {
+	__be32		rt_magic;	/* validity check on block */
+	__be32		rt_crc;		/* CRC of block */
+	__be64		rt_owner;	/* inode that owns the block */
+	__be64		rt_blkno;	/* first block of the buffer */
+	__be64		rt_lsn;		/* sequence number of last write */
+	uuid_t		rt_uuid;	/* filesystem we belong to */
+};
+
+#define XFS_RTBUF_CRC_OFF \
+	offsetof(struct xfs_rtbuf_blkinfo, rt_crc)
+
 #define	XFS_RTMIN(a,b)	((a) < (b) ? (a) : (b))
 #define	XFS_RTMAX(a,b)	((a) > (b) ? (a) : (b))
 
diff --git a/libxfs/xfs_ondisk.h b/libxfs/xfs_ondisk.h
index 65219d4cf99..70b96efa269 100644
--- a/libxfs/xfs_ondisk.h
+++ b/libxfs/xfs_ondisk.h
@@ -76,6 +76,7 @@ xfs_check_ondisk_structs(void)
 	/* realtime structures */
 	XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw,		4);
 	XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw,		4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo,		48);
 
 	/*
 	 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index c2970b78ce8..3aa2c163725 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -21,23 +21,77 @@
  * Realtime allocator bitmap functions shared with userspace.
  */
 
-/*
- * Real time buffers need verifiers to avoid runtime warnings during IO.
- * We don't have anything to verify, however, so these are just dummy
- * operations.
- */
+static xfs_failaddr_t
+xfs_rtbuf_verify(
+	struct xfs_buf			*bp)
+{
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+	if (!xfs_verify_magic(bp, hdr->rt_magic))
+		return __this_address;
+	if (!xfs_has_rtgroups(mp))
+		return __this_address;
+	if (!xfs_has_crc(mp))
+		return __this_address;
+	if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid))
+		return __this_address;
+	if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+		return __this_address;
+	return NULL;
+}
+
 static void
 xfs_rtbuf_verify_read(
-	struct xfs_buf	*bp)
+	struct xfs_buf			*bp)
 {
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+	xfs_failaddr_t			fa;
+
+	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+		return;
+
+	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
+		fa = __this_address;
+		goto fail;
+	}
+
+	if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) {
+		fa = __this_address;
+		goto fail;
+	}
+
+	fa = xfs_rtbuf_verify(bp);
+	if (fa)
+		goto fail;
+
 	return;
+fail:
+	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 }
 
 static void
 xfs_rtbuf_verify_write(
 	struct xfs_buf	*bp)
 {
-	return;
+	struct xfs_mount		*mp = bp->b_mount;
+	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+	struct xfs_buf_log_item		*bip = bp->b_log_item;
+	xfs_failaddr_t			fa;
+
+	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+		return;
+
+	fa = xfs_rtbuf_verify(bp);
+	if (fa) {
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+		return;
+	}
+
+	if (bip)
+		hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_rtbuf_ops = {
@@ -46,6 +100,14 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
 	.verify_write = xfs_rtbuf_verify_write,
 };
 
+const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
+	.name		= "xfs_rtbitmap",
+	.magic		= { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) },
+	.verify_read	= xfs_rtbuf_verify_read,
+	.verify_write	= xfs_rtbuf_verify_write,
+	.verify_struct	= xfs_rtbuf_verify,
+};
+
 /* Release cached rt bitmap and summary buffers. */
 void
 xfs_rtbuf_cache_relse(
@@ -123,13 +185,26 @@ xfs_rtbuf_get(
 	ASSERT(map.br_startblock != NULLFSBLOCK);
 	error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
 				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-				   mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+				   mp->m_bsize, 0, &bp,
+				   xfs_rtblock_ops(mp, issum));
 	if (xfs_metadata_is_sick(error))
 		xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
 					     XFS_SICK_RT_BITMAP);
 	if (error)
 		return error;
 
+	if (xfs_has_rtgroups(mp) && !issum) {
+		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+
+		if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
+			xfs_buf_mark_corrupt(bp);
+			xfs_trans_brelse(args->tp, bp);
+			xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+						     XFS_SICK_RT_BITMAP);
+			return -EFSCORRUPTED;
+		}
+	}
+
 	xfs_trans_buf_set_type(args->tp, bp, type);
 	*cbpp = bp;
 	*coffp = block;
@@ -1123,6 +1198,19 @@ xfs_rtalloc_extent_is_free(
 	return 0;
 }
 
+/* Compute the number of rt extents tracked by a single bitmap block. */
+xfs_rtxnum_t
+xfs_rtbitmap_rtx_per_rbmblock(
+	struct xfs_mount	*mp)
+{
+	unsigned int		rbmblock_bytes = mp->m_sb.sb_blocksize;
+
+	if (xfs_has_rtgroups(mp))
+		rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+	return rbmblock_bytes * NBBY;
+}
+
 /*
  * Compute the number of rtbitmap blocks needed to track the given number of rt
  * extents.
@@ -1132,7 +1220,7 @@ xfs_rtbitmap_blockcount(
 	struct xfs_mount	*mp,
 	xfs_rtbxlen_t		rtextents)
 {
-	return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+	return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
 }
 
 /*
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index 3de0ec2d241..e1cf47d2a92 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -148,6 +148,9 @@ xfs_rtx_to_rbmblock(
 	struct xfs_mount	*mp,
 	xfs_rtxnum_t		rtx)
 {
+	if (xfs_has_rtgroups(mp))
+		return div_u64(rtx, mp->m_rtx_per_rbmblock);
+
 	return rtx >> mp->m_blkbit_log;
 }
 
@@ -157,6 +160,13 @@ xfs_rtx_to_rbmword(
 	struct xfs_mount	*mp,
 	xfs_rtxnum_t		rtx)
 {
+	if (xfs_has_rtgroups(mp)) {
+		unsigned int	mod;
+
+		div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod);
+		return mod;
+	}
+
 	return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
 }
 
@@ -166,6 +176,9 @@ xfs_rbmblock_to_rtx(
 	struct xfs_mount	*mp,
 	xfs_fileoff_t		rbmoff)
 {
+	if (xfs_has_rtgroups(mp))
+		return rbmoff * mp->m_rtx_per_rbmblock;
+
 	return rbmoff << mp->m_blkbit_log;
 }
 
@@ -175,7 +188,14 @@ xfs_rbmblock_wordptr(
 	struct xfs_rtalloc_args	*args,
 	unsigned int		index)
 {
-	union xfs_rtword_raw	*words = args->rbmbp->b_addr;
+	struct xfs_mount	*mp = args->mp;
+	union xfs_rtword_raw	*words;
+	struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr;
+
+	if (xfs_has_rtgroups(mp))
+		words = (union xfs_rtword_raw *)(hdr + 1);
+	else
+		words = args->rbmbp->b_addr;
 
 	return words + index;
 }
@@ -277,6 +297,16 @@ xfs_suminfo_add(
 	return info->old;
 }
 
+static inline const struct xfs_buf_ops *
+xfs_rtblock_ops(
+	struct xfs_mount	*mp,
+	bool			issum)
+{
+	if (xfs_has_rtgroups(mp) && !issum)
+		return &xfs_rtbitmap_buf_ops;
+	return &xfs_rtbuf_ops;
+}
+
 /*
  * Functions for walking free space rtextents in the realtime bitmap.
  */
@@ -365,6 +395,7 @@ xfs_validate_rtextents(
 	return true;
 }
 
+xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp);
 xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
 		rtextents);
 unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 0ded5220161..d04a8e15331 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -575,10 +575,15 @@ xfs_validate_sb_common(
 	} else {
 		uint64_t	rexts;
 		uint64_t	rbmblocks;
+		unsigned int	rbmblock_bytes = sbp->sb_blocksize;
 
 		rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
-		rbmblocks = howmany_64(sbp->sb_rextents,
-				       NBBY * sbp->sb_blocksize);
+
+		if (xfs_sb_is_v5(sbp) &&
+		    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
+			rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+		rbmblocks = howmany_64(sbp->sb_rextents, NBBY * rbmblock_bytes);
 
 		if (!xfs_validate_rtextents(rexts) ||
 		    sbp->sb_rextents != rexts ||
@@ -1091,8 +1096,13 @@ xfs_sb_mount_common(
 	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
 	mp->m_blockmask = sbp->sb_blocksize - 1;
-	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
-	mp->m_blockwmask = mp->m_blockwsize - 1;
+	if (xfs_has_rtgroups(mp))
+		mp->m_blockwsize = (sbp->sb_blocksize -
+					sizeof(struct xfs_rtbuf_blkinfo)) >>
+					XFS_WORDLOG;
+	else
+		mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+	mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG;
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
 	mp->m_rgblklog = log2_if_power2(sbp->sb_rgblocks);
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index 65691af0488..f57788164a7 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 14/52] xfs: encode the rtbitmap in little endian format
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (17 preceding siblings ...)
  2023-12-31 23:50   ` [PATCH 13/52] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
@ 2023-12-31 23:51   ` Darrick J. Wong
  2023-12-31 23:51   ` [PATCH 15/52] xfs: add block headers to realtime summary blocks Darrick J. Wong
                     ` (32 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:51 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the ondisk realtime bitmap file is accessed in units of
32-bit words.  There's no endian translation of the contents of this
file, which means that the Bad Things Happen(tm) if you go from (say)
x86 to powerpc.  Since we have a new feature flag, let's take the
opportunity to enforce an endianness on the file.

The natural format of a bitmap is (IMHO) little endian, because the byte
offsets of the bitmap data should always increase in step with the
information being indexed.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h   |    4 +++-
 libxfs/xfs_rtbitmap.h |    7 ++++++-
 repair/rt.c           |    5 ++++-
 3 files changed, 13 insertions(+), 3 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 7178bd463c1..77422fbd337 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -749,10 +749,12 @@ struct xfs_agfl {
 
 /*
  * Realtime bitmap information is accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format.  Starting with the realtime groups feature,
+ * the words are stored in le32 ondisk.
  */
 union xfs_rtword_raw {
 	__u32		old;
+	__le32		rtg;
 };
 
 /*
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index e1cf47d2a92..588689e53ba 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -208,6 +208,8 @@ xfs_rtbitmap_getword(
 {
 	union xfs_rtword_raw	*word = xfs_rbmblock_wordptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp))
+		return le32_to_cpu(word->rtg);
 	return word->old;
 }
 
@@ -220,7 +222,10 @@ xfs_rtbitmap_setword(
 {
 	union xfs_rtword_raw	*word = xfs_rbmblock_wordptr(args, index);
 
-	word->old = value;
+	if (xfs_has_rtgroups(args->mp))
+		word->rtg = cpu_to_le32(value);
+	else
+		word->old = value;
 }
 
 /*
diff --git a/repair/rt.c b/repair/rt.c
index e49487829af..c2b2c25ff79 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -49,7 +49,10 @@ set_rtword(
 	union xfs_rtword_raw	*word,
 	xfs_rtword_t		value)
 {
-	word->old = value;
+	if (xfs_has_rtgroups(mp))
+		word->rtg = cpu_to_le32(value);
+	else
+		word->old = value;
 }
 
 static inline void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 15/52] xfs: add block headers to realtime summary blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (18 preceding siblings ...)
  2023-12-31 23:51   ` [PATCH 14/52] xfs: encode the rtbitmap in little endian format Darrick J. Wong
@ 2023-12-31 23:51   ` Darrick J. Wong
  2023-12-31 23:51   ` [PATCH 16/52] xfs: encode the rtsummary in big endian format Darrick J. Wong
                     ` (31 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:51 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Upgrade rtsummary blocks to have self describing metadata like most
every other thing in XFS.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h   |    1 +
 libxfs/xfs_rtbitmap.c |   18 +++++++++++++++---
 libxfs/xfs_rtbitmap.h |   19 +++++++++++++++++--
 libxfs/xfs_shared.h   |    1 +
 4 files changed, 34 insertions(+), 5 deletions(-)


diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 77422fbd337..f2c70e1027e 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -1303,6 +1303,7 @@ static inline bool xfs_dinode_has_large_extent_counts(
  * RT bit manipulation macros.
  */
 #define XFS_RTBITMAP_MAGIC	0x424D505A	/* BMPZ */
+#define XFS_RTSUMMARY_MAGIC	0x53554D59	/* SUMY */
 
 struct xfs_rtbuf_blkinfo {
 	__be32		rt_magic;	/* validity check on block */
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index 3aa2c163725..7be1c0bdbea 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -49,7 +49,7 @@ xfs_rtbuf_verify_read(
 	struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
 	xfs_failaddr_t			fa;
 
-	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+	if (!xfs_has_rtgroups(mp))
 		return;
 
 	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
@@ -80,7 +80,7 @@ xfs_rtbuf_verify_write(
 	struct xfs_buf_log_item		*bip = bp->b_log_item;
 	xfs_failaddr_t			fa;
 
-	if (!xfs_has_rtgroups(mp) || bp->b_ops != &xfs_rtbitmap_buf_ops)
+	if (!xfs_has_rtgroups(mp))
 		return;
 
 	fa = xfs_rtbuf_verify(bp);
@@ -108,6 +108,14 @@ const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
 	.verify_struct	= xfs_rtbuf_verify,
 };
 
+const struct xfs_buf_ops xfs_rtsummary_buf_ops = {
+	.name		= "xfs_rtsummary",
+	.magic		= { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) },
+	.verify_read	= xfs_rtbuf_verify_read,
+	.verify_write	= xfs_rtbuf_verify_write,
+	.verify_struct	= xfs_rtbuf_verify,
+};
+
 /* Release cached rt bitmap and summary buffers. */
 void
 xfs_rtbuf_cache_relse(
@@ -193,7 +201,7 @@ xfs_rtbuf_get(
 	if (error)
 		return error;
 
-	if (xfs_has_rtgroups(mp) && !issum) {
+	if (xfs_has_rtgroups(mp)) {
 		struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
 
 		if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
@@ -1273,6 +1281,10 @@ xfs_rtsummary_blockcount(
 	unsigned long long	rsumwords;
 
 	rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+
+	if (xfs_has_rtgroups(mp))
+		return howmany_64(rsumwords, mp->m_blockwsize);
+
 	return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
 }
 
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index 588689e53ba..e8558db14b9 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -250,6 +250,9 @@ xfs_rtsumoffs_to_block(
 	struct xfs_mount	*mp,
 	xfs_rtsumoff_t		rsumoff)
 {
+	if (xfs_has_rtgroups(mp))
+		return rsumoff / mp->m_blockwsize;
+
 	return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
 }
 
@@ -264,6 +267,9 @@ xfs_rtsumoffs_to_infoword(
 {
 	unsigned int		mask = mp->m_blockmask >> XFS_SUMINFOLOG;
 
+	if (xfs_has_rtgroups(mp))
+		return rsumoff % mp->m_blockwsize;
+
 	return rsumoff & mask;
 }
 
@@ -273,7 +279,13 @@ xfs_rsumblock_infoptr(
 	struct xfs_rtalloc_args	*args,
 	unsigned int		index)
 {
-	union xfs_suminfo_raw	*info = args->sumbp->b_addr;
+	union xfs_suminfo_raw	*info;
+	struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr;
+
+	if (xfs_has_rtgroups(args->mp))
+		info = (union xfs_suminfo_raw *)(hdr + 1);
+	else
+		info = args->sumbp->b_addr;
 
 	return info + index;
 }
@@ -307,8 +319,11 @@ xfs_rtblock_ops(
 	struct xfs_mount	*mp,
 	bool			issum)
 {
-	if (xfs_has_rtgroups(mp) && !issum)
+	if (xfs_has_rtgroups(mp)) {
+		if (issum)
+			return &xfs_rtsummary_buf_ops;
 		return &xfs_rtbitmap_buf_ops;
+	}
 	return &xfs_rtbuf_ops;
 }
 
diff --git a/libxfs/xfs_shared.h b/libxfs/xfs_shared.h
index f57788164a7..8ad4b67d6fe 100644
--- a/libxfs/xfs_shared.h
+++ b/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
+extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 16/52] xfs: encode the rtsummary in big endian format
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (19 preceding siblings ...)
  2023-12-31 23:51   ` [PATCH 15/52] xfs: add block headers to realtime summary blocks Darrick J. Wong
@ 2023-12-31 23:51   ` Darrick J. Wong
  2023-12-31 23:51   ` [PATCH 17/52] xfs: store rtgroup information with a bmap intent Darrick J. Wong
                     ` (30 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:51 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Currently, the ondisk realtime summary file counters are accessed in
units of 32-bit words.  There's no endian translation of the contents of
this file, which means that the Bad Things Happen(tm) if you go from
(say) x86 to powerpc.  Since we have a new feature flag, let's take the
opportunity to enforce an endianness on the file.  Encode the summary
information in big endian format, like most of the rest of the
filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c            |    7 ++++++-
 libxfs/xfs_format.h   |    4 +++-
 libxfs/xfs_rtbitmap.h |    7 +++++++
 repair/rt.c           |    5 ++++-
 4 files changed, 20 insertions(+), 3 deletions(-)


diff --git a/db/check.c b/db/check.c
index 190565074b6..20de9c74d4a 100644
--- a/db/check.c
+++ b/db/check.c
@@ -1709,6 +1709,8 @@ get_suminfo(
 	struct xfs_mount	*mp,
 	union xfs_suminfo_raw	*raw)
 {
+	if (xfs_has_rtgroups(mp))
+		return be32_to_cpu(raw->rtg);
 	return raw->old;
 }
 
@@ -3625,7 +3627,10 @@ inc_sumcount(
 {
 	union xfs_suminfo_raw	*p = info + index;
 
-	p->old++;
+	if (xfs_has_rtgroups(mp))
+		be32_add_cpu(&p->rtg, 1);
+	else
+		p->old++;
 }
 
 static void
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index f2c70e1027e..59ba13db53e 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -759,10 +759,12 @@ union xfs_rtword_raw {
 
 /*
  * Realtime summary counts are accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format.  Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
  */
 union xfs_suminfo_raw {
 	__u32		old;
+	__be32		rtg;
 };
 
 /*
diff --git a/libxfs/xfs_rtbitmap.h b/libxfs/xfs_rtbitmap.h
index e8558db14b9..0bb63f77ede 100644
--- a/libxfs/xfs_rtbitmap.h
+++ b/libxfs/xfs_rtbitmap.h
@@ -298,6 +298,8 @@ xfs_suminfo_get(
 {
 	union xfs_suminfo_raw	*info = xfs_rsumblock_infoptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp))
+		return be32_to_cpu(info->rtg);
 	return info->old;
 }
 
@@ -310,6 +312,11 @@ xfs_suminfo_add(
 {
 	union xfs_suminfo_raw	*info = xfs_rsumblock_infoptr(args, index);
 
+	if (xfs_has_rtgroups(args->mp)) {
+		be32_add_cpu(&info->rtg, delta);
+		return be32_to_cpu(info->rtg);
+	}
+
 	info->old += delta;
 	return info->old;
 }
diff --git a/repair/rt.c b/repair/rt.c
index c2b2c25ff79..ded968b7d82 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -63,7 +63,10 @@ inc_sumcount(
 {
 	union xfs_suminfo_raw	*p = info + index;
 
-	p->old++;
+	if (xfs_has_rtgroups(mp))
+		be32_add_cpu(&p->rtg, 1);
+	else
+		p->old++;
 }
 
 /*


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 17/52] xfs: store rtgroup information with a bmap intent
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (20 preceding siblings ...)
  2023-12-31 23:51   ` [PATCH 16/52] xfs: encode the rtsummary in big endian format Darrick J. Wong
@ 2023-12-31 23:51   ` Darrick J. Wong
  2023-12-31 23:52   ` [PATCH 18/52] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
                     ` (29 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:51 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Make the bmap intent items take an active reference to the rtgroup
containing the space that is being mapped or unmapped.  We will need
this functionality once we start enabling rmap and reflink on the rt
volume.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/defer_item.c |   17 +++++++++++++++--
 libxfs/xfs_bmap.h   |    5 ++++-
 2 files changed, 19 insertions(+), 3 deletions(-)


diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index 49e6cf02dc8..fbd035d9b94 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -492,8 +492,18 @@ xfs_bmap_update_get_group(
 {
 	xfs_agnumber_t		agno;
 
-	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
+		if (xfs_has_rtgroups(mp)) {
+			xfs_rgnumber_t	rgno;
+
+			rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
+			bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+		} else {
+			bi->bi_rtg = NULL;
+		}
+
 		return;
+	}
 
 	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
 
@@ -524,8 +534,11 @@ static inline void
 xfs_bmap_update_put_group(
 	struct xfs_bmap_intent	*bi)
 {
-	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
+		if (xfs_has_rtgroups(bi->bi_owner->i_mount))
+			xfs_rtgroup_put(bi->bi_rtg);
 		return;
+	}
 
 	xfs_perag_intent_put(bi->bi_pag);
 }
diff --git a/libxfs/xfs_bmap.h b/libxfs/xfs_bmap.h
index bd7f936262c..61c195198db 100644
--- a/libxfs/xfs_bmap.h
+++ b/libxfs/xfs_bmap.h
@@ -246,7 +246,10 @@ struct xfs_bmap_intent {
 	enum xfs_bmap_intent_type		bi_type;
 	int					bi_whichfork;
 	struct xfs_inode			*bi_owner;
-	struct xfs_perag			*bi_pag;
+	union {
+		struct xfs_perag		*bi_pag;
+		struct xfs_rtgroup		*bi_rtg;
+	};
 	struct xfs_bmbt_irec			bi_bmap;
 };
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 18/52] xfs: use an incore rtgroup rotor for rtpick
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (21 preceding siblings ...)
  2023-12-31 23:51   ` [PATCH 17/52] xfs: store rtgroup information with a bmap intent Darrick J. Wong
@ 2023-12-31 23:52   ` Darrick J. Wong
  2023-12-31 23:52   ` [PATCH 19/52] xfs: scrub the realtime group superblock Darrick J. Wong
                     ` (28 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

During the 6.7 merge window, Linus noticed that the realtime allocator
was doing some sketchy things trying to encode a u64 sequence counter
into the rtbitmap file's atime.  The sketchy casting of a struct pointer
to a u64 pointer has subtly broken several times over the past decade as
the codebase has transitioned to using the VFS i_atime field and that
field has changed in size and layout over time.

Since the goal of the rtpick code is to _suggest_ a starting place for
new rt file allocations, the repeated breakage has not resulted in
inconsistent metadata.  IOWs, it's a hint.

For rtgroups, we don't need this complex code to cut the rtextents space
into fractions.  Add an rtgroup rotor and use that for rtpick, similar
to AG rotoring on the data device.  The new rotor does not persist,
which reduces the logging overhead slightly.

Link: https://lore.kernel.org/linux-xfs/CAHk-=wj3oM3d-Hw2vvxys3KCZ9De+gBN7Gxr2jf96OTisL9udw@mail.gmail.com/
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_mount.h   |    1 +
 libxfs/xfs_rtbitmap.c |   12 ++++++++----
 mkfs/proto.c          |    3 ++-
 3 files changed, 11 insertions(+), 5 deletions(-)


diff --git a/include/xfs_mount.h b/include/xfs_mount.h
index ecee4ccc0b7..a2fdd7c2f14 100644
--- a/include/xfs_mount.h
+++ b/include/xfs_mount.h
@@ -52,6 +52,7 @@ typedef struct xfs_mount {
 	char			*m_fsname;	/* filesystem name */
 	int			m_bsize;	/* fs logical block size */
 	spinlock_t		m_agirotor_lock;
+	xfs_rgnumber_t		m_rtgrotor;	/* last rtgroup rtpicked */
 	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
 	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
 	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
diff --git a/libxfs/xfs_rtbitmap.c b/libxfs/xfs_rtbitmap.c
index 7be1c0bdbea..7d29a72be6b 100644
--- a/libxfs/xfs_rtbitmap.c
+++ b/libxfs/xfs_rtbitmap.c
@@ -1060,10 +1060,14 @@ xfs_rtfree_extent(
 		if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
 			mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
 
-		atime = inode_get_atime(VFS_I(mp->m_rbmip));
-		atime.tv_sec = 0;
-		inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
-		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		if (xfs_has_rtgroups(mp)) {
+			mp->m_rtgrotor = 0;
+		} else {
+			atime = inode_get_atime(VFS_I(mp->m_rbmip));
+			atime.tv_sec = 0;
+			inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
+			xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		}
 	}
 	error = 0;
 out:
diff --git a/mkfs/proto.c b/mkfs/proto.c
index 33d454cffb2..36df148018f 100644
--- a/mkfs/proto.c
+++ b/mkfs/proto.c
@@ -810,7 +810,8 @@ rtbitmap_create(
 
 	rbmip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
 	rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
-	inode_set_atime(VFS_I(rbmip), 0, 0);
+	if (!xfs_has_rtgroups(mp))
+		inode_set_atime(VFS_I(rbmip), 0, 0);
 	libxfs_trans_log_inode(upd.tp, rbmip, XFS_ILOG_CORE);
 
 	error = -libxfs_imeta_commit_update(&upd);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 19/52] xfs: scrub the realtime group superblock
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (22 preceding siblings ...)
  2023-12-31 23:52   ` [PATCH 18/52] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
@ 2023-12-31 23:52   ` Darrick J. Wong
  2023-12-31 23:52   ` [PATCH 20/52] xfs: repair secondary realtime group superblocks Darrick J. Wong
                     ` (27 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Enable scrubbing of realtime group superblocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c                     |    5 +++++
 libfrog/scrub.h                     |    1 +
 libxfs/xfs_fs.h                     |    3 ++-
 man/man2/ioctl_xfs_scrub_metadata.2 |    9 +++++++++
 scrub/repair.c                      |    1 +
 scrub/scrub.c                       |    3 +++
 6 files changed, 21 insertions(+), 1 deletion(-)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index f48bb1a0d2b..a9aad03de0d 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -159,6 +159,11 @@ const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR] = {
 		.descr	= "metadata directory paths",
 		.group	= XFROG_SCRUB_GROUP_METAPATH,
 	},
+	[XFS_SCRUB_TYPE_RGSUPER] = {
+		.name	= "rgsuper",
+		.descr	= "realtime group superblock",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
diff --git a/libfrog/scrub.h b/libfrog/scrub.h
index 5fa0fafef56..afab2b07fc4 100644
--- a/libfrog/scrub.h
+++ b/libfrog/scrub.h
@@ -16,6 +16,7 @@ enum xfrog_scrub_group {
 	XFROG_SCRUB_GROUP_ISCAN,	/* metadata requiring full inode scan */
 	XFROG_SCRUB_GROUP_SUMMARY,	/* summary metadata */
 	XFROG_SCRUB_GROUP_METAPATH,	/* metadata directory path */
+	XFROG_SCRUB_GROUP_RTGROUP,	/* per-rtgroup metadata */
 };
 
 /* Catalog of scrub types and names, indexed by XFS_SCRUB_TYPE_* */
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index c5bf53c6a43..237d13a500d 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -735,9 +735,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_HEALTHY	27	/* everything checked out ok */
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
+#define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	30
+#define XFS_SCRUB_TYPE_NR	31
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/man/man2/ioctl_xfs_scrub_metadata.2 b/man/man2/ioctl_xfs_scrub_metadata.2
index b1db740560d..13f655e2b97 100644
--- a/man/man2/ioctl_xfs_scrub_metadata.2
+++ b/man/man2/ioctl_xfs_scrub_metadata.2
@@ -88,6 +88,15 @@ The allocation group number must be given in
 .BR sm_ino " and " sm_gen
 must be zero.
 
+.PP
+.TP
+.B XFS_SCRUB_TYPE_RGSUPER
+Examine a given realtime allocation group's superblock.
+The realtime allocation group number must be given in
+.IR sm_agno "."
+.IR sm_ino " and " sm_gen
+must be zero.
+
 .TP
 .B XFS_SCRUB_TYPE_INODE
 Examine a given inode record for obviously incorrect values and
diff --git a/scrub/repair.c b/scrub/repair.c
index 7e131001e13..43037a7c5e1 100644
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -545,6 +545,7 @@ repair_item_difficulty(
 		case XFS_SCRUB_TYPE_REFCNTBT:
 		case XFS_SCRUB_TYPE_RTBITMAP:
 		case XFS_SCRUB_TYPE_RTSUM:
+		case XFS_SCRUB_TYPE_RGSUPER:
 			ret |= REPAIR_DIFFICULTY_PRIMARY;
 			break;
 		}
diff --git a/scrub/scrub.c b/scrub/scrub.c
index bad1384dcfb..8f9fde80263 100644
--- a/scrub/scrub.c
+++ b/scrub/scrub.c
@@ -98,6 +98,9 @@ format_scrubv_descr(
 		return snprintf(buf, buflen, _("%s"), _(sc->descr));
 	case XFROG_SCRUB_GROUP_METAPATH:
 		return format_metapath_descr(buf, buflen, vhead);
+	case XFROG_SCRUB_GROUP_RTGROUP:
+		return snprintf(buf, buflen, _("rtgroup %u %s"),
+				vhead->svh_agno, _(sc->descr));
 	}
 	return -1;
 }


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 20/52] xfs: repair secondary realtime group superblocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (23 preceding siblings ...)
  2023-12-31 23:52   ` [PATCH 19/52] xfs: scrub the realtime group superblock Darrick J. Wong
@ 2023-12-31 23:52   ` Darrick J. Wong
  2023-12-31 23:53   ` [PATCH 21/52] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
                     ` (26 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:52 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Repair secondary realtime group superblocks.  They're not critical for
anything, but some consistency would be a good idea.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_rtgroup.c |    2 +-
 libxfs/xfs_rtgroup.h |    3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)


diff --git a/libxfs/xfs_rtgroup.c b/libxfs/xfs_rtgroup.c
index c503a39b0fc..4ef6b9e9094 100644
--- a/libxfs/xfs_rtgroup.c
+++ b/libxfs/xfs_rtgroup.c
@@ -418,7 +418,7 @@ xfs_rtgroup_log_super(
 }
 
 /* Initialize a secondary realtime superblock. */
-static int
+int
 xfs_rtgroup_init_secondary_super(
 	struct xfs_mount	*mp,
 	xfs_rgnumber_t		rgno,
diff --git a/libxfs/xfs_rtgroup.h b/libxfs/xfs_rtgroup.h
index e6d60425faa..0a63f14b5aa 100644
--- a/libxfs/xfs_rtgroup.h
+++ b/libxfs/xfs_rtgroup.h
@@ -221,6 +221,8 @@ void xfs_rtgroup_update_super(struct xfs_buf *rtsb_bp,
 		const struct xfs_buf *sb_bp);
 void xfs_rtgroup_log_super(struct xfs_trans *tp, const struct xfs_buf *sb_bp);
 int xfs_rtgroup_update_secondary_sbs(struct xfs_mount *mp);
+int xfs_rtgroup_init_secondary_super(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+		struct xfs_buf **bpp);
 
 /* Lock the rt bitmap inode in exclusive mode */
 #define XFS_RTGLOCK_BITMAP		(1U << 0)
@@ -241,6 +243,7 @@ int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
 # define xfs_rtgroup_update_super(bp, sb_bp)	((void)0)
 # define xfs_rtgroup_log_super(tp, sb_bp)	((void)0)
 # define xfs_rtgroup_update_secondary_sbs(mp)	(0)
+# define xfs_rtgroup_init_secondary_super(mp, rgno, bpp)	(-EOPNOTSUPP)
 # define xfs_rtgroup_lock(tp, rtg, gf)		((void)0)
 # define xfs_rtgroup_unlock(rtg, gf)		((void)0)
 # define xfs_rtgroup_get_geometry(rtg, rgeo)	(-EOPNOTSUPP)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 21/52] xfs: scrub each rtgroup's portion of the rtbitmap separately
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (24 preceding siblings ...)
  2023-12-31 23:52   ` [PATCH 20/52] xfs: repair secondary realtime group superblocks Darrick J. Wong
@ 2023-12-31 23:53   ` Darrick J. Wong
  2023-12-31 23:53   ` [PATCH 22/52] libfrog: report rt groups in output Darrick J. Wong
                     ` (25 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new scrub type code so that userspace can scrub each rtgroup's
portion of the rtbitmap file separately.  This reduces the long tail
latency that results from scanning the entire bitmap all at once, and
prepares us for future patchsets, wherein we'll need to be able to lock
a specific rtgroup so that we can rebuild that rtgroup's part of the
rtbitmap contents from the rtgroup's rmap btree.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/scrub.c                     |    5 +++++
 libxfs/xfs_fs.h                     |    3 ++-
 man/man2/ioctl_xfs_scrub_metadata.2 |   12 ++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)


diff --git a/libfrog/scrub.c b/libfrog/scrub.c
index a9aad03de0d..b22770d639c 100644
--- a/libfrog/scrub.c
+++ b/libfrog/scrub.c
@@ -164,6 +164,11 @@ const struct xfrog_scrub_descr xfrog_scrubbers[XFS_SCRUB_TYPE_NR] = {
 		.descr	= "realtime group superblock",
 		.group	= XFROG_SCRUB_GROUP_RTGROUP,
 	},
+	[XFS_SCRUB_TYPE_RGBITMAP] = {
+		.name	= "rgbitmap",
+		.descr	= "realtime group bitmap",
+		.group	= XFROG_SCRUB_GROUP_RTGROUP,
+	},
 };
 
 const struct xfrog_scrub_descr xfrog_metapaths[XFS_SCRUB_METAPATH_NR] = {
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index 237d13a500d..102b9273360 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -736,9 +736,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_DIRTREE	28	/* directory tree structure */
 #define XFS_SCRUB_TYPE_METAPATH	29	/* metadata directory tree paths */
 #define XFS_SCRUB_TYPE_RGSUPER	30	/* realtime superblock */
+#define XFS_SCRUB_TYPE_RGBITMAP	31	/* realtime group bitmap */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	31
+#define XFS_SCRUB_TYPE_NR	32
 
 /*
  * This special type code only applies to the vectored scrub implementation.
diff --git a/man/man2/ioctl_xfs_scrub_metadata.2 b/man/man2/ioctl_xfs_scrub_metadata.2
index 13f655e2b97..dc439897c98 100644
--- a/man/man2/ioctl_xfs_scrub_metadata.2
+++ b/man/man2/ioctl_xfs_scrub_metadata.2
@@ -97,6 +97,18 @@ The realtime allocation group number must be given in
 .IR sm_ino " and " sm_gen
 must be zero.
 
+.PP
+.TP
+.B XFS_SCRUB_TYPE_RGBITMAP
+Examine a given realtime allocation group's free space bitmap.
+Records are checked for obviously incorrect values and cross-referenced
+with other allocation group metadata records to ensure that there are no
+conflicts.
+The realtime allocation group number must be given in
+.IR sm_agno "."
+.IR sm_ino " and " sm_gen
+must be zero.
+
 .TP
 .B XFS_SCRUB_TYPE_INODE
 Examine a given inode record for obviously incorrect values and


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 22/52] libfrog: report rt groups in output
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (25 preceding siblings ...)
  2023-12-31 23:53   ` [PATCH 21/52] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
@ 2023-12-31 23:53   ` Darrick J. Wong
  2023-12-31 23:53   ` [PATCH 23/52] libxfs: implement some sanity checking for enormous rgcount Darrick J. Wong
                     ` (24 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report realtime group geometry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libfrog/fsgeom.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)


diff --git a/libfrog/fsgeom.c b/libfrog/fsgeom.c
index e2732fb7a09..b9618661427 100644
--- a/libfrog/fsgeom.c
+++ b/libfrog/fsgeom.c
@@ -65,7 +65,8 @@ xfs_report_geom(
 "naming   =version %-14u bsize=%-6u ascii-ci=%d, ftype=%d, parent=%d\n"
 "log      =%-22s bsize=%-6d blocks=%u, version=%d\n"
 "         =%-22s sectsz=%-5u sunit=%d blks, lazy-count=%d\n"
-"realtime =%-22s extsz=%-6d blocks=%lld, rtextents=%lld\n"),
+"realtime =%-22s extsz=%-6d blocks=%lld, rtextents=%lld\n"
+"         =%-22s rgcount=%-4d rgsize=%u blks\n"),
 		mntpoint, geo->inodesize, geo->agcount, geo->agblocks,
 		"", geo->sectsize, attrversion, projid32bit,
 		"", crcs_enabled, finobt_enabled, spinodes, rmapbt_enabled,
@@ -80,7 +81,8 @@ xfs_report_geom(
 		"", geo->logsectsize, geo->logsunit / geo->blocksize, lazycount,
 		!geo->rtblocks ? _("none") : rtname ? rtname : _("external"),
 		geo->rtextsize * geo->blocksize, (unsigned long long)geo->rtblocks,
-			(unsigned long long)geo->rtextents);
+			(unsigned long long)geo->rtextents,
+		"", geo->rgcount, geo->rgblocks);
 }
 
 /* Try to obtain the xfs geometry.  On error returns a negative error code. */


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 23/52] libxfs: implement some sanity checking for enormous rgcount
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (26 preceding siblings ...)
  2023-12-31 23:53   ` [PATCH 22/52] libfrog: report rt groups in output Darrick J. Wong
@ 2023-12-31 23:53   ` Darrick J. Wong
  2023-12-31 23:53   ` [PATCH 24/52] xfs_repair: support realtime groups Darrick J. Wong
                     ` (23 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Similar to what we do for suspiciously large sb_agcount values, if
someone tries to get libxfs to load a filesystem with a very large
realtime group count, let's do some basic checks of the rt device to
see if it's really that large.  If the read fails, only load the first
rtgroup and warn the user.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c |   47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)


diff --git a/libxfs/init.c b/libxfs/init.c
index f1ba28a3ca3..0332a4eeb21 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
@@ -725,6 +725,50 @@ xfs_set_low_space_thresholds(
 		mp->m_low_space[i] = dblocks * (i + 1);
 }
 
+/*
+ * libxfs_initialize_rtgroup will allocate a rtgroup structure for each
+ * rtgroup.  If rgcount is corrupted and insanely high, this will OOM the box.
+ * Try to read what would be the last rtgroup superblock.  If that fails, read
+ * the first one and let the user know to check the geometry.
+ */
+static inline bool
+check_many_rtgroups(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*sbp)
+{
+	struct xfs_buf		*bp;
+	xfs_rtblock_t		rtbno;
+	int			error;
+
+	if (!mp->m_rtdev->bt_bdev) {
+		fprintf(stderr, _("%s: no rt device, ignoring rgcount %u\n"),
+				progname, sbp->sb_rgcount);
+		if (!xfs_is_debugger(mp))
+			return false;
+
+		sbp->sb_rgcount = 0;
+		return true;
+	}
+
+	rtbno = xfs_rgbno_to_rtb(mp, sbp->sb_rgcount - 1, 0);
+
+	error = libxfs_buf_read(mp->m_rtdev, xfs_rtb_to_daddr(mp, rtbno), 1, 0,
+			&bp, NULL);
+	if (!error) {
+		libxfs_buf_relse(bp);
+		return true;
+	}
+
+	fprintf(stderr, _("%s: read of rtgroup %u failed\n"), progname,
+			sbp->sb_rgcount - 1);
+	if (!xfs_is_debugger(mp))
+		return false;
+
+	fprintf(stderr, _("%s: limiting reads to rtgroup 0\n"), progname);
+	sbp->sb_rgcount = 1;
+	return true;
+}
+
 /*
  * Mount structure initialization, provides a filled-in xfs_mount_t
  * such that the numerous XFS_* macros can be used.  If dev is zero,
@@ -878,6 +922,9 @@ libxfs_mount(
 			libxfs_buf_relse(bp);
 	}
 
+	if (sbp->sb_rgcount > 1000000 && !check_many_rtgroups(mp, sbp))
+		goto out_da;
+
 	error = libxfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
 			&mp->m_maxagi);
 	if (error) {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 24/52] xfs_repair: support realtime groups
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (27 preceding siblings ...)
  2023-12-31 23:53   ` [PATCH 23/52] libxfs: implement some sanity checking for enormous rgcount Darrick J. Wong
@ 2023-12-31 23:53   ` Darrick J. Wong
  2023-12-31 23:54   ` [PATCH 25/52] xfs_repair: improve rtbitmap discrepancy reporting Darrick J. Wong
                     ` (22 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:53 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support the realtime group feature.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/libxfs_api_defs.h |    2 +
 repair/agheader.c        |    2 +
 repair/incore.c          |   22 +++++++++++++++
 repair/phase3.c          |    3 ++
 repair/rt.c              |   69 ++++++++++++++++++++++++++++++++++++++++++++++
 repair/rt.h              |    3 ++
 repair/sb.c              |   37 +++++++++++++++++++++++++
 repair/xfs_repair.c      |   11 +++++++
 8 files changed, 149 insertions(+)


diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index fff0e6d0f60..a3503e07984 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -283,6 +283,8 @@
 #define xfs_rtsummary_wordcount		libxfs_rtsummary_wordcount
 
 #define xfs_rtfree_extent		libxfs_rtfree_extent
+#define xfs_rtgroup_update_secondary_sbs	libxfs_rtgroup_update_secondary_sbs
+#define xfs_rtgroup_update_super	libxfs_rtgroup_update_super
 #define xfs_sb_from_disk		libxfs_sb_from_disk
 #define xfs_sb_quota_from_disk		libxfs_sb_quota_from_disk
 #define xfs_sb_read_secondary		libxfs_sb_read_secondary
diff --git a/repair/agheader.c b/repair/agheader.c
index af88802ffdf..076860a4451 100644
--- a/repair/agheader.c
+++ b/repair/agheader.c
@@ -412,6 +412,8 @@ secondary_sb_whack(
 			 * super byte for byte.
 			 */
 			sb->sb_metadirino = mp->m_sb.sb_metadirino;
+			sb->sb_rgblocks = mp->m_sb.sb_rgblocks;
+			sb->sb_rgcount = mp->m_sb.sb_rgcount;
 		} else
 			do_warn(
 	_("would zero unused portion of %s superblock (AG #%u)\n"),
diff --git a/repair/incore.c b/repair/incore.c
index 06edaf0d605..27457a7c17e 100644
--- a/repair/incore.c
+++ b/repair/incore.c
@@ -195,6 +195,25 @@ set_rtbmap(
 	 (((uint64_t) state) << ((rtx % XR_BB_NUM) * XR_BB)));
 }
 
+static void
+rtgroups_init(
+	struct xfs_mount	*mp)
+{
+	xfs_rgnumber_t		rgno;
+
+	if (!xfs_has_rtgroups(mp) || !rt_bmap)
+		return;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		xfs_rtblock_t	start_rtx;
+
+		start_rtx = xfs_rgbno_to_rtb(mp, rgno, 0) /
+				mp->m_sb.sb_rextsize;
+
+		set_rtbmap(start_rtx, XR_E_INUSE_FS);
+	}
+}
+
 static void
 reset_rt_bmap(void)
 {
@@ -219,6 +238,8 @@ init_rt_bmap(
 			mp->m_sb.sb_rextents);
 		return;
 	}
+
+	rtgroups_init(mp);
 }
 
 static void
@@ -271,6 +292,7 @@ reset_bmaps(xfs_mount_t *mp)
 	}
 
 	reset_rt_bmap();
+	rtgroups_init(mp);
 }
 
 void
diff --git a/repair/phase3.c b/repair/phase3.c
index ca4dbee4743..19490dbe9bb 100644
--- a/repair/phase3.c
+++ b/repair/phase3.c
@@ -17,6 +17,7 @@
 #include "progress.h"
 #include "bmap.h"
 #include "threads.h"
+#include "rt.h"
 
 static void
 process_agi_unlinked(
@@ -116,6 +117,8 @@ phase3(
 
 	set_progress_msg(PROG_FMT_AGI_UNLINKED, (uint64_t) glob_agcount);
 
+	check_rtsupers(mp);
+
 	/* first clear the agi unlinked AGI list */
 	if (!no_modify) {
 		for (i = 0; i < mp->m_sb.sb_agcount; i++)
diff --git a/repair/rt.c b/repair/rt.c
index ded968b7d82..5576511e7a0 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -239,3 +239,72 @@ check_rtsummary(
 	check_rtfile_contents(mp, "rtsummary", mp->m_sb.sb_rsumino, sumcompute,
 			XFS_B_TO_FSB(mp, mp->m_rsumsize));
 }
+
+void
+check_rtsupers(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp;
+	xfs_rtblock_t		rtbno;
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	if (!xfs_has_rtgroups(mp))
+		return;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+		error = -libxfs_buf_read_uncached(mp->m_rtdev_targp,
+				xfs_rtb_to_daddr(mp, rtbno),
+				XFS_FSB_TO_BB(mp, 1), 0, &bp,
+				&xfs_rtsb_buf_ops);
+		if (!error) {
+			libxfs_buf_relse(bp);
+			continue;
+		}
+
+		if (no_modify) {
+			do_warn(
+	_("would rewrite realtime group %u superblock\n"),
+					rgno);
+		} else {
+			do_warn(
+	_("will rewrite realtime group %u superblock\n"),
+					rgno);
+			/*
+			 * Rewrite the primary rt superblock before an update
+			 * to the primary fs superblock trips over the rt super
+			 * being corrupt.
+			 */
+			if (rgno == 0)
+				rewrite_primary_rt_super(mp);
+		}
+	}
+}
+
+void
+rewrite_primary_rt_super(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*rtsb_bp;
+	struct xfs_buf		*sb_bp = libxfs_getsb(mp);
+	int			error;
+
+	if (!sb_bp)
+		do_error(
+ _("couldn't grab primary sb to update rt superblocks\n"));
+
+	error = -libxfs_buf_get_uncached(mp->m_rtdev_targp,
+			XFS_FSB_TO_BB(mp, 1), 0, &rtsb_bp);
+	if (error)
+		do_error(
+ _("couldn't grab primary rt superblock\n"));
+
+	rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR;
+	rtsb_bp->b_ops = &xfs_rtsb_buf_ops;
+
+	libxfs_rtgroup_update_super(rtsb_bp, sb_bp);
+	libxfs_buf_mark_dirty(rtsb_bp);
+	libxfs_buf_relse(rtsb_bp);
+	libxfs_buf_relse(sb_bp);
+}
diff --git a/repair/rt.h b/repair/rt.h
index 862695487bc..912150ce8be 100644
--- a/repair/rt.h
+++ b/repair/rt.h
@@ -16,5 +16,8 @@ int generate_rtinfo(struct xfs_mount *mp, union xfs_rtword_raw *words,
 
 void check_rtbitmap(struct xfs_mount *mp);
 void check_rtsummary(struct xfs_mount *mp);
+void check_rtsupers(struct xfs_mount *mp);
+
+void rewrite_primary_rt_super(struct xfs_mount *mp);
 
 #endif /* _XFS_REPAIR_RT_H_ */
diff --git a/repair/sb.c b/repair/sb.c
index 32602d84f3c..b0f85e9cff8 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -312,6 +312,37 @@ verify_sb_loginfo(
 	return true;
 }
 
+static int
+verify_sb_rtgroups(
+	struct xfs_sb		*sbp)
+{
+	uint64_t		groups;
+
+	if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+		return XR_BAD_RT_GEO_DATA;
+
+	if (sbp->sb_rgblocks > XFS_MAX_RGBLOCKS)
+		return XR_BAD_RT_GEO_DATA;
+
+	if (sbp->sb_rextsize == 0)
+		return XR_BAD_RT_GEO_DATA;
+
+	if (sbp->sb_rgblocks % sbp->sb_rextsize != 0)
+		return XR_BAD_RT_GEO_DATA;
+
+	if (sbp->sb_rgblocks < (sbp->sb_rextsize << 1))
+		return XR_BAD_RT_GEO_DATA;
+
+	if (sbp->sb_rgcount > XFS_MAX_RGNUMBER)
+		return XR_BAD_RT_GEO_DATA;
+
+	groups = howmany(sbp->sb_rblocks, sbp->sb_rgblocks);
+	if (groups != sbp->sb_rgcount)
+		return XR_BAD_RT_GEO_DATA;
+
+	return 0;
+}
+
 /*
  * verify a superblock -- does not verify root inode #
  *	can only check that geometry info is internally
@@ -520,6 +551,12 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
 	if (sb->sb_blocklog + sb->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG)
 		return XR_BAD_DIR_SIZE_DATA;
 
+	if (sb->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS) {
+		int err = verify_sb_rtgroups(sb);
+		if (err)
+			return err;
+	}
+
 	return(XR_OK);
 }
 
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index bab8aa70f7a..811e89317f1 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -27,6 +27,7 @@
 #include "bulkload.h"
 #include "quotacheck.h"
 #include "rcbag_btree.h"
+#include "rt.h"
 
 /*
  * option tables for getsubopt calls
@@ -1543,6 +1544,16 @@ _("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\
 				XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
 	}
 
+	/* Always rewrite the realtime superblocks. */
+	if (xfs_has_rtgroups(mp)) {
+		if (mp->m_sb.sb_rgcount > 0)
+			rewrite_primary_rt_super(mp);
+
+		error = -libxfs_rtgroup_update_secondary_sbs(mp);
+		if (error)
+			do_error(_("updating rt superblocks, err %d"), error);
+	}
+
 	/*
 	 * Done. Flush all cached buffers and inodes first to ensure all
 	 * verifiers are run (where we discover the max metadata LSN), reformat


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 25/52] xfs_repair: improve rtbitmap discrepancy reporting
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (28 preceding siblings ...)
  2023-12-31 23:53   ` [PATCH 24/52] xfs_repair: support realtime groups Darrick J. Wong
@ 2023-12-31 23:54   ` Darrick J. Wong
  2023-12-31 23:54   ` [PATCH 26/52] xfs_repair: repair rtbitmap block headers Darrick J. Wong
                     ` (21 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Improve the reporting of discrepancies in the realtime bitmap and
summary files by creating a separate helper function that will pinpoint
the exact (word) locations of mismatches.  This will help developers to
diagnose problems with the rtgroups feature and users to figure out
exactly what's bad in a filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/rt.c |   42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)


diff --git a/repair/rt.c b/repair/rt.c
index 5576511e7a0..f73760e9cc9 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -150,6 +150,44 @@ generate_rtinfo(
 	return(0);
 }
 
+static void
+check_rtwords(
+	struct xfs_mount	*mp,
+	const char		*filename,
+	unsigned long long	bno,
+	void			*ondisk,
+	void			*incore)
+{
+	unsigned int		wordcnt = mp->m_blockwsize;
+	union xfs_rtword_raw	*o = ondisk, *i = incore;
+	int			badstart = -1;
+	unsigned int		j;
+
+	if (memcmp(ondisk, incore, wordcnt << XFS_WORDLOG) == 0)
+		return;
+
+	for (j = 0; j < wordcnt; j++, o++, i++) {
+		if (o->old == i->old) {
+			/* Report a range of inconsistency that just ended. */
+			if (badstart >= 0)
+				do_warn(
+ _("discrepancy in %s at dblock 0x%llx words 0x%x-0x%x/0x%x\n"),
+					filename, bno, badstart, j - 1, wordcnt);
+			badstart = -1;
+			continue;
+		}
+
+		if (badstart == -1)
+			badstart = j;
+	}
+
+	if (badstart >= 0)
+		do_warn(
+ _("discrepancy in %s at dblock 0x%llx words 0x%x-0x%x/0x%x\n"),
+					filename, bno, badstart, wordcnt,
+					wordcnt);
+}
+
 static void
 check_rtfile_contents(
 	struct xfs_mount	*mp,
@@ -206,9 +244,7 @@ check_rtfile_contents(
 			break;
 		}
 
-		if (memcmp(bp->b_addr, buf, mp->m_blockwsize << XFS_WORDLOG))
-			do_warn(_("discrepancy in %s at dblock 0x%llx\n"),
-					filename, (unsigned long long)bno);
+		check_rtwords(mp, filename, bno, bp->b_addr, buf);
 
 		buf += XFS_FSB_TO_B(mp, map.br_blockcount);
 		bno += map.br_blockcount;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 26/52] xfs_repair: repair rtbitmap block headers
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (29 preceding siblings ...)
  2023-12-31 23:54   ` [PATCH 25/52] xfs_repair: improve rtbitmap discrepancy reporting Darrick J. Wong
@ 2023-12-31 23:54   ` Darrick J. Wong
  2023-12-31 23:54   ` [PATCH 27/52] xfs_repair: repair rtsummary " Darrick J. Wong
                     ` (20 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check and repair the new block headers attached to rtbitmap blocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   19 +++++++++++++++----
 repair/rt.c     |   47 +++++++++++++++++++++++++++++++++++------------
 repair/sb.c     |    8 +++++++-
 3 files changed, 57 insertions(+), 17 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index a99793b4d90..f3d687732b5 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -843,6 +843,7 @@ fill_rbmino(xfs_mount_t *mp)
 			.tp		= tp,
 		};
 		union xfs_rtword_raw	*ondisk;
+		xfs_daddr_t		daddr;
 
 		/*
 		 * fill the file one block at a time
@@ -857,11 +858,9 @@ fill_rbmino(xfs_mount_t *mp)
 
 		ASSERT(map.br_startblock != HOLESTARTBLOCK);
 
-		error = -libxfs_trans_read_buf(
-				mp, tp, mp->m_dev,
-				XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		daddr = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		error = -libxfs_trans_read_buf(mp, tp, mp->m_dev, daddr,
 				XFS_FSB_TO_BB(mp, 1), 1, &bp, NULL);
-
 		if (error) {
 			do_warn(
 _("can't access block %" PRIu64 " (fsbno %" PRIu64 ") of realtime bitmap inode %" PRIu64 "\n"),
@@ -873,6 +872,18 @@ _("can't access block %" PRIu64 " (fsbno %" PRIu64 ") of realtime bitmap inode %
 		ondisk = xfs_rbmblock_wordptr(&args, 0);
 		memcpy(ondisk, bmp, mp->m_blockwsize << XFS_WORDLOG);
 
+		if (xfs_has_rtgroups(mp)) {
+			struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+			bp->b_ops = &xfs_rtbitmap_buf_ops;
+			hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+			hdr->rt_owner = cpu_to_be64(ip->i_ino);
+			hdr->rt_lsn = 0;
+			hdr->rt_blkno = cpu_to_be64(daddr);
+			platform_uuid_copy(&hdr->rt_uuid,
+					&mp->m_sb.sb_meta_uuid);
+		}
+
 		libxfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 
 		bmp += mp->m_blockwsize;
diff --git a/repair/rt.c b/repair/rt.c
index f73760e9cc9..ecf86099b47 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -194,7 +194,8 @@ check_rtfile_contents(
 	const char		*filename,
 	xfs_ino_t		ino,
 	void			*buf,
-	xfs_fileoff_t		filelen)
+	xfs_fileoff_t		filelen,
+	const struct xfs_buf_ops *buf_ops)
 {
 	struct xfs_bmbt_irec	map;
 	struct xfs_buf		*bp;
@@ -216,12 +217,10 @@ check_rtfile_contents(
 	}
 
 	while (bno < filelen)  {
-		xfs_filblks_t	maplen;
+		xfs_daddr_t	daddr;
 		int		nmap = 1;
 
-		/* Read up to 1MB at a time. */
-		maplen = min(filelen - bno, XFS_B_TO_FSBT(mp, 1048576));
-		error = -libxfs_bmapi_read(ip, bno, maplen, &map, &nmap, 0);
+		error = -libxfs_bmapi_read(ip, bno, 1, &map, &nmap, 0);
 		if (error) {
 			do_warn(_("unable to read %s mapping, err %d\n"),
 					filename, error);
@@ -234,19 +233,39 @@ check_rtfile_contents(
 			break;
 		}
 
-		error = -libxfs_buf_read_uncached(mp->m_dev,
-				XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		daddr = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		error = -libxfs_buf_read_uncached(mp->m_dev, daddr,
 				XFS_FSB_TO_BB(mp, map.br_blockcount),
-				0, &bp, NULL);
+				0, &bp, buf_ops);
 		if (error) {
 			do_warn(_("unable to read %s at dblock 0x%llx, err %d\n"),
 					filename, (unsigned long long)bno, error);
 			break;
 		}
 
-		check_rtwords(mp, filename, bno, bp->b_addr, buf);
+		if (buf_ops == &xfs_rtbitmap_buf_ops) {
+			struct xfs_rtalloc_args		args = {
+				.mp			= mp,
+			};
+			struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+			union xfs_rtword_raw		*incore = buf;
+			union xfs_rtword_raw		*ondisk;
+
+			if (hdr->rt_owner != cpu_to_be64(ino)) {
+				do_warn(
+ _("corrupt owner in %s at dblock 0x%llx\n"),
+					filename, (unsigned long long)bno);
+			}
+
+			args.rbmbp = bp;
+			ondisk = xfs_rbmblock_wordptr(&args, 0);
+			check_rtwords(mp, filename, bno, ondisk, incore);
+			buf += mp->m_blockwsize << XFS_WORDLOG;
+		} else {
+			check_rtwords(mp, filename, bno, bp->b_addr, buf);
+			buf += XFS_FSB_TO_B(mp, map.br_blockcount);
+		}
 
-		buf += XFS_FSB_TO_B(mp, map.br_blockcount);
 		bno += map.br_blockcount;
 		libxfs_buf_relse(bp);
 	}
@@ -258,11 +277,15 @@ void
 check_rtbitmap(
 	struct xfs_mount	*mp)
 {
+	const struct xfs_buf_ops *buf_ops = NULL;
+
 	if (need_rbmino)
 		return;
+	if (xfs_has_rtgroups(mp))
+		buf_ops = &xfs_rtbitmap_buf_ops;
 
 	check_rtfile_contents(mp, "rtbitmap", mp->m_sb.sb_rbmino, btmcompute,
-			mp->m_sb.sb_rbmblocks);
+			mp->m_sb.sb_rbmblocks, buf_ops);
 }
 
 void
@@ -273,7 +296,7 @@ check_rtsummary(
 		return;
 
 	check_rtfile_contents(mp, "rtsummary", mp->m_sb.sb_rsumino, sumcompute,
-			XFS_B_TO_FSB(mp, mp->m_rsumsize));
+			XFS_B_TO_FSB(mp, mp->m_rsumsize), NULL);
 }
 
 void
diff --git a/repair/sb.c b/repair/sb.c
index b0f85e9cff8..4e39c40370d 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -504,6 +504,8 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
 		if (sb->sb_frextents != 0)
 			return(XR_BAD_RT_GEO_DATA);
 	} else  {
+		unsigned int	rbmblock_bytes = sb->sb_blocksize;
+
 		/*
 		 * if we have a real-time partition, sanity-check geometry
 		 */
@@ -517,8 +519,12 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
 							sb->sb_rextents))
 			return(XR_BAD_RT_GEO_DATA);
 
+		if (xfs_sb_is_v5(sb) &&
+		    (sb->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_RTGROUPS))
+			rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
 		if (sb->sb_rbmblocks != (xfs_extlen_t) howmany(sb->sb_rextents,
-						NBBY * sb->sb_blocksize))
+						NBBY * rbmblock_bytes))
 			return(XR_BAD_RT_GEO_DATA);
 	}
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 27/52] xfs_repair: repair rtsummary block headers
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (30 preceding siblings ...)
  2023-12-31 23:54   ` [PATCH 26/52] xfs_repair: repair rtbitmap block headers Darrick J. Wong
@ 2023-12-31 23:54   ` Darrick J. Wong
  2023-12-31 23:54   ` [PATCH 28/52] xfs_repair: support adding rtgroups to a filesystem Darrick J. Wong
                     ` (19 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Check and repair the new block headers attached to rtsummary blocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 repair/phase6.c |   18 +++++++++++++++---
 repair/rt.c     |   24 +++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 4 deletions(-)


diff --git a/repair/phase6.c b/repair/phase6.c
index f3d687732b5..63a2768d9c6 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -932,6 +932,7 @@ fill_rsumino(xfs_mount_t *mp)
 			.tp		= tp,
 		};
 		union xfs_suminfo_raw	*ondisk;
+		xfs_daddr_t		daddr;
 
 		/*
 		 * fill the file one block at a time
@@ -946,9 +947,8 @@ fill_rsumino(xfs_mount_t *mp)
 
 		ASSERT(map.br_startblock != HOLESTARTBLOCK);
 
-		error = -libxfs_trans_read_buf(
-				mp, tp, mp->m_dev,
-				XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		daddr = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		error = -libxfs_trans_read_buf(mp, tp, mp->m_dev, daddr,
 				XFS_FSB_TO_BB(mp, 1), 1, &bp, NULL);
 
 		if (error) {
@@ -963,6 +963,18 @@ _("can't access block %" PRIu64 " (fsbno %" PRIu64 ") of realtime summary inode
 		ondisk = xfs_rsumblock_infoptr(&args, 0);
 		memcpy(ondisk, smp, mp->m_blockwsize << XFS_WORDLOG);
 
+		if (xfs_has_rtgroups(mp)) {
+			struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+			bp->b_ops = &xfs_rtsummary_buf_ops;
+			hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+			hdr->rt_owner = cpu_to_be64(ip->i_ino);
+			hdr->rt_lsn = 0;
+			hdr->rt_blkno = cpu_to_be64(daddr);
+			platform_uuid_copy(&hdr->rt_uuid,
+					&mp->m_sb.sb_meta_uuid);
+		}
+
 		libxfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 
 		smp += mp->m_blockwsize;
diff --git a/repair/rt.c b/repair/rt.c
index ecf86099b47..0c282e36bf6 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -261,6 +261,24 @@ check_rtfile_contents(
 			ondisk = xfs_rbmblock_wordptr(&args, 0);
 			check_rtwords(mp, filename, bno, ondisk, incore);
 			buf += mp->m_blockwsize << XFS_WORDLOG;
+		} else if (buf_ops == &xfs_rtsummary_buf_ops) {
+			struct xfs_rtalloc_args		args = {
+				.mp			= mp,
+			};
+			struct xfs_rtbuf_blkinfo	*hdr = bp->b_addr;
+			union xfs_suminfo_raw		*incore = buf;
+			union xfs_suminfo_raw		*ondisk;
+
+			if (hdr->rt_owner != cpu_to_be64(ino)) {
+				do_warn(
+ _("corrupt owner in %s at dblock 0x%llx\n"),
+					filename, (unsigned long long)bno);
+			}
+
+			args.sumbp = bp;
+			ondisk = xfs_rsumblock_infoptr(&args, 0);
+			check_rtwords(mp, filename, bno, ondisk, incore);
+			buf += mp->m_blockwsize << XFS_WORDLOG;
 		} else {
 			check_rtwords(mp, filename, bno, bp->b_addr, buf);
 			buf += XFS_FSB_TO_B(mp, map.br_blockcount);
@@ -292,11 +310,15 @@ void
 check_rtsummary(
 	struct xfs_mount	*mp)
 {
+	const struct xfs_buf_ops *buf_ops = NULL;
+
 	if (need_rsumino)
 		return;
+	if (xfs_has_rtgroups(mp))
+		buf_ops = &xfs_rtsummary_buf_ops;
 
 	check_rtfile_contents(mp, "rtsummary", mp->m_sb.sb_rsumino, sumcompute,
-			XFS_B_TO_FSB(mp, mp->m_rsumsize), NULL);
+			XFS_B_TO_FSB(mp, mp->m_rsumsize), buf_ops);
 }
 
 void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 28/52] xfs_repair: support adding rtgroups to a filesystem
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (31 preceding siblings ...)
  2023-12-31 23:54   ` [PATCH 27/52] xfs_repair: repair rtsummary " Darrick J. Wong
@ 2023-12-31 23:54   ` Darrick J. Wong
  2023-12-31 23:55   ` [PATCH 29/52] xfs_db: listify the definition of enum typnm Darrick J. Wong
                     ` (18 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:54 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow users to add the rtgroups feature to a filesystem if the
filesystem does not already have a realtime volume attached.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 man/man8/xfs_admin.8 |    9 +++++++++
 repair/globals.c     |    1 +
 repair/globals.h     |    1 +
 repair/phase2.c      |   42 ++++++++++++++++++++++++++++++++++++++++++
 repair/xfs_repair.c  |   11 +++++++++++
 5 files changed, 64 insertions(+)


diff --git a/man/man8/xfs_admin.8 b/man/man8/xfs_admin.8
index 68b4bf62427..57e08bb8212 100644
--- a/man/man8/xfs_admin.8
+++ b/man/man8/xfs_admin.8
@@ -193,6 +193,15 @@ device.
 The filesystem cannot be downgraded after this feature is enabled.
 This upgrade can fail if any AG has less than 5% free space remaining.
 This feature is not upstream yet.
+.TP 0.4i
+.B rtgroups
+Create allocation groups for the realtime volume, to increase parallelism.
+This is required for reverse mapping btrees and reflink support on the realtime
+device.
+The filesystem cannot be downgraded after this feature is enabled.
+This upgrade is not possible if a realtime volume has already been added to the
+filesystem.
+This feature is not upstream yet.
 .RE
 .TP
 .BI \-U " uuid"
diff --git a/repair/globals.c b/repair/globals.c
index e3b2697127f..b121d6e2d6d 100644
--- a/repair/globals.c
+++ b/repair/globals.c
@@ -57,6 +57,7 @@ bool	add_reflink;		/* add reference count btrees */
 bool	add_rmapbt;		/* add reverse mapping btrees */
 bool	add_parent;		/* add parent pointers */
 bool	add_metadir;		/* add metadata directory tree */
+bool	add_rtgroups;		/* add realtime allocation groups */
 
 /* misc status variables */
 
diff --git a/repair/globals.h b/repair/globals.h
index 1c24e313b89..f5dcc11f410 100644
--- a/repair/globals.h
+++ b/repair/globals.h
@@ -98,6 +98,7 @@ extern bool	add_reflink;		/* add reference count btrees */
 extern bool	add_rmapbt;		/* add reverse mapping btrees */
 extern bool	add_parent;		/* add parent pointers */
 extern bool	add_metadir;		/* add metadata directory tree */
+extern bool	add_rtgroups;		/* add realtime allocation groups */
 
 /* misc status variables */
 
diff --git a/repair/phase2.c b/repair/phase2.c
index cc7ddad8240..4cb0d7946bf 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -16,6 +16,8 @@
 #include "scan.h"
 #include "quotacheck.h"
 
+#define TERABYTES(count, blog)	((uint64_t)(count) << (40 - (blog)))
+
 /* workaround craziness in the xlog routines */
 int xlog_recover_do_trans(struct xlog *log, struct xlog_recover *t, int p)
 {
@@ -352,6 +354,44 @@ set_metadir(
 	return true;
 }
 
+static bool
+set_rtgroups(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*new_sb)
+{
+	uint64_t		rgsize;
+
+	if (xfs_has_rtgroups(mp)) {
+		printf(_("Filesystem already supports realtime groups.\n"));
+		exit(0);
+	}
+
+	if (!xfs_has_metadir(mp)) {
+		printf(
+	_("Realtime allocation group feature only supported if metadir is enabled.\n"));
+		exit(0);
+	}
+
+	if (xfs_has_realtime(mp)) {
+		printf(
+	_("Realtime allocation group feature cannot be added to existing realtime volumes.\n"));
+		exit(0);
+	}
+
+	printf(_("Adding realtime groups to filesystem.\n"));
+	new_sb->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_RTGROUPS;
+	new_sb->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR;
+	new_sb->sb_rgcount = 0;
+	/*
+	 * The allocation group size is 1TB, rounded down to the nearest rt
+	 * extent.
+	 */
+	rgsize = TERABYTES(1, mp->m_sb.sb_blocklog);
+	rgsize -= rgsize % mp->m_sb.sb_rextsize;
+	new_sb->sb_rgblocks = rgsize;
+	return true;
+}
+
 struct check_state {
 	struct xfs_sb		sb;
 	uint64_t		features;
@@ -627,6 +667,8 @@ upgrade_filesystem(
 		dirty |= set_parent(mp, &new_sb);
 	if (add_metadir)
 		dirty |= set_metadir(mp, &new_sb);
+	if (add_rtgroups)
+		dirty |= set_rtgroups(mp, &new_sb);
 	if (!dirty)
 		return;
 
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index 811e89317f1..e3701f91470 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -75,6 +75,7 @@ enum c_opt_nums {
 	CONVERT_RMAPBT,
 	CONVERT_PARENT,
 	CONVERT_METADIR,
+	CONVERT_RTGROUPS,
 	C_MAX_OPTS,
 };
 
@@ -88,6 +89,7 @@ static char *c_opts[] = {
 	[CONVERT_RMAPBT]	= "rmapbt",
 	[CONVERT_PARENT]	= "parent",
 	[CONVERT_METADIR]	= "metadir",
+	[CONVERT_RTGROUPS]	= "rtgroups",
 	[C_MAX_OPTS]		= NULL,
 };
 
@@ -392,6 +394,15 @@ process_args(int argc, char **argv)
 		_("-c metadir only supports upgrades\n"));
 					add_metadir = true;
 					break;
+				case CONVERT_RTGROUPS:
+					if (!val)
+						do_abort(
+		_("-c rtgroups requires a parameter\n"));
+					if (strtol(val, NULL, 0) != 1)
+						do_abort(
+		_("-c rtgroups only supports upgrades\n"));
+					add_rtgroups = true;
+					break;
 				default:
 					unknown('c', val);
 					break;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 29/52] xfs_db: listify the definition of enum typnm
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (32 preceding siblings ...)
  2023-12-31 23:54   ` [PATCH 28/52] xfs_repair: support adding rtgroups to a filesystem Darrick J. Wong
@ 2023-12-31 23:55   ` Darrick J. Wong
  2023-12-31 23:55   ` [PATCH 30/52] xfs_db: support dumping realtime superblocks Darrick J. Wong
                     ` (17 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert the enum definition into a list so that future patches adding
things to enum typnm don't have to reflow the entire thing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/type.h |   29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)


diff --git a/db/type.h b/db/type.h
index 411bfe90dbc..397dcf5464c 100644
--- a/db/type.h
+++ b/db/type.h
@@ -11,11 +11,30 @@ struct field;
 
 typedef enum typnm
 {
-	TYP_AGF, TYP_AGFL, TYP_AGI, TYP_ATTR, TYP_BMAPBTA,
-	TYP_BMAPBTD, TYP_BNOBT, TYP_CNTBT, TYP_RMAPBT, TYP_REFCBT, TYP_DATA,
-	TYP_DIR2, TYP_DQBLK, TYP_INOBT, TYP_INODATA, TYP_INODE,
-	TYP_LOG, TYP_RTBITMAP, TYP_RTSUMMARY, TYP_SB, TYP_SYMLINK,
-	TYP_TEXT, TYP_FINOBT, TYP_NONE
+	TYP_AGF,
+	TYP_AGFL,
+	TYP_AGI,
+	TYP_ATTR,
+	TYP_BMAPBTA,
+	TYP_BMAPBTD,
+	TYP_BNOBT,
+	TYP_CNTBT,
+	TYP_RMAPBT,
+	TYP_REFCBT,
+	TYP_DATA,
+	TYP_DIR2,
+	TYP_DQBLK,
+	TYP_INOBT,
+	TYP_INODATA,
+	TYP_INODE,
+	TYP_LOG,
+	TYP_RTBITMAP,
+	TYP_RTSUMMARY,
+	TYP_SB,
+	TYP_SYMLINK,
+	TYP_TEXT,
+	TYP_FINOBT,
+	TYP_NONE
 } typnm_t;
 
 #define DB_FUZZ  2


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 30/52] xfs_db: support dumping realtime superblocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (33 preceding siblings ...)
  2023-12-31 23:55   ` [PATCH 29/52] xfs_db: listify the definition of enum typnm Darrick J. Wong
@ 2023-12-31 23:55   ` Darrick J. Wong
  2023-12-31 23:55   ` [PATCH 31/52] xfs_db: support changing the label and uuid of rt superblocks Darrick J. Wong
                     ` (16 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Allow debugging of realtime superblocks, and add the relevant fields in
the fs superblock that point us at the existence and location of the rt
supers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/Makefile  |    2 +
 db/command.c |    2 +
 db/field.c   |    8 ++++
 db/field.h   |    4 ++
 db/rtgroup.c |  115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 db/rtgroup.h |   15 ++++++++
 db/sb.c      |   15 ++++++++
 db/type.c    |    6 +++
 db/type.h    |    1 +
 9 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 db/rtgroup.c
 create mode 100644 db/rtgroup.h


diff --git a/db/Makefile b/db/Makefile
index 1511d4a3968..8ea89da9d51 100644
--- a/db/Makefile
+++ b/db/Makefile
@@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
 	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
 	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
 	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
-	fuzz.h obfuscate.h
+	fuzz.h obfuscate.h rtgroup.h
 CFILES = $(HFILES:.h=.c) bmap_inflate.c btdump.c btheight.c convert.c info.c \
 	iunlink.c namei.c timelimit.c
 LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/command.c b/db/command.c
index 6cda03e9856..1b46c3fec08 100644
--- a/db/command.c
+++ b/db/command.c
@@ -39,6 +39,7 @@
 #include "fsmap.h"
 #include "crc.h"
 #include "fuzz.h"
+#include "rtgroup.h"
 
 cmdinfo_t	*cmdtab;
 int		ncmds;
@@ -135,6 +136,7 @@ init_commands(void)
 	output_init();
 	print_init();
 	quit_init();
+	rtsb_init();
 	sb_init();
 	type_init();
 	write_init();
diff --git a/db/field.c b/db/field.c
index a3e47ee81cc..cee5c661595 100644
--- a/db/field.c
+++ b/db/field.c
@@ -23,6 +23,7 @@
 #include "dir2.h"
 #include "dir2sf.h"
 #include "symlink.h"
+#include "rtgroup.h"
 
 const ftattr_t	ftattrtab[] = {
 	{ FLDT_AGBLOCK, "agblock", fp_num, "%u", SI(bitsz(xfs_agblock_t)),
@@ -44,6 +45,11 @@ const ftattr_t	ftattrtab[] = {
 	{ FLDT_AGNUMBER, "agnumber", fp_num, "%u", SI(bitsz(xfs_agnumber_t)),
 	  FTARG_DONULL, NULL, NULL },
 
+	{ FLDT_RGBLOCK, "rgblock", fp_num, "%u", SI(bitsz(xfs_rgblock_t)),
+	  FTARG_DONULL, NULL, NULL },
+	{ FLDT_RGNUMBER, "rgnumber", fp_num, "%u", SI(bitsz(xfs_rgnumber_t)),
+	  FTARG_DONULL, NULL, NULL },
+
 /* attr fields */
 	{ FLDT_ATTR, "attr", NULL, (char *)attr_flds, attr_size, FTARG_SIZE,
 	  NULL, attr_flds },
@@ -347,6 +353,8 @@ const ftattr_t	ftattrtab[] = {
 	  NULL, NULL },
 	{ FLDT_SB, "sb", NULL, (char *)sb_flds, sb_size, FTARG_SIZE, NULL,
 	  sb_flds },
+	{ FLDT_RTSB, "rtsb", NULL, (char *)rtsb_flds, rtsb_size, FTARG_SIZE,
+	  NULL, rtsb_flds },
 
 /* CRC enabled symlink */
 	{ FLDT_SYMLINK_CRC, "symlink", NULL, (char *)symlink_crc_flds,
diff --git a/db/field.h b/db/field.h
index 634742a572c..226753490ad 100644
--- a/db/field.h
+++ b/db/field.h
@@ -15,6 +15,9 @@ typedef enum fldt	{
 	FLDT_AGINONN,
 	FLDT_AGNUMBER,
 
+	FLDT_RGBLOCK,
+	FLDT_RGNUMBER,
+
 	/* attr fields */
 	FLDT_ATTR,
 	FLDT_ATTR_BLKINFO,
@@ -166,6 +169,7 @@ typedef enum fldt	{
 	FLDT_QCNT,
 	FLDT_QWARNCNT,
 	FLDT_SB,
+	FLDT_RTSB,
 
 	/* CRC enabled symlink */
 	FLDT_SYMLINK_CRC,
diff --git a/db/rtgroup.c b/db/rtgroup.c
new file mode 100644
index 00000000000..4a719215d6a
--- /dev/null
+++ b/db/rtgroup.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs.h"
+#include "libxlog.h"
+#include "command.h"
+#include "type.h"
+#include "faddr.h"
+#include "fprint.h"
+#include "field.h"
+#include "io.h"
+#include "sb.h"
+#include "bit.h"
+#include "output.h"
+#include "init.h"
+#include "rtgroup.h"
+
+#define uuid_equal(s,d)		(platform_uuid_compare((s),(d)) == 0)
+
+static int	rtsb_f(int argc, char **argv);
+static void     rtsb_help(void);
+
+static const cmdinfo_t	rtsb_cmd =
+	{ "rtsb", NULL, rtsb_f, 0, 1, 1, N_("[rgno]"),
+	  N_("set current address to realtime sb header"), rtsb_help };
+
+void
+rtsb_init(void)
+{
+	if (xfs_has_rtgroups(mp))
+		add_command(&rtsb_cmd);
+}
+
+#define	OFF(f)	bitize(offsetof(struct xfs_rtsb, rsb_ ## f))
+#define	SZC(f)	szcount(struct xfs_rtsb, rsb_ ## f)
+const field_t	rtsb_flds[] = {
+	{ "magicnum", FLDT_UINT32X, OI(OFF(magicnum)), C1, 0, TYP_NONE },
+	{ "blocksize", FLDT_UINT32D, OI(OFF(blocksize)), C1, 0, TYP_NONE },
+	{ "rblocks", FLDT_DRFSBNO, OI(OFF(rblocks)), C1, 0, TYP_NONE },
+	{ "rextents", FLDT_DRTBNO, OI(OFF(rextents)), C1, 0, TYP_NONE },
+	{ "uuid", FLDT_UUID, OI(OFF(uuid)), C1, 0, TYP_NONE },
+	{ "rextsize", FLDT_AGBLOCK, OI(OFF(rextsize)), C1, 0, TYP_NONE },
+	{ "rgblocks", FLDT_RGBLOCK, OI(OFF(rgblocks)), C1, 0, TYP_NONE },
+	{ "rgcount", FLDT_RGNUMBER, OI(OFF(rgcount)), C1, 0, TYP_NONE },
+	{ "rbmblocks", FLDT_EXTLEN, OI(OFF(rbmblocks)), C1, 0, TYP_NONE },
+	{ "fname", FLDT_CHARNS, OI(OFF(fname)), CI(SZC(fname)), 0, TYP_NONE },
+	{ "blocklog", FLDT_UINT8D, OI(OFF(blocklog)), C1, 0, TYP_NONE },
+	{ "sectlog", FLDT_UINT8D, OI(OFF(sectlog)), C1, 0, TYP_NONE },
+	{ "rextslog", FLDT_UINT8D, OI(OFF(rextslog)), C1, 0, TYP_NONE },
+	{ "crc", FLDT_CRC, OI(OFF(crc)), C1, 0, TYP_NONE },
+	{ "lsn", FLDT_UINT64X, OI(OFF(lsn)), C1, 0, TYP_NONE },
+	{ "meta_uuid", FLDT_UUID, OI(OFF(meta_uuid)), C1, 0, TYP_NONE },
+	{ NULL }
+};
+
+const field_t	rtsb_hfld[] = {
+	{ "", FLDT_RTSB, OI(0), C1, 0, TYP_NONE },
+	{ NULL }
+};
+
+static void
+rtsb_help(void)
+{
+	dbprintf(_(
+"\n"
+" set realtime group superblock\n"
+"\n"
+" Example:\n"
+"\n"
+" 'rtsb 7' - set location to 7th realtime group superblock, set type to 'rtsb'\n"
+"\n"
+" Located in the first block of each realtime group, the rt superblock\n"
+" contains the base information for the realtime section of a filesystem.\n"
+" The superblock in allocation group 0 is the primary.  The copies in the\n"
+" remaining realtime groups only serve as backup for filesystem recovery.\n"
+"\n"
+));
+}
+
+static int
+rtsb_f(
+	int		argc,
+	char		**argv)
+{
+	xfs_rtblock_t	rtbno;
+	xfs_rgnumber_t	rgno = 0;
+	char		*p;
+
+	if (argc > 1) {
+		rgno = (xfs_rgnumber_t)strtoul(argv[1], &p, 0);
+		if (*p != '\0' || rgno >= mp->m_sb.sb_rgcount) {
+			dbprintf(_("bad realtime group number %s\n"), argv[1]);
+			return 0;
+		}
+	}
+	cur_agno = NULLAGNUMBER;
+
+	rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+
+	ASSERT(typtab[TYP_RTSB].typnm == TYP_RTSB);
+	set_rt_cur(&typtab[TYP_RTSB], xfs_rtb_to_daddr(mp, rtbno),
+			XFS_FSB_TO_BB(mp, 1), DB_RING_ADD, NULL);
+	return 0;
+}
+
+int
+rtsb_size(
+	void	*obj,
+	int	startoff,
+	int	idx)
+{
+	return bitize(mp->m_sb.sb_blocksize);
+}
diff --git a/db/rtgroup.h b/db/rtgroup.h
new file mode 100644
index 00000000000..85960a3fb9f
--- /dev/null
+++ b/db/rtgroup.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef DB_RTGROUP_H_
+#define DB_RTGROUP_H_
+
+extern const struct field	rtsb_flds[];
+extern const struct field	rtsb_hfld[];
+
+extern void	rtsb_init(void);
+extern int	rtsb_size(void *obj, int startoff, int idx);
+
+#endif /* DB_RTGROUP_H_ */
diff --git a/db/sb.c b/db/sb.c
index 2ad032cc81a..d315bb1dd28 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -74,6 +74,17 @@ rootino_count(
 	return xfs_has_metadir(mp) ? 0 : 1;
 }
 
+/*
+ * Counts superblock fields that only exist when realtime groups are enabled.
+ */
+static int
+rtgroups_count(
+	void		*obj,
+	int		startoff)
+{
+	return xfs_has_rtgroups(mp) ? 1 : 0;
+}
+
 #define	OFF(f)	bitize(offsetof(struct xfs_dsb, sb_ ## f))
 #define	SZC(f)	szcount(struct xfs_dsb, sb_ ## f)
 const field_t	sb_flds[] = {
@@ -91,6 +102,10 @@ const field_t	sb_flds[] = {
 	  TYP_INODE },
 	{ "rsumino", FLDT_INO, OI(OFF(rsumino)), rootino_count, FLD_COUNT,
 	  TYP_INODE },
+	{ "rgblocks", FLDT_RGBLOCK, OI(OFF(rgblocks)), rtgroups_count,
+	  FLD_COUNT, TYP_NONE },
+	{ "rgcount", FLDT_RGNUMBER, OI(OFF(rgcount)), rtgroups_count,
+	  FLD_COUNT, TYP_NONE },
 	{ "rextsize", FLDT_AGBLOCK, OI(OFF(rextsize)), C1, 0, TYP_NONE },
 	{ "agblocks", FLDT_AGBLOCK, OI(OFF(agblocks)), C1, 0, TYP_NONE },
 	{ "agcount", FLDT_AGNUMBER, OI(OFF(agcount)), C1, 0, TYP_NONE },
diff --git a/db/type.c b/db/type.c
index efe7044569d..d875c0c6365 100644
--- a/db/type.c
+++ b/db/type.c
@@ -28,6 +28,7 @@
 #include "text.h"
 #include "symlink.h"
 #include "fuzz.h"
+#include "rtgroup.h"
 
 static const typ_t	*findtyp(char *name);
 static int		type_f(int argc, char **argv);
@@ -60,6 +61,7 @@ static const typ_t	__typtab[] = {
 	{ TYP_LOG, "log", NULL, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTBITMAP, "rtbitmap", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTSUMMARY, "rtsummary", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
+	{ TYP_RTSB, "rtsb", handle_struct, rtsb_hfld, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_SB, "sb", handle_struct, sb_hfld, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_SYMLINK, "symlink", handle_string, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_TEXT, "text", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
@@ -102,6 +104,8 @@ static const typ_t	__typtab_crc[] = {
 	{ TYP_LOG, "log", NULL, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTBITMAP, "rtbitmap", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTSUMMARY, "rtsummary", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
+	{ TYP_RTSB, "rtsb", handle_struct, rtsb_hfld, &xfs_rtsb_buf_ops,
+		XFS_SB_CRC_OFF },
 	{ TYP_SB, "sb", handle_struct, sb_hfld, &xfs_sb_buf_ops,
 		XFS_SB_CRC_OFF },
 	{ TYP_SYMLINK, "symlink", handle_struct, symlink_crc_hfld,
@@ -146,6 +150,8 @@ static const typ_t	__typtab_spcrc[] = {
 	{ TYP_LOG, "log", NULL, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTBITMAP, "rtbitmap", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_RTSUMMARY, "rtsummary", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
+	{ TYP_RTSB, "rtsb", handle_struct, rtsb_hfld, &xfs_rtsb_buf_ops,
+		XFS_SB_CRC_OFF },
 	{ TYP_SB, "sb", handle_struct, sb_hfld, &xfs_sb_buf_ops,
 		XFS_SB_CRC_OFF },
 	{ TYP_SYMLINK, "symlink", handle_struct, symlink_crc_hfld,
diff --git a/db/type.h b/db/type.h
index 397dcf5464c..d4efa4b0fab 100644
--- a/db/type.h
+++ b/db/type.h
@@ -30,6 +30,7 @@ typedef enum typnm
 	TYP_LOG,
 	TYP_RTBITMAP,
 	TYP_RTSUMMARY,
+	TYP_RTSB,
 	TYP_SB,
 	TYP_SYMLINK,
 	TYP_TEXT,


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 31/52] xfs_db: support changing the label and uuid of rt superblocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (34 preceding siblings ...)
  2023-12-31 23:55   ` [PATCH 30/52] xfs_db: support dumping realtime superblocks Darrick J. Wong
@ 2023-12-31 23:55   ` Darrick J. Wong
  2023-12-31 23:55   ` [PATCH 32/52] xfs_db: listify the definition of dbm_t Darrick J. Wong
                     ` (15 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Update the label and uuid commands to change the rt superblocks along
with the filesystem superblocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/sb.c |  119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 108 insertions(+), 11 deletions(-)


diff --git a/db/sb.c b/db/sb.c
index d315bb1dd28..ce04f489adf 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -27,6 +27,7 @@ static int	label_f(int argc, char **argv);
 static void     label_help(void);
 static int	version_f(int argc, char **argv);
 static void     version_help(void);
+static size_t check_label(char *label, bool can_warn);
 
 static const cmdinfo_t	sb_cmd =
 	{ "sb", NULL, sb_f, 0, 1, 1, N_("[agno]"),
@@ -356,6 +357,77 @@ uuid_help(void)
 ));
 }
 
+static bool
+check_rtgroup_update_problems(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	if (!xfs_has_rtgroups(mp) || mp->m_sb.sb_rgcount == 0)
+		return false;
+
+	push_cur();
+	error = set_rt_cur(&typtab[TYP_RTSB], XFS_RTSB_DADDR,
+			XFS_FSB_TO_BB(mp, 1), DB_RING_ADD, NULL);
+	if (error == ENODEV) {
+		/* no rt dev means we should just bail out */
+		pop_cur();
+		return true;
+	}
+
+	pop_cur();
+	return false;
+}
+
+static int
+update_rt_supers(
+	struct xfs_mount	*mp,
+	uuid_t			*uuid,
+	char			*label)
+{
+	uuid_t			old_uuid;
+	xfs_rgnumber_t		rgno;
+	int			error;
+
+	if (uuid)
+		memcpy(&old_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		struct xfs_rtsb	*rsb;
+		xfs_rtblock_t	rtbno;
+
+		push_cur();
+		rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+		error = set_rt_cur(&typtab[TYP_RTSB],
+				xfs_rtb_to_daddr(mp, rtbno),
+				XFS_FSB_TO_BB(mp, 1), DB_RING_ADD, NULL);
+		if (error == ENODEV) {
+			/* no rt dev means we should just bail out */
+			exitcode = 1;
+			pop_cur();
+			return 1;
+		}
+
+		rsb = iocur_top->data;
+		if (label) {
+			size_t	len = check_label(label, false);
+
+			memset(&rsb->rsb_fname, 0, XFSLABEL_MAX);
+			memcpy(&rsb->rsb_fname, label, len);
+		}
+		if (uuid) {
+			memcpy(&mp->m_sb.sb_uuid, uuid, sizeof(uuid_t));
+			memcpy(&rsb->rsb_uuid, uuid, sizeof(rsb->rsb_uuid));
+		}
+		write_cur();
+		if (uuid)
+			memcpy(&mp->m_sb.sb_uuid, &old_uuid, sizeof(uuid_t));
+		pop_cur();
+	}
+
+	return 0;
+}
+
 static uuid_t *
 do_uuid(xfs_agnumber_t agno, uuid_t *uuid)
 {
@@ -462,11 +534,18 @@ uuid_f(
 			}
 		}
 
+		if (check_rtgroup_update_problems(mp)) {
+			exitcode = 1;
+			return 0;
+		}
+
 		/* clear the log (setting uuid) if it's not dirty */
 		if (!sb_logzero(&uu))
 			return 0;
 
 		dbprintf(_("writing all SBs\n"));
+		if (update_rt_supers(mp, &uu, NULL))
+			return 1;
 		for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
 			if (!do_uuid(agno, &uu)) {
 				dbprintf(_("failed to set UUID in AG %d\n"), agno);
@@ -535,6 +614,27 @@ label_help(void)
 ));
 }
 
+static size_t
+check_label(
+	char	*label,
+	bool	can_warn)
+{
+	size_t	len = strlen(label);
+
+	if (len > XFSLABEL_MAX) {
+		if (can_warn)
+			dbprintf(_("%s: truncating label length from %d to %d\n"),
+				progname, (int)len, XFSLABEL_MAX);
+		len = XFSLABEL_MAX;
+	}
+	if ( len == 2 &&
+	     (strcmp(label, "\"\"") == 0 ||
+	      strcmp(label, "''")   == 0 ||
+	      strcmp(label, "--")   == 0) )
+		label[0] = label[1] = '\0';
+	return len;
+}
+
 static char *
 do_label(xfs_agnumber_t agno, char *label)
 {
@@ -553,17 +653,7 @@ do_label(xfs_agnumber_t agno, char *label)
 		return &lbl[0];
 	}
 	/* set label */
-	if ((len = strlen(label)) > sizeof(tsb.sb_fname)) {
-		if (agno == 0)
-			dbprintf(_("%s: truncating label length from %d to %d\n"),
-				progname, (int)len, (int)sizeof(tsb.sb_fname));
-		len = sizeof(tsb.sb_fname);
-	}
-	if ( len == 2 &&
-	     (strcmp(label, "\"\"") == 0 ||
-	      strcmp(label, "''")   == 0 ||
-	      strcmp(label, "--")   == 0) )
-		label[0] = label[1] = '\0';
+	len = check_label(label, agno == 0);
 	memset(&tsb.sb_fname, 0, sizeof(tsb.sb_fname));
 	memcpy(&tsb.sb_fname, label, len);
 	memcpy(&lbl[0], &tsb.sb_fname, sizeof(tsb.sb_fname));
@@ -600,7 +690,14 @@ label_f(
 			return 0;
 		}
 
+		if (check_rtgroup_update_problems(mp)) {
+			exitcode = 1;
+			return 0;
+		}
+
 		dbprintf(_("writing all SBs\n"));
+		if (update_rt_supers(mp, NULL, argv[1]))
+			return 1;
 		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++)
 			if ((p = do_label(ag, argv[1])) == NULL) {
 				dbprintf(_("failed to set label in AG %d\n"), ag);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 32/52] xfs_db: listify the definition of dbm_t
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (35 preceding siblings ...)
  2023-12-31 23:55   ` [PATCH 31/52] xfs_db: support changing the label and uuid of rt superblocks Darrick J. Wong
@ 2023-12-31 23:55   ` Darrick J. Wong
  2023-12-31 23:56   ` [PATCH 33/52] xfs_db: implement check for rt superblocks Darrick J. Wong
                     ` (14 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:55 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Convert this enum definition to a list so that code adding elements to
the enum do not have to reflow the whole thing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |   38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)


diff --git a/db/check.c b/db/check.c
index 20de9c74d4a..0c29c38eee0 100644
--- a/db/check.c
+++ b/db/check.c
@@ -26,14 +26,36 @@ typedef enum {
 } qtype_t;
 
 typedef enum {
-	DBM_UNKNOWN,	DBM_AGF,	DBM_AGFL,	DBM_AGI,
-	DBM_ATTR,	DBM_BTBMAPA,	DBM_BTBMAPD,	DBM_BTBNO,
-	DBM_BTCNT,	DBM_BTINO,	DBM_DATA,	DBM_DIR,
-	DBM_FREE1,	DBM_FREE2,	DBM_FREELIST,	DBM_INODE,
-	DBM_LOG,	DBM_MISSING,	DBM_QUOTA,	DBM_RTBITMAP,
-	DBM_RTDATA,	DBM_RTFREE,	DBM_RTSUM,	DBM_SB,
-	DBM_SYMLINK,	DBM_BTFINO,	DBM_BTRMAP,	DBM_BTREFC,
-	DBM_RLDATA,	DBM_COWDATA,
+	DBM_UNKNOWN,
+	DBM_AGF,
+	DBM_AGFL,
+	DBM_AGI,
+	DBM_ATTR,
+	DBM_BTBMAPA,
+	DBM_BTBMAPD,
+	DBM_BTBNO,
+	DBM_BTCNT,
+	DBM_BTINO,
+	DBM_DATA,
+	DBM_DIR,
+	DBM_FREE1,
+	DBM_FREE2,
+	DBM_FREELIST,
+	DBM_INODE,
+	DBM_LOG,
+	DBM_MISSING,
+	DBM_QUOTA,
+	DBM_RTBITMAP,
+	DBM_RTDATA,
+	DBM_RTFREE,
+	DBM_RTSUM,
+	DBM_SB,
+	DBM_SYMLINK,
+	DBM_BTFINO,
+	DBM_BTRMAP,
+	DBM_BTREFC,
+	DBM_RLDATA,
+	DBM_COWDATA,
 	DBM_NDBM
 } dbm_t;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 33/52] xfs_db: implement check for rt superblocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (36 preceding siblings ...)
  2023-12-31 23:55   ` [PATCH 32/52] xfs_db: listify the definition of dbm_t Darrick J. Wong
@ 2023-12-31 23:56   ` Darrick J. Wong
  2023-12-31 23:56   ` [PATCH 34/52] xfs_db: enable conversion of rt space units Darrick J. Wong
                     ` (13 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:56 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Implement the bare minimum needed to avoid xfs_check regressions when
realtime groups are enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/check.c |   20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)


diff --git a/db/check.c b/db/check.c
index 0c29c38eee0..7d0687a9db7 100644
--- a/db/check.c
+++ b/db/check.c
@@ -56,6 +56,7 @@ typedef enum {
 	DBM_BTREFC,
 	DBM_RLDATA,
 	DBM_COWDATA,
+	DBM_RTSB,
 	DBM_NDBM
 } dbm_t;
 
@@ -187,6 +188,7 @@ static const char	*typename[] = {
 	"btrefcnt",
 	"rldata",
 	"cowdata",
+	"rtsb",
 	NULL
 };
 
@@ -809,6 +811,23 @@ blockfree_f(
 	return 0;
 }
 
+static void
+rtgroups_init(
+	struct xfs_mount	*mp)
+{
+	xfs_rgnumber_t		rgno;
+
+	if (!xfs_has_rtgroups(mp))
+		return;
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		xfs_rtblock_t	rtbno;
+
+		rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+		set_rdbmap(rtbno, mp->m_sb.sb_rextsize, DBM_RTSB);
+	}
+}
+
 /*
  * Check consistency of xfs filesystem contents.
  */
@@ -843,6 +862,7 @@ blockget_f(
 				 "filesystem.\n"));
 		}
 	}
+	rtgroups_init(mp);
 	if (blist_size) {
 		xfree(blist);
 		blist = NULL;


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 34/52] xfs_db: enable conversion of rt space units
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (37 preceding siblings ...)
  2023-12-31 23:56   ` [PATCH 33/52] xfs_db: implement check for rt superblocks Darrick J. Wong
@ 2023-12-31 23:56   ` Darrick J. Wong
  2023-12-31 23:56   ` [PATCH 35/52] xfs_db: report rtgroups via version command Darrick J. Wong
                     ` (12 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:56 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the xfs_db convert function about realtime extents, blocks, and
realtime group numbers.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c      |   38 +++++++++++++++++++++++++++++++++++++-
 man/man8/xfs_db.8 |   17 +++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)


diff --git a/db/convert.c b/db/convert.c
index 7c0b1edc235..71d4a7f31b0 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -34,6 +34,10 @@
 	rtblock_to_bytes(rtx_to_rtblock(xfs_rbmblock_to_rtx(mp, (uint64_t)x)))
 #define rbmword_to_bytes(x)	\
 	rtblock_to_bytes(rtx_to_rtblock((uint64_t)(x) << XFS_NBWORDLOG))
+#define	rgblock_to_bytes(x)	\
+	((uint64_t)(x) << mp->m_sb.sb_blocklog)
+#define	rgnumber_to_bytes(x)	\
+	rgblock_to_bytes((uint64_t)(x) * mp->m_sb.sb_rgblocks)
 
 typedef enum {
 	CT_NONE = -1,
@@ -55,6 +59,8 @@ typedef enum {
 	CT_RSUMBLOCK,		/* block within rt summary */
 	CT_RSUMLOG,		/* log level for rtsummary computations */
 	CT_RSUMINFO,		/* info word within rt summary */
+	CT_RGBLOCK,		/* xfs_rgblock_t */
+	CT_RGNUMBER,		/* xfs_rgno_t */
 	NCTS
 } ctype_t;
 
@@ -80,6 +86,8 @@ typedef union {
 	xfs_fileoff_t	rbmblock;
 	unsigned int	rbmword;
 	xfs_fileoff_t	rsumblock;
+	xfs_rgnumber_t	rgnumber;
+	xfs_rgblock_t	rgblock;
 } cval_t;
 
 static uint64_t		bytevalue(ctype_t ctype, cval_t *val);
@@ -95,7 +103,7 @@ static const char	*agnumber_names[] = { "agnumber", "agno", NULL };
 static const char	*bboff_names[] = { "bboff", "daddroff", NULL };
 static const char	*blkoff_names[] = { "blkoff", "fsboff", "agboff",
 					    NULL };
-static const char	*rtblkoff_names[] = { "blkoff", "rtboff",
+static const char	*rtblkoff_names[] = { "blkoff", "rtboff", "rgboff",
 					    NULL };
 static const char	*byte_names[] = { "byte", "fsbyte", NULL };
 static const char	*daddr_names[] = { "daddr", "bb", NULL };
@@ -111,6 +119,8 @@ static const char	*rbmword_names[] = { "rbmword", "rbmw", NULL };
 static const char	*rsumblock_names[] = { "rsumblock", "rsmb", NULL };
 static const char	*rsumlog_names[] = { "rsumlog", "rsml", NULL };
 static const char	*rsumword_names[] = { "rsuminfo", "rsmi", NULL };
+static const char	*rgblock_names[] = { "rgblock", "rgbno", NULL };
+static const char	*rgnumber_names[] = { "rgnumber", "rgno", NULL };
 
 static int		rsuminfo;
 static int		rsumlog;
@@ -208,6 +218,14 @@ static const ctydesc_t	ctydescs_rt[NCTS] = {
 		.allowed = M(RSUMBLOCK),
 		.names   = rsumword_names,
 	},
+	[CT_RGBLOCK] = {
+		.allowed = M(RGNUMBER)|M(BBOFF)|M(BLKOFF)|M(RSUMLOG),
+		.names   = rgblock_names,
+	},
+	[CT_RGNUMBER] = {
+		.allowed = M(RGBLOCK)|M(BBOFF)|M(BLKOFF)|M(RSUMLOG),
+		.names   = rgnumber_names,
+	},
 };
 
 static const cmdinfo_t	convert_cmd =
@@ -295,6 +313,10 @@ bytevalue(ctype_t ctype, cval_t *val)
 		 * value.
 		 */
 		return 0;
+	case CT_RGBLOCK:
+		return rgblock_to_bytes(val->rgblock);
+	case CT_RGNUMBER:
+		return rgnumber_to_bytes(val->rgnumber);
 	case CT_NONE:
 	case NCTS:
 		break;
@@ -401,6 +423,8 @@ convert_f(int argc, char **argv)
 	case CT_RSUMBLOCK:
 	case CT_RSUMLOG:
 	case CT_RSUMINFO:
+	case CT_RGBLOCK:
+	case CT_RGNUMBER:
 		/* shouldn't get here */
 		ASSERT(0);
 		break;
@@ -551,6 +575,12 @@ rtconvert_f(int argc, char **argv)
 	case CT_RSUMINFO:
 		v = rt_daddr_to_rsuminfo(mp, v);
 		break;
+	case CT_RGBLOCK:
+		v = xfs_daddr_to_rgbno(mp, v >> BBSHIFT);
+		break;
+	case CT_RGNUMBER:
+		v = xfs_daddr_to_rgno(mp, v >> BBSHIFT);
+		break;
 	case CT_AGBLOCK:
 	case CT_AGINO:
 	case CT_AGNUMBER:
@@ -643,6 +673,12 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 	case CT_RSUMINFO:
 		rsuminfo = (unsigned int)v;
 		break;
+	case CT_RGBLOCK:
+		val->rgblock = (xfs_rgblock_t)v;
+		break;
+	case CT_RGNUMBER:
+		val->rgnumber = (xfs_rgnumber_t)v;
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 15a30c11ae5..d0115075888 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1077,6 +1077,16 @@ with alternate names, are:
 .RS 1.0i
 .PD 0
 .HP
+.B rgblock
+or
+.B rgbno
+(realtime block within a realtime group)
+.HP
+.B rgnumber
+or
+.B rgno
+(realtime group number)
+.HP
 .B bboff
 or
 .B daddroff
@@ -1144,6 +1154,13 @@ or
 .RE
 .IP
 Only conversions that "make sense" are allowed.
+The compound form (with more than three arguments) is useful for
+conversions such as
+.B convert rgno
+.I rg
+.B rgbno
+.I rgb
+.BR rtblock .
 
 Realtime summary file location conversions have the following rules:
 Each info word in the rt summary file counts the number of free extents of a


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 35/52] xfs_db: report rtgroups via version command
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (38 preceding siblings ...)
  2023-12-31 23:56   ` [PATCH 34/52] xfs_db: enable conversion of rt space units Darrick J. Wong
@ 2023-12-31 23:56   ` Darrick J. Wong
  2023-12-31 23:56   ` [PATCH 36/52] xfs_db: metadump realtime devices Darrick J. Wong
                     ` (11 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:56 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Report the rtgroups feature in the version command output.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/sb.c |    2 ++
 1 file changed, 2 insertions(+)


diff --git a/db/sb.c b/db/sb.c
index ce04f489adf..1d6ca2fa939 100644
--- a/db/sb.c
+++ b/db/sb.c
@@ -853,6 +853,8 @@ version_string(
 		strcat(s, ",PARENT");
 	if (xfs_has_metadir(mp))
 		strcat(s, ",METADIR");
+	if (xfs_has_rtgroups(mp))
+		strcat(s, ",RTGROUPS");
 	return s;
 }
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 36/52] xfs_db: metadump realtime devices
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (39 preceding siblings ...)
  2023-12-31 23:56   ` [PATCH 35/52] xfs_db: report rtgroups via version command Darrick J. Wong
@ 2023-12-31 23:56   ` Darrick J. Wong
  2023-12-31 23:57   ` [PATCH 37/52] xfs_db: dump rt bitmap blocks Darrick J. Wong
                     ` (10 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:56 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the metadump device to dump the filesystem metadata of a realtime
device to the metadump file.  Currently, this is limited to the rt group
superblocks.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/metadump.c             |   61 +++++++++++++++++++++++++++++++++++++++++++++
 db/xfs_metadump.sh        |    5 ++--
 include/xfs_metadump.h    |    8 ++++++
 man/man8/xfs_metadump.8   |   11 ++++++++
 mdrestore/xfs_mdrestore.c |    5 +++-
 5 files changed, 87 insertions(+), 3 deletions(-)


diff --git a/db/metadump.c b/db/metadump.c
index 714be862231..be4cc01ff26 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -85,6 +85,7 @@ static struct metadump {
 	bool			dirty_log;
 	bool			external_log;
 	bool			stdout_metadump;
+	bool			realtime_data;
 	xfs_ino_t		cur_ino;
 	/* Metadump file */
 	FILE			*outf;
@@ -3099,6 +3100,7 @@ init_metadump_v2(void)
 {
 	struct xfs_metadump_header	xmh = {0};
 	uint32_t			compat_flags = 0;
+	uint32_t			incompat_flags = 0;
 
 	xmh.xmh_magic = cpu_to_be32(XFS_MD_MAGIC_V2);
 	xmh.xmh_version = cpu_to_be32(2);
@@ -3111,8 +3113,11 @@ init_metadump_v2(void)
 		compat_flags |= XFS_MD2_COMPAT_DIRTYLOG;
 	if (metadump.external_log)
 		compat_flags |= XFS_MD2_COMPAT_EXTERNALLOG;
+	if (metadump.realtime_data)
+		incompat_flags |= XFS_MD2_INCOMPAT_RTDEVICE;
 
 	xmh.xmh_compat_flags = cpu_to_be32(compat_flags);
+	xmh.xmh_incompat_flags = cpu_to_be32(incompat_flags);
 
 	if (fwrite(&xmh, sizeof(xmh), 1, metadump.outf) != 1) {
 		print_warning("error writing to target file");
@@ -3122,6 +3127,39 @@ init_metadump_v2(void)
 	return 0;
 }
 
+static int
+copy_rtsupers(void)
+{
+	int		error;
+	xfs_rtblock_t	rtbno;
+	xfs_rgnumber_t	rgno = 0;
+
+	if (metadump.show_progress)
+		print_progress("Copying realtime superblocks");
+
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		rtbno = xfs_rgbno_to_rtb(mp, rgno, 0);
+
+		push_cur();
+		error = set_rt_cur(&typtab[TYP_RTSB],
+				xfs_rtb_to_daddr(mp, rtbno),
+				XFS_FSB_TO_BB(mp, 1), DB_RING_ADD, NULL);
+		if (error)
+			return 0;
+		if (iocur_top->data == NULL) {
+			pop_cur();
+			print_warning("cannot read rt super %u", rgno);
+			return !metadump.stop_on_read_error;
+		}
+		error = write_buf(iocur_top);
+		pop_cur();
+		if (error)
+			return 0;
+	}
+
+	return 1;
+}
+
 static int
 write_metadump_v2(
 	enum typnm		type,
@@ -3136,6 +3174,8 @@ write_metadump_v2(
 	if (type == TYP_LOG &&
 	    mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
 		addr |= XME_ADDR_LOG_DEVICE;
+	else if (type == TYP_RTSB)
+		addr |= XME_ADDR_RT_DEVICE;
 	else
 		addr |= XME_ADDR_DATA_DEVICE;
 
@@ -3184,6 +3224,7 @@ metadump_f(
 	metadump.zero_stale_data = true;
 	metadump.dirty_log = false;
 	metadump.external_log = false;
+	metadump.realtime_data = false;
 
 	if (mp->m_sb.sb_magicnum != XFS_SB_MAGIC) {
 		print_warning("bad superblock magic number %x, giving up",
@@ -3262,6 +3303,20 @@ metadump_f(
 		return 1;
 	}
 
+	/* The realtime device only contains metadata if rtgroups is enabled. */
+	if (mp->m_rtdev_targp->bt_bdev && xfs_has_rtgroups(mp))
+		metadump.realtime_data = true;
+
+	if (metadump.realtime_data && !version_opt_set)
+		metadump.version = 2;
+
+	if (metadump.version == 2 && xfs_has_realtime(mp) &&
+	    xfs_has_rtgroups(mp) &&
+	    !metadump.realtime_data) {
+		print_warning("realtime device not loaded, use -R");
+		return 1;
+	}
+
 	/*
 	 * If we'll copy the log, see if the log is dirty.
 	 *
@@ -3365,6 +3420,12 @@ metadump_f(
 	if (!exitcode && !(metadump.version == 1 && metadump.external_log))
 		exitcode = !copy_log();
 
+	/* copy rt sueprblocks */
+	if (!exitcode && metadump.realtime_data) {
+		if (!copy_rtsupers())
+			exitcode = 1;
+	}
+
 	/* write the remaining index */
 	if (!exitcode && metadump.mdops->finish_dump)
 		exitcode = metadump.mdops->finish_dump() < 0;
diff --git a/db/xfs_metadump.sh b/db/xfs_metadump.sh
index 9e8f86e53eb..b5c6959f200 100755
--- a/db/xfs_metadump.sh
+++ b/db/xfs_metadump.sh
@@ -6,9 +6,9 @@
 
 OPTS=" "
 DBOPTS=" "
-USAGE="Usage: xfs_metadump [-aefFogwV] [-m max_extents] [-l logdev] source target"
+USAGE="Usage: xfs_metadump [-aefFogwV] [-m max_extents] [-l logdev] [-r rtdev] [-v version] source target"
 
-while getopts "aefgl:m:owFv:V" c
+while getopts "aefFgl:m:or:wv:V" c
 do
 	case $c in
 	a)	OPTS=$OPTS"-a ";;
@@ -25,6 +25,7 @@ do
 		status=$?
 		exit $status
 		;;
+	r)	DBOPTS=$DBOPTS"-R "$OPTARG" ";;
 	\?)	echo $USAGE 1>&2
 		exit 2
 		;;
diff --git a/include/xfs_metadump.h b/include/xfs_metadump.h
index e9c3dcb8f71..e35f791573e 100644
--- a/include/xfs_metadump.h
+++ b/include/xfs_metadump.h
@@ -68,12 +68,18 @@ struct xfs_metadump_header {
 /* Dump contains external log contents. */
 #define XFS_MD2_COMPAT_EXTERNALLOG	(1 << 3)
 
+/* Dump contains realtime device contents. */
+#define XFS_MD2_INCOMPAT_RTDEVICE	(1U << 0)
+
+#define XFS_MD2_INCOMPAT_ALL		(XFS_MD2_INCOMPAT_RTDEVICE)
+
 struct xfs_meta_extent {
 	/*
 	 * Lowest 54 bits are used to store 512 byte addresses.
 	 * Next 2 bits is used for indicating the device.
 	 * 00 - Data device
 	 * 01 - External log
+	 * 10 - Realtime device
 	 */
 	__be64 xme_addr;
 	/* In units of 512 byte blocks */
@@ -88,6 +94,8 @@ struct xfs_meta_extent {
 #define XME_ADDR_DATA_DEVICE	(0ULL << XME_ADDR_DEVICE_SHIFT)
 /* Extent was copied from the log device */
 #define XME_ADDR_LOG_DEVICE	(1ULL << XME_ADDR_DEVICE_SHIFT)
+/* Extent was copied from the rt device */
+#define XME_ADDR_RT_DEVICE	(2ULL << XME_ADDR_DEVICE_SHIFT)
 
 #define XME_ADDR_DEVICE_MASK	(3ULL << XME_ADDR_DEVICE_SHIFT)
 
diff --git a/man/man8/xfs_metadump.8 b/man/man8/xfs_metadump.8
index 496b5926603..8618ea99b9a 100644
--- a/man/man8/xfs_metadump.8
+++ b/man/man8/xfs_metadump.8
@@ -12,6 +12,9 @@ xfs_metadump \- copy XFS filesystem metadata to a file
 .B \-l
 .I logdev
 ] [
+.B \-r
+.I rtdev
+] [
 .B \-v
 .I version
 ]
@@ -146,6 +149,14 @@ this value.  The default size is 2097151 blocks.
 .B \-o
 Disables obfuscation of file names and extended attributes.
 .TP
+.BI \-r  " rtdev"
+For filesystems that have a realtime section, this specifies the device where
+the realtime section resides.
+If the v2 metadump format is selected, the realtime group superblocks will be
+copied to the metadump.
+The v2 metadump format will be selected automatically if the filesystem
+contains realtime groups.
+.TP
 .B \-v
 The format of the metadump file to be produced.
 Valid values are 1 and 2.
diff --git a/mdrestore/xfs_mdrestore.c b/mdrestore/xfs_mdrestore.c
index 6465a481ce3..e0572b31e2c 100644
--- a/mdrestore/xfs_mdrestore.c
+++ b/mdrestore/xfs_mdrestore.c
@@ -314,7 +314,7 @@ read_header_v2(
 			sizeof(h->v2) - sizeof(h->v2.xmh_magic), 1, md_fp) != 1)
 		fatal("error reading from metadump file\n");
 
-	if (h->v2.xmh_incompat_flags != 0)
+	if (h->v2.xmh_incompat_flags & cpu_to_be32(~XFS_MD2_INCOMPAT_ALL))
 		fatal("Metadump header has unknown incompat flags set\n");
 
 	if (h->v2.xmh_reserved != 0)
@@ -324,6 +324,9 @@ read_header_v2(
 
 	if (!mdrestore.external_log && (compat & XFS_MD2_COMPAT_EXTERNALLOG))
 		fatal("External Log device is required\n");
+
+	if (h->v2.xmh_incompat_flags & cpu_to_be32(XFS_MD2_INCOMPAT_RTDEVICE))
+		fatal("Realtime device not yet supported\n");
 }
 
 static void


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 37/52] xfs_db: dump rt bitmap blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (40 preceding siblings ...)
  2023-12-31 23:56   ` [PATCH 36/52] xfs_db: metadump realtime devices Darrick J. Wong
@ 2023-12-31 23:57   ` Darrick J. Wong
  2023-12-31 23:57   ` [PATCH 38/52] xfs_db: dump rt summary blocks Darrick J. Wong
                     ` (9 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:57 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that rtbitmap blocks have a header, make it so that xfs_db can
analyze the structure.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/bit.c           |   24 +++++++++++++++++++-----
 db/bit.h           |    1 +
 db/field.c         |    5 +++++
 db/field.h         |    4 ++++
 db/fprint.c        |   11 +++++++++--
 db/inode.c         |    6 ++++--
 db/rtgroup.c       |   34 ++++++++++++++++++++++++++++++++++
 db/rtgroup.h       |    3 +++
 db/type.c          |    5 +++++
 db/type.h          |    1 +
 include/xfs_arch.h |    6 ++++++
 11 files changed, 91 insertions(+), 9 deletions(-)


diff --git a/db/bit.c b/db/bit.c
index c9bfd2eb025..84f46290b5a 100644
--- a/db/bit.c
+++ b/db/bit.c
@@ -55,6 +55,7 @@ getbitval(
 	char		*p;
 	int64_t		rval;
 	int		signext;
+	bool		is_le = (flags & BV_LE);
 	int		z1, z2, z3, z4;
 
 	ASSERT(nbits<=64);
@@ -63,21 +64,34 @@ getbitval(
 	bit = bitoffs(bitoff);
 	signext = (flags & BVSIGNED) != 0;
 	z4 = ((intptr_t)p & 0xf) == 0 && bit == 0;
-	if (nbits == 64 && z4)
+	if (nbits == 64 && z4) {
+		if (is_le)
+			return le64_to_cpu(*(__be64 *)p);
 		return be64_to_cpu(*(__be64 *)p);
+	}
 	z3 = ((intptr_t)p & 0x7) == 0 && bit == 0;
 	if (nbits == 32 && z3) {
-		if (signext)
+		if (signext) {
+			if (is_le)
+				return (__s32)le32_to_cpu(*(__le32 *)p);
 			return (__s32)be32_to_cpu(*(__be32 *)p);
-		else
+		} else {
+			if (is_le)
+				return (__u32)le32_to_cpu(*(__le32 *)p);
 			return (__u32)be32_to_cpu(*(__be32 *)p);
+		}
 	}
 	z2 = ((intptr_t)p & 0x3) == 0 && bit == 0;
 	if (nbits == 16 && z2) {
-		if (signext)
+		if (signext) {
+			if (is_le)
+				return (__s16)le16_to_cpu(*(__le16 *)p);
 			return (__s16)be16_to_cpu(*(__be16 *)p);
-		else
+		} else {
+			if (is_le)
+				return (__u16)le16_to_cpu(*(__le16 *)p);
 			return (__u16)be16_to_cpu(*(__be16 *)p);
+		}
 	}
 	z1 = ((intptr_t)p & 0x1) == 0 && bit == 0;
 	if (nbits == 8 && z1) {
diff --git a/db/bit.h b/db/bit.h
index 4df86030abc..912283a7348 100644
--- a/db/bit.h
+++ b/db/bit.h
@@ -13,6 +13,7 @@
 
 #define	BVUNSIGNED	0
 #define	BVSIGNED	1
+#define	BV_LE		(1U << 1) /* little endian */
 
 extern int64_t		getbitval(void *obj, int bitoff, int nbits, int flags);
 extern void		setbitval(void *obuf, int bitoff, int nbits, void *ibuf);
diff --git a/db/field.c b/db/field.c
index cee5c661595..7dee8c3735c 100644
--- a/db/field.c
+++ b/db/field.c
@@ -392,6 +392,11 @@ const ftattr_t	ftattrtab[] = {
 	{ FLDT_UINT8X, "uint8x", fp_num, "%#x", SI(bitsz(uint8_t)), 0, NULL,
 	  NULL },
 	{ FLDT_UUID, "uuid", fp_uuid, NULL, SI(bitsz(uuid_t)), 0, NULL, NULL },
+
+	{ FLDT_RTWORD, "rtword", fp_num, "%#x", SI(bitsz(xfs_rtword_t)),
+	  FTARG_LE, NULL, NULL },
+	{ FLDT_RGBITMAP, "rgbitmap", NULL, (char *)rgbitmap_flds, btblock_size,
+	  FTARG_SIZE, NULL, rgbitmap_flds },
 	{ FLDT_ZZZ, NULL }
 };
 
diff --git a/db/field.h b/db/field.h
index 226753490ad..ce7e7297afa 100644
--- a/db/field.h
+++ b/db/field.h
@@ -191,6 +191,9 @@ typedef enum fldt	{
 	FLDT_UINT8O,
 	FLDT_UINT8X,
 	FLDT_UUID,
+
+	FLDT_RTWORD,
+	FLDT_RGBITMAP,
 	FLDT_ZZZ			/* mark last entry */
 } fldt_t;
 
@@ -246,6 +249,7 @@ extern const ftattr_t	ftattrtab[];
 #define	FTARG_SIZE	16	/* size field is a function */
 #define	FTARG_SKIPNMS	32	/* skip printing names this time */
 #define	FTARG_OKEMPTY	64	/* ok if this (union type) is empty */
+#define FTARG_LE	(1U << 7) /* little endian */
 
 extern int		bitoffset(const field_t *f, void *obj, int startoff,
 				  int idx);
diff --git a/db/fprint.c b/db/fprint.c
index 65accfda3fe..ac916d511e8 100644
--- a/db/fprint.c
+++ b/db/fprint.c
@@ -68,13 +68,20 @@ fp_num(
 	int		bitpos;
 	int		i;
 	int		isnull;
+	int		bvflags = 0;
 	int64_t		val;
 
+	if (arg & FTARG_LE)
+		bvflags |= BV_LE;
+	if (arg & FTARG_SIGNED)
+		bvflags |= BVSIGNED;
+	else
+		bvflags |= BVUNSIGNED;
+
 	for (i = 0, bitpos = bit;
 	     i < count && !seenint();
 	     i++, bitpos += size) {
-		val = getbitval(obj, bitpos, size,
-			(arg & FTARG_SIGNED) ? BVSIGNED : BVUNSIGNED);
+		val = getbitval(obj, bitpos, size, bvflags);
 		if ((arg & FTARG_SKIPZERO) && val == 0)
 			continue;
 		isnull = (arg & FTARG_SIGNED) || size == 64 ?
diff --git a/db/inode.c b/db/inode.c
index 4e2be6a1156..5510b2cb663 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -642,9 +642,11 @@ inode_next_type(void)
 	case S_IFLNK:
 		return TYP_SYMLINK;
 	case S_IFREG:
-		if (iocur_top->ino == mp->m_sb.sb_rbmino)
+		if (iocur_top->ino == mp->m_sb.sb_rbmino) {
+			if (xfs_has_rtgroups(mp))
+				return TYP_RGBITMAP;
 			return TYP_RTBITMAP;
-		else if (iocur_top->ino == mp->m_sb.sb_rsumino)
+		} else if (iocur_top->ino == mp->m_sb.sb_rsumino)
 			return TYP_RTSUMMARY;
 		else if (iocur_top->ino == mp->m_sb.sb_uquotino ||
 			 iocur_top->ino == mp->m_sb.sb_gquotino ||
diff --git a/db/rtgroup.c b/db/rtgroup.c
index 4a719215d6a..a6cf98de0b8 100644
--- a/db/rtgroup.c
+++ b/db/rtgroup.c
@@ -54,6 +54,7 @@ const field_t	rtsb_flds[] = {
 	{ "meta_uuid", FLDT_UUID, OI(OFF(meta_uuid)), C1, 0, TYP_NONE },
 	{ NULL }
 };
+#undef OFF
 
 const field_t	rtsb_hfld[] = {
 	{ "", FLDT_RTSB, OI(0), C1, 0, TYP_NONE },
@@ -113,3 +114,36 @@ rtsb_size(
 {
 	return bitize(mp->m_sb.sb_blocksize);
 }
+
+static int
+rtwords_count(
+	void			*obj,
+	int			startoff)
+{
+	unsigned int		blksz = mp->m_sb.sb_blocksize;
+
+	if (xfs_has_rtgroups(mp))
+		blksz -= sizeof(struct xfs_rtbuf_blkinfo);
+
+	return blksz >> XFS_WORDLOG;
+}
+
+#define	OFF(f)	bitize(offsetof(struct xfs_rtbuf_blkinfo, rt_ ## f))
+const field_t	rgbitmap_flds[] = {
+	{ "magicnum", FLDT_UINT32X, OI(OFF(magic)), C1, 0, TYP_NONE },
+	{ "crc", FLDT_CRC, OI(OFF(crc)), C1, 0, TYP_NONE },
+	{ "owner", FLDT_INO, OI(OFF(owner)), C1, 0, TYP_NONE },
+	{ "bno", FLDT_DFSBNO, OI(OFF(blkno)), C1, 0, TYP_BMAPBTD },
+	{ "lsn", FLDT_UINT64X, OI(OFF(lsn)), C1, 0, TYP_NONE },
+	{ "uuid", FLDT_UUID, OI(OFF(uuid)), C1, 0, TYP_NONE },
+	/* the rtword array is after the actual structure */
+	{ "rtwords", FLDT_RTWORD, OI(bitize(sizeof(struct xfs_rtbuf_blkinfo))),
+	  rtwords_count, FLD_ARRAY | FLD_COUNT, TYP_DATA },
+	{ NULL }
+};
+#undef OFF
+
+const field_t	rgbitmap_hfld[] = {
+	{ "", FLDT_RGBITMAP, OI(0), C1, 0, TYP_NONE },
+	{ NULL }
+};
diff --git a/db/rtgroup.h b/db/rtgroup.h
index 85960a3fb9f..06f554e1862 100644
--- a/db/rtgroup.h
+++ b/db/rtgroup.h
@@ -9,6 +9,9 @@
 extern const struct field	rtsb_flds[];
 extern const struct field	rtsb_hfld[];
 
+extern const struct field	rgbitmap_flds[];
+extern const struct field	rgbitmap_hfld[];
+
 extern void	rtsb_init(void);
 extern int	rtsb_size(void *obj, int startoff, int idx);
 
diff --git a/db/type.c b/db/type.c
index d875c0c6365..65e7b24146f 100644
--- a/db/type.c
+++ b/db/type.c
@@ -67,6 +67,7 @@ static const typ_t	__typtab[] = {
 	{ TYP_TEXT, "text", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_FINOBT, "finobt", handle_struct, finobt_hfld, NULL,
 		TYP_F_NO_CRC_OFF },
+	{ TYP_RGBITMAP, NULL },
 	{ TYP_NONE, NULL }
 };
 
@@ -113,6 +114,8 @@ static const typ_t	__typtab_crc[] = {
 	{ TYP_TEXT, "text", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_FINOBT, "finobt", handle_struct, finobt_crc_hfld,
 		&xfs_finobt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RGBITMAP, "rgbitmap", handle_struct, rgbitmap_hfld,
+		&xfs_rtbitmap_buf_ops, XFS_RTBUF_CRC_OFF },
 	{ TYP_NONE, NULL }
 };
 
@@ -159,6 +162,8 @@ static const typ_t	__typtab_spcrc[] = {
 	{ TYP_TEXT, "text", handle_text, NULL, NULL, TYP_F_NO_CRC_OFF },
 	{ TYP_FINOBT, "finobt", handle_struct, finobt_spcrc_hfld,
 		&xfs_finobt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
+	{ TYP_RGBITMAP, "rgbitmap", handle_struct, rgbitmap_hfld,
+		&xfs_rtbitmap_buf_ops, XFS_RTBUF_CRC_OFF },
 	{ TYP_NONE, NULL }
 };
 
diff --git a/db/type.h b/db/type.h
index d4efa4b0fab..e2148c6351d 100644
--- a/db/type.h
+++ b/db/type.h
@@ -35,6 +35,7 @@ typedef enum typnm
 	TYP_SYMLINK,
 	TYP_TEXT,
 	TYP_FINOBT,
+	TYP_RGBITMAP,
 	TYP_NONE
 } typnm_t;
 
diff --git a/include/xfs_arch.h b/include/xfs_arch.h
index d46ae47094a..6312e62b0a1 100644
--- a/include/xfs_arch.h
+++ b/include/xfs_arch.h
@@ -200,6 +200,9 @@ static __inline__ void __swab64s(__u64 *addr)
 	((__force __le32)___constant_swab32((__u32)(val)))
 #define __constant_cpu_to_be32(val)	\
 	((__force __be32)(__u32)(val))
+
+#define le64_to_cpu(val)	(__swab64((__force __u64)(__le64)(val)))
+#define le16_to_cpu(val)	(__swab16((__force __u16)(__le16)(val)))
 #else
 #define cpu_to_be16(val)	((__force __be16)__swab16((__u16)(val)))
 #define cpu_to_be32(val)	((__force __be32)__swab32((__u32)(val)))
@@ -215,6 +218,9 @@ static __inline__ void __swab64s(__u64 *addr)
 	((__force __le32)(__u32)(val))
 #define __constant_cpu_to_be32(val)	\
 	((__force __be32)___constant_swab32((__u32)(val)))
+
+#define le64_to_cpu(val)	((__force __u64)(__le64)(val))
+#define le16_to_cpu(val)	((__force __u16)(__le16)(val))
 #endif
 
 static inline void be16_add_cpu(__be16 *a, __s16 b)


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 38/52] xfs_db: dump rt summary blocks
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (41 preceding siblings ...)
  2023-12-31 23:57   ` [PATCH 37/52] xfs_db: dump rt bitmap blocks Darrick J. Wong
@ 2023-12-31 23:57   ` Darrick J. Wong
  2023-12-31 23:57   ` [PATCH 39/52] xfs_mdrestore: restore rt group superblocks to realtime device Darrick J. Wong
                     ` (8 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:57 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Now that rtsummary blocks have a header, make it so that xfs_db can
analyze the structure.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/field.c   |    5 +++++
 db/field.h   |    3 +++
 db/inode.c   |    5 ++++-
 db/rtgroup.c |   20 ++++++++++++++++++++
 db/rtgroup.h |    3 +++
 db/type.c    |    5 +++++
 db/type.h    |    1 +
 7 files changed, 41 insertions(+), 1 deletion(-)


diff --git a/db/field.c b/db/field.c
index 7dee8c3735c..4a6a4cf51c3 100644
--- a/db/field.c
+++ b/db/field.c
@@ -397,6 +397,11 @@ const ftattr_t	ftattrtab[] = {
 	  FTARG_LE, NULL, NULL },
 	{ FLDT_RGBITMAP, "rgbitmap", NULL, (char *)rgbitmap_flds, btblock_size,
 	  FTARG_SIZE, NULL, rgbitmap_flds },
+	{ FLDT_SUMINFO, "suminfo", fp_num, "%u", SI(bitsz(xfs_suminfo_t)),
+	  0, NULL, NULL },
+	{ FLDT_RGSUMMARY, "rgsummary", NULL, (char *)rgsummary_flds,
+	  btblock_size, FTARG_SIZE, NULL, rgsummary_flds },
+
 	{ FLDT_ZZZ, NULL }
 };
 
diff --git a/db/field.h b/db/field.h
index ce7e7297afa..e9c6142f282 100644
--- a/db/field.h
+++ b/db/field.h
@@ -194,6 +194,9 @@ typedef enum fldt	{
 
 	FLDT_RTWORD,
 	FLDT_RGBITMAP,
+	FLDT_SUMINFO,
+	FLDT_RGSUMMARY,
+
 	FLDT_ZZZ			/* mark last entry */
 } fldt_t;
 
diff --git a/db/inode.c b/db/inode.c
index 5510b2cb663..16033c5ab79 100644
--- a/db/inode.c
+++ b/db/inode.c
@@ -646,8 +646,11 @@ inode_next_type(void)
 			if (xfs_has_rtgroups(mp))
 				return TYP_RGBITMAP;
 			return TYP_RTBITMAP;
-		} else if (iocur_top->ino == mp->m_sb.sb_rsumino)
+		} else if (iocur_top->ino == mp->m_sb.sb_rsumino) {
+			if (xfs_has_rtgroups(mp))
+				return TYP_RGSUMMARY;
 			return TYP_RTSUMMARY;
+		}
 		else if (iocur_top->ino == mp->m_sb.sb_uquotino ||
 			 iocur_top->ino == mp->m_sb.sb_gquotino ||
 			 iocur_top->ino == mp->m_sb.sb_pquotino)
diff --git a/db/rtgroup.c b/db/rtgroup.c
index a6cf98de0b8..52d7c0cd378 100644
--- a/db/rtgroup.c
+++ b/db/rtgroup.c
@@ -147,3 +147,23 @@ const field_t	rgbitmap_hfld[] = {
 	{ "", FLDT_RGBITMAP, OI(0), C1, 0, TYP_NONE },
 	{ NULL }
 };
+
+#define	OFF(f)	bitize(offsetof(struct xfs_rtbuf_blkinfo, rt_ ## f))
+const field_t	rgsummary_flds[] = {
+	{ "magicnum", FLDT_UINT32X, OI(OFF(magic)), C1, 0, TYP_NONE },
+	{ "crc", FLDT_CRC, OI(OFF(crc)), C1, 0, TYP_NONE },
+	{ "owner", FLDT_INO, OI(OFF(owner)), C1, 0, TYP_NONE },
+	{ "bno", FLDT_DFSBNO, OI(OFF(blkno)), C1, 0, TYP_BMAPBTD },
+	{ "lsn", FLDT_UINT64X, OI(OFF(lsn)), C1, 0, TYP_NONE },
+	{ "uuid", FLDT_UUID, OI(OFF(uuid)), C1, 0, TYP_NONE },
+	/* the suminfo array is after the actual structure */
+	{ "suminfo", FLDT_SUMINFO, OI(bitize(sizeof(struct xfs_rtbuf_blkinfo))),
+	  rtwords_count, FLD_ARRAY | FLD_COUNT, TYP_DATA },
+	{ NULL }
+};
+#undef OFF
+
+const field_t	rgsummary_hfld[] = {
+	{ "", FLDT_RGSUMMARY, OI(0), C1, 0, TYP_NONE },
+	{ NULL }
+};
diff --git a/db/rtgroup.h b/db/rtgroup.h
index 06f554e1862..5b120f2c9a2 100644
--- a/db/rtgroup.h
+++ b/db/rtgroup.h
@@ -12,6 +12,9 @@ extern const struct field	rtsb_hfld[];
 extern const struct field	rgbitmap_flds[];
 extern const struct field	rgbitmap_hfld[];
 
+extern const struct field	rgsummary_flds[];
+extern const struct field	rgsummary_hfld[];
+
 extern void	rtsb_init(void);
 extern int	rtsb_size(void *obj, int startoff, int idx);
 
diff --git a/db/type.c b/db/type.c
index 65e7b24146f..2091b4ac8b1 100644
--- a/db/type.c
+++ b/db/type.c
@@ -68,6 +68,7 @@ static const typ_t	__typtab[] = {
 	{ TYP_FINOBT, "finobt", handle_struct, finobt_hfld, NULL,
 		TYP_F_NO_CRC_OFF },
 	{ TYP_RGBITMAP, NULL },
+	{ TYP_RGSUMMARY, NULL },
 	{ TYP_NONE, NULL }
 };
 
@@ -116,6 +117,8 @@ static const typ_t	__typtab_crc[] = {
 		&xfs_finobt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_RGBITMAP, "rgbitmap", handle_struct, rgbitmap_hfld,
 		&xfs_rtbitmap_buf_ops, XFS_RTBUF_CRC_OFF },
+	{ TYP_RGSUMMARY, "rgsummary", handle_struct, rgsummary_hfld,
+		&xfs_rtsummary_buf_ops, XFS_RTBUF_CRC_OFF },
 	{ TYP_NONE, NULL }
 };
 
@@ -164,6 +167,8 @@ static const typ_t	__typtab_spcrc[] = {
 		&xfs_finobt_buf_ops, XFS_BTREE_SBLOCK_CRC_OFF },
 	{ TYP_RGBITMAP, "rgbitmap", handle_struct, rgbitmap_hfld,
 		&xfs_rtbitmap_buf_ops, XFS_RTBUF_CRC_OFF },
+	{ TYP_RGSUMMARY, "rgsummary", handle_struct, rgsummary_hfld,
+		&xfs_rtsummary_buf_ops, XFS_RTBUF_CRC_OFF },
 	{ TYP_NONE, NULL }
 };
 
diff --git a/db/type.h b/db/type.h
index e2148c6351d..e7f0ecc1768 100644
--- a/db/type.h
+++ b/db/type.h
@@ -36,6 +36,7 @@ typedef enum typnm
 	TYP_TEXT,
 	TYP_FINOBT,
 	TYP_RGBITMAP,
+	TYP_RGSUMMARY,
 	TYP_NONE
 } typnm_t;
 


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 39/52] xfs_mdrestore: restore rt group superblocks to realtime device
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (42 preceding siblings ...)
  2023-12-31 23:57   ` [PATCH 38/52] xfs_db: dump rt summary blocks Darrick J. Wong
@ 2023-12-31 23:57   ` Darrick J. Wong
  2023-12-31 23:57   ` [PATCH 40/52] xfs_io: support scrubbing rtgroup metadata Darrick J. Wong
                     ` (7 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:57 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support restoring realtime device metadata to the realtime device, if
the dumped filesystem had one.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 man/man8/xfs_mdrestore.8  |   10 +++++++++
 mdrestore/xfs_mdrestore.c |   52 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 11 deletions(-)


diff --git a/man/man8/xfs_mdrestore.8 b/man/man8/xfs_mdrestore.8
index f60e7b56ebf..6f6e14e96c6 100644
--- a/man/man8/xfs_mdrestore.8
+++ b/man/man8/xfs_mdrestore.8
@@ -8,6 +8,9 @@ xfs_mdrestore \- restores an XFS metadump image to a filesystem image
 ] [
 .B \-l
 .I logdev
+] [
+.B \-r
+.I rtdev
 ]
 .I source
 .I target
@@ -17,6 +20,9 @@ xfs_mdrestore \- restores an XFS metadump image to a filesystem image
 [
 .B \-l
 .I logdev
+] [
+.B \-r
+.I rtdev
 ]
 .I source
 .br
@@ -61,6 +67,10 @@ Metadump in v2 format can contain metadata dumped from an external log.
 In such a scenario, the user has to provide a device to which the log device
 contents from the metadump file are copied.
 .TP
+.BI \-r " rtdev"
+Restore realtime device metadata to this device.
+This is only required for a metadump in v2 format.
+.TP
 .B \-V
 Prints the version number and exits.
 .SH DIAGNOSTICS
diff --git a/mdrestore/xfs_mdrestore.c b/mdrestore/xfs_mdrestore.c
index e0572b31e2c..07dcc21cc45 100644
--- a/mdrestore/xfs_mdrestore.c
+++ b/mdrestore/xfs_mdrestore.c
@@ -19,8 +19,9 @@ struct mdrestore_ops {
 	void (*read_header)(union mdrestore_headers *header, FILE *md_fp);
 	void (*show_info)(union mdrestore_headers *header, const char *md_file);
 	void (*restore)(union mdrestore_headers *header, FILE *md_fp,
-			int ddev_fd, bool is_data_target_file, int logdev_fd,
-			bool is_log_target_file);
+			int ddev_fd, bool is_data_target_file,
+			int logdev_fd, bool is_log_target_file,
+			int rtdev_fd, bool is_rt_target_file);
 };
 
 static struct mdrestore {
@@ -29,6 +30,7 @@ static struct mdrestore {
 	bool			show_info;
 	bool			progress_since_warning;
 	bool			external_log;
+	bool			realtime_data;
 } mdrestore;
 
 static void
@@ -200,7 +202,9 @@ restore_v1(
 	int			ddev_fd,
 	bool			is_data_target_file,
 	int			logdev_fd,
-	bool			is_log_target_file)
+	bool			is_log_target_file,
+	int			rtdev_fd,
+	bool			is_rt_target_file)
 {
 	struct xfs_metablock	*metablock;	/* header + index + blocks */
 	__be64			*block_index;
@@ -325,8 +329,9 @@ read_header_v2(
 	if (!mdrestore.external_log && (compat & XFS_MD2_COMPAT_EXTERNALLOG))
 		fatal("External Log device is required\n");
 
-	if (h->v2.xmh_incompat_flags & cpu_to_be32(XFS_MD2_INCOMPAT_RTDEVICE))
-		fatal("Realtime device not yet supported\n");
+	if ((h->v2.xmh_incompat_flags & cpu_to_be32(XFS_MD2_INCOMPAT_RTDEVICE)) &&
+	    !mdrestore.realtime_data)
+		fatal("Realtime device is required\n");
 }
 
 static void
@@ -335,14 +340,17 @@ show_info_v2(
 	const char		*md_file)
 {
 	uint32_t		compat_flags;
+	uint32_t		incompat_flags;
 
 	compat_flags = be32_to_cpu(h->v2.xmh_compat_flags);
+	incompat_flags = be32_to_cpu(h->v2.xmh_incompat_flags);
 
-	printf("%s: %sobfuscated, %s log, external log contents are %sdumped, %s metadata blocks,\n",
+	printf("%s: %sobfuscated, %s log, external log contents are %sdumped, rt device contents are %sdumped, %s metadata blocks,\n",
 		md_file,
 		compat_flags & XFS_MD2_COMPAT_OBFUSCATED ? "":"not ",
 		compat_flags & XFS_MD2_COMPAT_DIRTYLOG ? "dirty":"clean",
 		compat_flags & XFS_MD2_COMPAT_EXTERNALLOG ? "":"not ",
+		incompat_flags & XFS_MD2_INCOMPAT_RTDEVICE ? "":"not ",
 		compat_flags & XFS_MD2_COMPAT_FULLBLOCKS ? "full":"zeroed");
 }
 
@@ -381,7 +389,9 @@ restore_v2(
 	int			ddev_fd,
 	bool			is_data_target_file,
 	int			logdev_fd,
-	bool			is_log_target_file)
+	bool			is_log_target_file,
+	int			rtdev_fd,
+	bool			is_rt_target_file)
 {
 	struct xfs_sb		sb;
 	struct xfs_meta_extent	xme;
@@ -452,6 +462,10 @@ restore_v2(
 			device = "log";
 			fd = logdev_fd;
 			break;
+		case XME_ADDR_RT_DEVICE:
+			device = "rt";
+			fd = rtdev_fd;
+			break;
 		default:
 			fatal("Invalid device found in metadump\n");
 			break;
@@ -481,7 +495,7 @@ static struct mdrestore_ops mdrestore_ops_v2 = {
 static void
 usage(void)
 {
-	fprintf(stderr, "Usage: %s [-V] [-g] [-i] [-l logdev] source target\n",
+	fprintf(stderr, "Usage: %s [-V] [-g] [-i] [-l logdev] [-r rtdev] source target\n",
 		progname);
 	exit(1);
 }
@@ -494,20 +508,24 @@ main(
 	union mdrestore_headers	headers;
 	FILE			*src_f;
 	char			*logdev = NULL;
+	char			*rtdev = NULL;
 	int			data_dev_fd = -1;
 	int			log_dev_fd = -1;
+	int			rt_dev_fd = -1;
 	int			c;
 	bool			is_data_dev_file = false;
 	bool			is_log_dev_file = false;
+	bool			is_rt_dev_file = false;
 
 	mdrestore.show_progress = false;
 	mdrestore.show_info = false;
 	mdrestore.progress_since_warning = false;
 	mdrestore.external_log = false;
+	mdrestore.realtime_data = false;
 
 	progname = basename(argv[0]);
 
-	while ((c = getopt(argc, argv, "gil:V")) != EOF) {
+	while ((c = getopt(argc, argv, "gil:r:V")) != EOF) {
 		switch (c) {
 			case 'g':
 				mdrestore.show_progress = true;
@@ -519,6 +537,10 @@ main(
 				logdev = optarg;
 				mdrestore.external_log = true;
 				break;
+			case 'r':
+				rtdev = optarg;
+				mdrestore.realtime_data = true;
+				break;
 			case 'V':
 				printf("%s version %s\n", progname, VERSION);
 				exit(0);
@@ -587,12 +609,20 @@ main(
 		/* check and open log device */
 		log_dev_fd = open_device(logdev, &is_log_dev_file);
 
-	mdrestore.mdrops->restore(&headers, src_f, data_dev_fd,
-			is_data_dev_file, log_dev_fd, is_log_dev_file);
+	if (mdrestore.realtime_data)
+		/* check and open realtime device */
+		rt_dev_fd = open_device(rtdev, &is_rt_dev_file);
+
+	mdrestore.mdrops->restore(&headers, src_f,
+			data_dev_fd, is_data_dev_file,
+			log_dev_fd, is_log_dev_file,
+			rt_dev_fd, is_rt_dev_file);
 
 	close(data_dev_fd);
 	if (mdrestore.external_log)
 		close(log_dev_fd);
+	if (mdrestore.realtime_data)
+		close(rt_dev_fd);
 
 	if (src_f != stdin)
 		fclose(src_f);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 40/52] xfs_io: support scrubbing rtgroup metadata
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (43 preceding siblings ...)
  2023-12-31 23:57   ` [PATCH 39/52] xfs_mdrestore: restore rt group superblocks to realtime device Darrick J. Wong
@ 2023-12-31 23:57   ` Darrick J. Wong
  2023-12-31 23:58   ` [PATCH 41/52] xfs_io: add a command to display allocation group information Darrick J. Wong
                     ` (6 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:57 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Support scrubbing all rtgroup metadata with a scrubv call.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/scrub.c        |   40 ++++++++++++++++++++++++++++++++++++++++
 man/man8/xfs_io.8 |    3 ++-
 2 files changed, 42 insertions(+), 1 deletion(-)


diff --git a/io/scrub.c b/io/scrub.c
index cb4e24503dc..e9254c7882a 100644
--- a/io/scrub.c
+++ b/io/scrub.c
@@ -165,6 +165,32 @@ parse_metapath(
 	return true;
 }
 
+static bool
+parse_rtgroup(
+	int		argc,
+	char		**argv,
+	int		optind,
+	__u32		*rgno)
+{
+	char		*p;
+	unsigned long	control;
+
+	if (optind != argc - 1) {
+		fprintf(stderr, _("Must specify one rtgroup number.\n"));
+		return false;
+	}
+
+	control = strtoul(argv[optind], &p, 0);
+	if (*p != '\0') {
+		fprintf(stderr, _("Bad rtgroup number '%s'.\n"),
+				argv[optind]);
+		return false;
+	}
+
+	*rgno = control;
+	return true;
+}
+
 static int
 parse_args(
 	int				argc,
@@ -230,6 +256,12 @@ parse_args(
 			return command_usage(cmdinfo);
 		}
 		break;
+	case XFROG_SCRUB_GROUP_RTGROUP:
+		if (!parse_rtgroup(argc, argv, optind, &meta->sm_agno)) {
+			exitcode = 1;
+			return command_usage(cmdinfo);
+		}
+		break;
 	case XFROG_SCRUB_GROUP_FS:
 	case XFROG_SCRUB_GROUP_NONE:
 	case XFROG_SCRUB_GROUP_SUMMARY:
@@ -544,6 +576,8 @@ scrubv_f(
 		group = XFROG_SCRUB_GROUP_ISCAN;
 	else if (!strcmp(argv[optind], "summary"))
 		group = XFROG_SCRUB_GROUP_SUMMARY;
+	else if (!strcmp(argv[optind], "rtgroup"))
+		group = XFROG_SCRUB_GROUP_RTGROUP;
 	else {
 		printf(_("Unknown group '%s'.\n"), argv[optind]);
 		exitcode = 1;
@@ -581,6 +615,12 @@ scrubv_f(
 			return command_usage(&scrubv_cmd);
 		}
 		break;
+	case XFROG_SCRUB_GROUP_RTGROUP:
+		if (!parse_rtgroup(argc, argv, optind, &vhead->svh_agno)) {
+			exitcode = 1;
+			return command_usage(&scrubv_cmd);
+		}
+		break;
 	default:
 		ASSERT(0);
 		break;
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index ef3a5681a1a..61eed7f5aa8 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1422,11 +1422,12 @@ Currently supported versions are 1 and 5.
 .RE
 .PD
 .TP
-.BI "scrub " type " [ " agnumber " | " "ino" " " "gen" " | " metapath " ]"
+.BI "scrub " type " [ " agnumber " | " rgnumber " | " "ino" " " "gen" " | " metapath " ]"
 Scrub internal XFS filesystem metadata.  The
 .BI type
 parameter specifies which type of metadata to scrub.
 For AG metadata, one AG number must be specified.
+For realtime group metadata, one rtgroup number must be specified.
 For file metadata, the scrub is applied to the open file unless the
 inode number and generation number are specified.
 For metapath, the name of a file or a raw number must be specified.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 41/52] xfs_io: add a command to display allocation group information
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (44 preceding siblings ...)
  2023-12-31 23:57   ` [PATCH 40/52] xfs_io: support scrubbing rtgroup metadata Darrick J. Wong
@ 2023-12-31 23:58   ` Darrick J. Wong
  2023-12-31 23:58   ` [PATCH 42/52] xfs_io: add a command to display realtime " Darrick J. Wong
                     ` (5 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:58 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new 'aginfo' command to xfs_io so that we can display allocation
group geometry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/Makefile       |    2 -
 io/aginfo.c       |  119 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 io/init.c         |    1 
 io/io.h           |    1 
 man/man8/xfs_io.8 |   12 +++++
 5 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 io/aginfo.c


diff --git a/io/Makefile b/io/Makefile
index 1be6ab77d87..32a40294358 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -8,7 +8,7 @@ include $(TOPDIR)/include/builddefs
 LTCOMMAND = xfs_io
 LSRCFILES = xfs_bmap.sh xfs_freeze.sh xfs_mkfile.sh
 HFILES = init.h io.h
-CFILES = init.c \
+CFILES = init.c aginfo.c \
 	attr.c bmap.c bulkstat.c crc32cselftest.c cowextsize.c encrypt.c \
 	file.c freeze.c fsuuid.c fsync.c getrusage.c imap.c inject.c label.c \
 	link.c mmap.c open.c parent.c pread.c prealloc.c pwrite.c reflink.c \
diff --git a/io/aginfo.c b/io/aginfo.c
new file mode 100644
index 00000000000..6cbfcb8de35
--- /dev/null
+++ b/io/aginfo.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All rights reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "platform_defs.h"
+#include "libxfs.h"
+#include "command.h"
+#include "input.h"
+#include "init.h"
+#include "io.h"
+#include "libfrog/logging.h"
+#include "libfrog/paths.h"
+#include "libfrog/fsgeom.h"
+
+static cmdinfo_t aginfo_cmd;
+
+static int
+report_aginfo(
+	struct xfs_fd		*xfd,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_ag_geometry	ageo = { 0 };
+	int			ret;
+
+	ret = -xfrog_ag_geometry(xfd->fd, agno, &ageo);
+	if (ret) {
+		xfrog_perror(ret, "aginfo");
+		return 1;
+	}
+
+	printf(_("AG: %u\n"),		ageo.ag_number);
+	printf(_("Blocks: %u\n"),	ageo.ag_length);
+	printf(_("Free Blocks: %u\n"),	ageo.ag_freeblks);
+	printf(_("Inodes: %u\n"),	ageo.ag_icount);
+	printf(_("Free Inodes: %u\n"),	ageo.ag_ifree);
+	printf(_("Sick: 0x%x\n"),	ageo.ag_sick);
+	printf(_("Checked: 0x%x\n"),	ageo.ag_checked);
+	printf(_("Flags: 0x%x\n"),	ageo.ag_flags);
+
+	return 0;
+}
+
+/* Display AG status. */
+static int
+aginfo_f(
+	int			argc,
+	char			**argv)
+{
+	struct xfs_fd		xfd = XFS_FD_INIT(file->fd);
+	unsigned long long	x;
+	xfs_agnumber_t		agno = NULLAGNUMBER;
+	int			c;
+	int			ret = 0;
+
+	ret = -xfd_prepare_geometry(&xfd);
+	if (ret) {
+		xfrog_perror(ret, "xfd_prepare_geometry");
+		exitcode = 1;
+		return 1;
+	}
+
+	while ((c = getopt(argc, argv, "a:")) != EOF) {
+		switch (c) {
+		case 'a':
+			errno = 0;
+			x = strtoll(optarg, NULL, 10);
+			if (!errno && x >= NULLAGNUMBER)
+				errno = ERANGE;
+			if (errno) {
+				perror("aginfo");
+				return 1;
+			}
+			agno = x;
+			break;
+		default:
+			return command_usage(&aginfo_cmd);
+		}
+	}
+
+	if (agno != NULLAGNUMBER) {
+		ret = report_aginfo(&xfd, agno);
+	} else {
+		for (agno = 0; !ret && agno < xfd.fsgeom.agcount; agno++) {
+			ret = report_aginfo(&xfd, agno);
+		}
+	}
+
+	return ret;
+}
+
+static void
+aginfo_help(void)
+{
+	printf(_(
+"\n"
+"Report allocation group geometry.\n"
+"\n"
+" -a agno  -- Report on the given allocation group.\n"
+"\n"));
+
+}
+
+static cmdinfo_t aginfo_cmd = {
+	.name = "aginfo",
+	.cfunc = aginfo_f,
+	.argmin = 0,
+	.argmax = -1,
+	.args = "[-a agno]",
+	.flags = CMD_NOMAP_OK,
+	.help = aginfo_help,
+};
+
+void
+aginfo_init(void)
+{
+	aginfo_cmd.oneline = _("Get XFS allocation group state.");
+	add_command(&aginfo_cmd);
+}
diff --git a/io/init.c b/io/init.c
index a6c3d0cf147..dee09b56333 100644
--- a/io/init.c
+++ b/io/init.c
@@ -44,6 +44,7 @@ init_cvtnum(
 static void
 init_commands(void)
 {
+	aginfo_init();
 	atomicupdate_init();
 	attr_init();
 	bmap_init();
diff --git a/io/io.h b/io/io.h
index a30b96401a7..61920fcee7f 100644
--- a/io/io.h
+++ b/io/io.h
@@ -190,3 +190,4 @@ extern void		repair_init(void);
 extern void		crc32cselftest_init(void);
 extern void		bulkstat_init(void);
 extern void		atomicupdate_init(void);
+extern void		aginfo_init(void);
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 61eed7f5aa8..36ab5f9dc11 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1239,6 +1239,18 @@ Dumps a list of pages or ranges of pages that are currently in core,
 for the current memory mapping.
 
 .SH FILESYSTEM COMMANDS
+.TP
+.BI "aginfo [ \-a " agno " ]"
+Show information about or update the state of allocation groups.
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.BI \-a
+Act only on a specific allocation group.
+.PD
+.RE
+
 .TP
 .BI "bulkstat [ \-a " agno " ] [ \-d ] [ \-e " endino " ] [ \-m ] [ \-n " batchsize " ] [ \-q ] [ \-s " startino " ] [ \-v " version" ]
 Display raw stat information about a bunch of inodes in an XFS filesystem.


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 42/52] xfs_io: add a command to display realtime group information
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (45 preceding siblings ...)
  2023-12-31 23:58   ` [PATCH 41/52] xfs_io: add a command to display allocation group information Darrick J. Wong
@ 2023-12-31 23:58   ` Darrick J. Wong
  2023-12-31 23:58   ` [PATCH 43/52] xfs_io: display rt group in verbose bmap output Darrick J. Wong
                     ` (4 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:58 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add a new 'rginfo' command to xfs_io so that we can display realtime
group geometry.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/aginfo.c       |   96 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 libfrog/fsgeom.c  |   18 ++++++++++
 libfrog/fsgeom.h  |    4 ++
 man/man8/xfs_io.8 |   14 +++++++-
 4 files changed, 131 insertions(+), 1 deletion(-)


diff --git a/io/aginfo.c b/io/aginfo.c
index 6cbfcb8de35..43e0e9c21b8 100644
--- a/io/aginfo.c
+++ b/io/aginfo.c
@@ -14,6 +14,7 @@
 #include "libfrog/fsgeom.h"
 
 static cmdinfo_t aginfo_cmd;
+static cmdinfo_t rginfo_cmd;
 
 static int
 report_aginfo(
@@ -111,9 +112,104 @@ static cmdinfo_t aginfo_cmd = {
 	.help = aginfo_help,
 };
 
+static int
+report_rginfo(
+	struct xfs_fd		*xfd,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup_geometry	rgeo = { 0 };
+	int			ret;
+
+	ret = -xfrog_rtgroup_geometry(xfd->fd, rgno, &rgeo);
+	if (ret) {
+		xfrog_perror(ret, "rginfo");
+		return 1;
+	}
+
+	printf(_("RG: %u\n"),		rgeo.rg_number);
+	printf(_("Blocks: %u\n"),	rgeo.rg_length);
+	printf(_("Sick: 0x%x\n"),	rgeo.rg_sick);
+	printf(_("Checked: 0x%x\n"),	rgeo.rg_checked);
+	printf(_("Flags: 0x%x\n"),	rgeo.rg_flags);
+
+	return 0;
+}
+
+/* Display rtgroup status. */
+static int
+rginfo_f(
+	int			argc,
+	char			**argv)
+{
+	struct xfs_fd		xfd = XFS_FD_INIT(file->fd);
+	unsigned long long	x;
+	xfs_rgnumber_t		rgno = NULLRGNUMBER;
+	int			c;
+	int			ret = 0;
+
+	ret = -xfd_prepare_geometry(&xfd);
+	if (ret) {
+		xfrog_perror(ret, "xfd_prepare_geometry");
+		exitcode = 1;
+		return 1;
+	}
+
+	while ((c = getopt(argc, argv, "r:")) != EOF) {
+		switch (c) {
+		case 'r':
+			errno = 0;
+			x = strtoll(optarg, NULL, 10);
+			if (!errno && x >= NULLRGNUMBER)
+				errno = ERANGE;
+			if (errno) {
+				perror("rginfo");
+				return 1;
+			}
+			rgno = x;
+			break;
+		default:
+			return command_usage(&rginfo_cmd);
+		}
+	}
+
+	if (rgno != NULLRGNUMBER) {
+		ret = report_rginfo(&xfd, rgno);
+	} else {
+		for (rgno = 0; !ret && rgno < xfd.fsgeom.rgcount; rgno++) {
+			ret = report_rginfo(&xfd, rgno);
+		}
+	}
+
+	return ret;
+}
+
+static void
+rginfo_help(void)
+{
+	printf(_(
+"\n"
+"Report realtime group geometry.\n"
+"\n"
+" -r rgno  -- Report on the given realtime group.\n"
+"\n"));
+
+}
+
+static cmdinfo_t rginfo_cmd = {
+	.name = "rginfo",
+	.cfunc = rginfo_f,
+	.argmin = 0,
+	.argmax = -1,
+	.args = "[-r rgno]",
+	.flags = CMD_NOMAP_OK,
+	.help = rginfo_help,
+};
+
 void
 aginfo_init(void)
 {
 	aginfo_cmd.oneline = _("Get XFS allocation group state.");
 	add_command(&aginfo_cmd);
+	rginfo_cmd.oneline = _("Get XFS realtime group state.");
+	add_command(&rginfo_cmd);
 }
diff --git a/libfrog/fsgeom.c b/libfrog/fsgeom.c
index b9618661427..63393f0442f 100644
--- a/libfrog/fsgeom.c
+++ b/libfrog/fsgeom.c
@@ -212,3 +212,21 @@ xfrog_ag_geometry(
 		return -errno;
 	return 0;
 }
+
+/*
+ * Try to obtain a rt group's geometry.  Returns zero or a negative error code.
+ */
+int
+xfrog_rtgroup_geometry(
+	int			fd,
+	unsigned int		rgno,
+	struct xfs_rtgroup_geometry	*rgeo)
+{
+	int			ret;
+
+	rgeo->rg_number = rgno;
+	ret = ioctl(fd, XFS_IOC_RTGROUP_GEOMETRY, rgeo);
+	if (ret)
+		return -errno;
+	return 0;
+}
diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h
index 4f3542eafec..e5d43695901 100644
--- a/libfrog/fsgeom.h
+++ b/libfrog/fsgeom.h
@@ -5,10 +5,14 @@
 #ifndef __LIBFROG_FSGEOM_H__
 #define __LIBFROG_FSGEOM_H__
 
+struct xfs_rtgroup_geometry;
+
 void xfs_report_geom(struct xfs_fsop_geom *geo, const char *mntpoint,
 		const char *logname, const char *rtname);
 int xfrog_geometry(int fd, struct xfs_fsop_geom *fsgeo);
 int xfrog_ag_geometry(int fd, unsigned int agno, struct xfs_ag_geometry *ageo);
+int xfrog_rtgroup_geometry(int fd, unsigned int rgno,
+		struct xfs_rtgroup_geometry *rgeo);
 
 /*
  * Structure for recording whatever observations we want about the level of
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 36ab5f9dc11..6cf4b9e32e5 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1325,6 +1325,19 @@ specific points under adverse conditions. Without the
 .I tag
 argument, displays the list of error tags available.
 Only available in expert mode and requires privileges.
+
+.TP
+.BI "rginfo [ \-r " rgno " ]"
+Show information about or update the state of realtime allocation groups.
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.BI \-r
+Act only on a specific realtime group.
+.PD
+.RE
+
 .TP
 .BI "resblks [ " blocks " ]"
 Get and/or set count of reserved filesystem blocks using the
@@ -1626,7 +1639,6 @@ flag.
 .B fsuuid
 Print the mounted filesystem UUID.
 
-
 .SH OTHER COMMANDS
 .TP
 .BR "help [ " command " ]"


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 43/52] xfs_io: display rt group in verbose bmap output
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (46 preceding siblings ...)
  2023-12-31 23:58   ` [PATCH 42/52] xfs_io: add a command to display realtime " Darrick J. Wong
@ 2023-12-31 23:58   ` Darrick J. Wong
  2023-12-31 23:59   ` [PATCH 44/52] xfs_io: display rt group in verbose fsmap output Darrick J. Wong
                     ` (3 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:58 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Display the rt group number in the bmap -v output, just like we do for
regular data files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/bmap.c |   30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)


diff --git a/io/bmap.c b/io/bmap.c
index 722a389baaa..11bbc0629cf 100644
--- a/io/bmap.c
+++ b/io/bmap.c
@@ -264,9 +264,16 @@ bmap_f(
 
 		foff_w = boff_w = aoff_w = MINRANGE_WIDTH;
 		tot_w = MINTOT_WIDTH;
-		if (is_rt)
-			sunit = swidth = bbperag = 0;
-		else {
+		if (is_rt) {
+			if (fsgeo.rgcount == 0) {
+				bbperag = 0;
+			} else {
+				bbperag = (off64_t)fsgeo.rgblocks *
+					  (off64_t)fsgeo.blocksize / BBSIZE;
+			}
+			sunit = 0;
+			swidth = 0;
+		} else {
 			bbperag = (off64_t)fsgeo.agblocks *
 				  (off64_t)fsgeo.blocksize / BBSIZE;
 			sunit = (fsgeo.sunit * fsgeo.blocksize) / BBSIZE;
@@ -295,7 +302,7 @@ bmap_f(
 					(long long)(map[i + 1].bmv_block +
 						map[i + 1].bmv_length - 1LL));
 				boff_w = max(boff_w, strlen(bbuf));
-				if (!is_rt) {
+				if (bbperag > 0) {
 					agno = map[i + 1].bmv_block / bbperag;
 					agoff = map[i + 1].bmv_block -
 							(agno * bbperag);
@@ -312,13 +319,20 @@ bmap_f(
 					numlen(map[i+1].bmv_length, 10));
 			}
 		}
-		agno_w = is_rt ? 0 : max(MINAG_WIDTH, numlen(fsgeo.agcount, 10));
+		if (is_rt) {
+			if (fsgeo.rgcount > 0)
+				agno_w = max(MINAG_WIDTH, numlen(fsgeo.rgcount, 10));
+			else
+				agno_w = 0;
+		} else {
+			agno_w = max(MINAG_WIDTH, numlen(fsgeo.agcount, 10));
+		}
 		printf("%4s: %-*s %-*s %*s %-*s %*s%s\n",
 			_("EXT"),
 			foff_w, _("FILE-OFFSET"),
 			boff_w, is_rt ? _("RT-BLOCK-RANGE") : _("BLOCK-RANGE"),
-			agno_w, is_rt ? "" : _("AG"),
-			aoff_w, is_rt ? "" : _("AG-OFFSET"),
+			agno_w, is_rt ? (fsgeo.rgcount ? _("RG") : "") : _("AG"),
+			aoff_w, is_rt ? (fsgeo.rgcount ? _("RG-OFFSET") : "") : _("AG-OFFSET"),
 			tot_w, _("TOTAL"),
 			flg ? _(" FLAGS") : "");
 		for (i = 0; i < egcnt; i++) {
@@ -377,7 +391,7 @@ bmap_f(
 						map[i + 1].bmv_length - 1LL));
 				printf("%4d: %-*s %-*s", i, foff_w, rbuf,
 					boff_w, bbuf);
-				if (!is_rt) {
+				if (bbperag > 0) {
 					agno = map[i + 1].bmv_block / bbperag;
 					agoff = map[i + 1].bmv_block -
 							(agno * bbperag);


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 44/52] xfs_io: display rt group in verbose fsmap output
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (47 preceding siblings ...)
  2023-12-31 23:58   ` [PATCH 43/52] xfs_io: display rt group in verbose bmap output Darrick J. Wong
@ 2023-12-31 23:59   ` Darrick J. Wong
  2023-12-31 23:59   ` [PATCH 45/52] xfs_spaceman: report on realtime group health Darrick J. Wong
                     ` (2 subsequent siblings)
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:59 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Display the rt group number in the fsmap output, just like we do for
regular data files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/fsmap.c |   23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)


diff --git a/io/fsmap.c b/io/fsmap.c
index 7db51847e2b..cb70f86cb96 100644
--- a/io/fsmap.c
+++ b/io/fsmap.c
@@ -14,6 +14,7 @@
 
 static cmdinfo_t	fsmap_cmd;
 static dev_t		xfs_data_dev;
+static dev_t		xfs_rt_dev;
 
 static void
 fsmap_help(void)
@@ -170,7 +171,7 @@ dump_map_verbose(
 	unsigned long long	i;
 	struct fsmap		*p;
 	int			agno;
-	off64_t			agoff, bperag;
+	off64_t			agoff, bperag, bperrtg;
 	int			foff_w, boff_w, aoff_w, tot_w, agno_w, own_w;
 	int			nr_w, dev_w;
 	char			rbuf[40], bbuf[40], abuf[40], obuf[40];
@@ -185,6 +186,8 @@ dump_map_verbose(
 	tot_w = MINTOT_WIDTH;
 	bperag = (off64_t)fsgeo->agblocks *
 		  (off64_t)fsgeo->blocksize;
+	bperrtg = (off64_t)fsgeo->rgblocks *
+		  (off64_t)fsgeo->blocksize;
 	sunit = (fsgeo->sunit * fsgeo->blocksize);
 	swidth = (fsgeo->swidth * fsgeo->blocksize);
 
@@ -243,6 +246,13 @@ dump_map_verbose(
 				"(%lld..%lld)",
 				(long long)BTOBBT(agoff),
 				(long long)BTOBBT(agoff + p->fmr_length - 1));
+		} else if (p->fmr_device == xfs_rt_dev && fsgeo->rgcount > 0) {
+			agno = p->fmr_physical / bperrtg;
+			agoff = p->fmr_physical - (agno * bperrtg);
+			snprintf(abuf, sizeof(abuf),
+				"(%lld..%lld)",
+				(long long)BTOBBT(agoff),
+				(long long)BTOBBT(agoff + p->fmr_length - 1));
 		} else
 			abuf[0] = 0;
 		aoff_w = max(aoff_w, strlen(abuf));
@@ -315,6 +325,16 @@ dump_map_verbose(
 			snprintf(gbuf, sizeof(gbuf),
 				"%lld",
 				(long long)agno);
+		} else if (p->fmr_device == xfs_rt_dev && fsgeo->rgcount > 0) {
+			agno = p->fmr_physical / bperrtg;
+			agoff = p->fmr_physical - (agno * bperrtg);
+			snprintf(abuf, sizeof(abuf),
+				"(%lld..%lld)",
+				(long long)BTOBBT(agoff),
+				(long long)BTOBBT(agoff + p->fmr_length - 1));
+			snprintf(gbuf, sizeof(gbuf),
+				"%lld",
+				(long long)agno);
 		} else {
 			abuf[0] = 0;
 			gbuf[0] = 0;
@@ -501,6 +521,7 @@ fsmap_f(
 	}
 	fs = fs_table_lookup(file->name, FS_MOUNT_POINT);
 	xfs_data_dev = fs ? fs->fs_datadev : 0;
+	xfs_rt_dev = fs ? fs->fs_rtdev : 0;
 
 	head->fmh_count = map_size;
 	do {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 45/52] xfs_spaceman: report on realtime group health
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (48 preceding siblings ...)
  2023-12-31 23:59   ` [PATCH 44/52] xfs_io: display rt group in verbose fsmap output Darrick J. Wong
@ 2023-12-31 23:59   ` Darrick J. Wong
  2023-12-31 23:59   ` [PATCH 46/52] xfs_scrub: scrub realtime allocation group metadata Darrick J. Wong
  2023-12-31 23:59   ` [PATCH 47/52] xfs_scrub: call GETFSMAP for each rt group in parallel Darrick J. Wong
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:59 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Add the realtime group status to the health reporting done by
xfs_spaceman.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 man/man8/xfs_spaceman.8 |    5 +++-
 spaceman/health.c       |   59 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 61 insertions(+), 3 deletions(-)


diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8
index 0d299132a78..7d2d1ff94ee 100644
--- a/man/man8/xfs_spaceman.8
+++ b/man/man8/xfs_spaceman.8
@@ -91,7 +91,7 @@ The output will have the same format that
 .BR "xfs_info" "(8)"
 prints when querying a filesystem.
 .TP
-.BI "health [ \-a agno] [ \-c ] [ \-f ] [ \-i inum ] [ \-n ] [ \-q ] [ paths ]"
+.BI "health [ \-a agno] [ \-c ] [ \-f ] [ \-i inum ] [ \-n ] [ \-q ] [ \-r rgno ] [ paths ]"
 Reports the health of the given group of filesystem metadata.
 .RS 1.0i
 .PD 0
@@ -119,6 +119,9 @@ This option is disabled by default to minimize runtime.
 .B \-q
 Report only unhealthy metadata.
 .TP
+.B \-r
+Report on the health of the given realtime group.
+.TP
 .B paths
 Report on the health of the files at the given path.
 .PD
diff --git a/spaceman/health.c b/spaceman/health.c
index 47cf9099492..d9a18fcc3b4 100644
--- a/spaceman/health.c
+++ b/spaceman/health.c
@@ -136,6 +136,18 @@ static const struct flag_map ag_flags[] = {
 	{0},
 };
 
+static const struct flag_map rtgroup_flags[] = {
+	{
+		.mask = XFS_RTGROUP_GEOM_SICK_SUPER,
+		.descr = "superblock",
+	},
+	{
+		.mask = XFS_RTGROUP_GEOM_SICK_BITMAP,
+		.descr = "realtime bitmap",
+	},
+	{0},
+};
+
 static const struct flag_map inode_flags[] = {
 	{
 		.mask = XFS_BS_SICK_INODE,
@@ -220,6 +232,25 @@ report_ag_sick(
 	return 0;
 }
 
+/* Report on a rt group's health. */
+static int
+report_rtgroup_sick(
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup_geometry rgeo = { 0 };
+	char			descr[256];
+	int			ret;
+
+	ret = -xfrog_rtgroup_geometry(file->xfd.fd, rgno, &rgeo);
+	if (ret) {
+		xfrog_perror(ret, "rtgroup_geometry");
+		return 1;
+	}
+	snprintf(descr, sizeof(descr) - 1, _("rtgroup %u"), rgno);
+	report_sick(descr, rtgroup_flags, rgeo.rg_sick, rgeo.rg_checked);
+	return 0;
+}
+
 /* Report on an inode's health. */
 static int
 report_inode_health(
@@ -346,7 +377,7 @@ report_bulkstat_health(
 	return error;
 }
 
-#define OPT_STRING ("a:cfi:nq")
+#define OPT_STRING ("a:cfi:nqr:")
 
 /* Report on health problems in XFS filesystem. */
 static int
@@ -356,6 +387,7 @@ health_f(
 {
 	unsigned long long	x;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	bool			default_report = true;
 	int			c;
 	int			ret;
@@ -403,6 +435,17 @@ health_f(
 		case 'q':
 			quiet = true;
 			break;
+		case 'r':
+			default_report = false;
+			errno = 0;
+			x = strtoll(optarg, NULL, 10);
+			if (!errno && x >= NULLRGNUMBER)
+				errno = ERANGE;
+			if (errno) {
+				perror("rtgroup health");
+				return 1;
+			}
+			break;
 		default:
 			return command_usage(&health_cmd);
 		}
@@ -438,6 +481,12 @@ health_f(
 			if (ret)
 				return 1;
 			break;
+		case 'r':
+			rgno = strtoll(optarg, NULL, 10);
+			ret = report_rtgroup_sick(rgno);
+			if (ret)
+				return 1;
+			break;
 		default:
 			break;
 		}
@@ -459,6 +508,11 @@ health_f(
 			if (ret)
 				return 1;
 		}
+		for (rgno = 0; rgno < file->xfd.fsgeom.rgcount; rgno++) {
+			ret = report_rtgroup_sick(rgno);
+			if (ret)
+				return 1;
+		}
 		if (comprehensive) {
 			ret = report_bulkstat_health(NULLAGNUMBER);
 			if (ret)
@@ -489,6 +543,7 @@ health_help(void)
 " -i inum  -- Report health of a given inode number.\n"
 " -n       -- Try to report file names.\n"
 " -q       -- Only report unhealthy metadata.\n"
+" -r rgno  -- Report health of the given realtime group.\n"
 " paths    -- Report health of the given file path.\n"
 "\n"));
 
@@ -499,7 +554,7 @@ static cmdinfo_t health_cmd = {
 	.cfunc = health_f,
 	.argmin = 0,
 	.argmax = -1,
-	.args = "[-a agno] [-c] [-f] [-i inum] [-n] [-q] [paths]",
+	.args = "[-a agno] [-c] [-f] [-i inum] [-n] [-q] [-r rgno] [paths]",
 	.flags = CMD_FLAG_ONESHOT,
 	.help = health_help,
 };


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 46/52] xfs_scrub: scrub realtime allocation group metadata
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (49 preceding siblings ...)
  2023-12-31 23:59   ` [PATCH 45/52] xfs_spaceman: report on realtime group health Darrick J. Wong
@ 2023-12-31 23:59   ` Darrick J. Wong
  2023-12-31 23:59   ` [PATCH 47/52] xfs_scrub: call GETFSMAP for each rt group in parallel Darrick J. Wong
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:59 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Scan realtime group metadata as part of phase 2, just like we do for AG
metadata.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/phase2.c |  100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 scrub/scrub.h  |    9 +++++
 2 files changed, 107 insertions(+), 2 deletions(-)


diff --git a/scrub/phase2.c b/scrub/phase2.c
index c24d137358c..a3ada334e86 100644
--- a/scrub/phase2.c
+++ b/scrub/phase2.c
@@ -28,6 +28,14 @@ struct scan_ctl {
 	pthread_mutex_t		rbm_waitlock;
 	bool			rbm_done;
 
+	/*
+	 * Control mechanism to signal that each group's scan of the rt bitmap
+	 * file scan is done and wake up any waiters.
+	 */
+	pthread_cond_t		rbm_group_wait;
+	pthread_mutex_t		rbm_group_waitlock;
+	unsigned int		rbm_group_count;
+
 	bool			aborted;
 };
 
@@ -247,6 +255,61 @@ scan_fs_metadata(
 	}
 }
 
+/* Scrub each rt group's metadata. */
+static void
+scan_rtgroup_metadata(
+	struct workqueue	*wq,
+	xfs_agnumber_t		rgno,
+	void			*arg)
+{
+	struct scrub_item	sri;
+	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct scan_ctl		*sctl = arg;
+	char			descr[DESCR_BUFSZ];
+	bool			defer_repairs;
+	int			ret;
+
+	if (sctl->aborted)
+		goto out;
+
+	scrub_item_init_rtgroup(&sri, rgno);
+	snprintf(descr, DESCR_BUFSZ, _("rtgroup %u"), rgno);
+
+	/*
+	 * Try to check all of the rtgroup metadata items that we just
+	 * scheduled.  If we return with some types still needing a check, try
+	 * repairing any damaged metadata that we've found so far, and try
+	 * again.  Abort if we stop making forward progress.
+	 */
+	scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_RTGROUP);
+	ret = scrub_item_check(ctx, &sri);
+	if (ret) {
+		sctl->aborted = true;
+		goto out;
+	}
+
+	ret = repair_and_scrub_loop(ctx, &sri, descr, &defer_repairs);
+	if (ret) {
+		sctl->aborted = true;
+		goto out;
+	}
+
+	/* Everything else gets fixed during phase 4. */
+	ret = defer_fs_repair(ctx, &sri);
+	if (ret) {
+		sctl->aborted = true;
+		goto out;
+	}
+
+out:
+	/* Signal anybody waiting for the group bitmap scan to finish. */
+	pthread_mutex_lock(&sctl->rbm_group_waitlock);
+	sctl->rbm_group_count--;
+	if (sctl->rbm_group_count == 0)
+		pthread_cond_broadcast(&sctl->rbm_group_wait);
+	pthread_mutex_unlock(&sctl->rbm_group_waitlock);
+}
+
 /* Scan all filesystem metadata. */
 int
 phase2_func(
@@ -260,6 +323,7 @@ phase2_func(
 	struct scrub_item	sri;
 	const struct xfrog_scrub_descr *sc = xfrog_scrubbers;
 	xfs_agnumber_t		agno;
+	xfs_rgnumber_t		rgno;
 	unsigned int		type;
 	int			ret, ret2;
 
@@ -326,14 +390,46 @@ phase2_func(
 		goto out_wq;
 
 	/*
-	 * Wait for the rt bitmap to finish scanning, then scan the rt summary
-	 * since the summary can be regenerated completely from the bitmap.
+	 * Wait for the rt bitmap to finish scanning, then scan the realtime
+	 * group metadata.  When rtgroups are enabled, the RTBITMAP scanner
+	 * only checks the inode and fork data of the rt bitmap file, and each
+	 * group checks its own part of the rtbitmap.
 	 */
 	pthread_mutex_lock(&sctl.rbm_waitlock);
 	while (!sctl.rbm_done)
 		pthread_cond_wait(&sctl.rbm_wait, &sctl.rbm_waitlock);
 	pthread_mutex_unlock(&sctl.rbm_waitlock);
 
+	if (sctl.aborted)
+		goto out_wq;
+
+	for (rgno = 0;
+	     rgno < ctx->mnt.fsgeom.rgcount && !sctl.aborted;
+	     rgno++) {
+		pthread_mutex_lock(&sctl.rbm_group_waitlock);
+		sctl.rbm_group_count++;
+		pthread_mutex_unlock(&sctl.rbm_group_waitlock);
+		ret = -workqueue_add(&wq, scan_rtgroup_metadata, rgno, &sctl);
+		if (ret) {
+			str_liberror(ctx, ret,
+					_("queueing rtgroup scrub work"));
+			goto out_wq;
+		}
+	}
+
+	if (sctl.aborted)
+		goto out_wq;
+
+	/*
+	 * Wait for the rtgroups to finish scanning, then scan the rt summary
+	 * since the summary can be regenerated completely from the bitmap.
+	 */
+	pthread_mutex_lock(&sctl.rbm_group_waitlock);
+	while (sctl.rbm_group_count > 0)
+		pthread_cond_wait(&sctl.rbm_group_wait,
+				&sctl.rbm_group_waitlock);
+	pthread_mutex_unlock(&sctl.rbm_group_waitlock);
+
 	if (sctl.aborted)
 		goto out_wq;
 
diff --git a/scrub/scrub.h b/scrub/scrub.h
index 3bb3ea1d07b..bb94a11dcfc 100644
--- a/scrub/scrub.h
+++ b/scrub/scrub.h
@@ -90,6 +90,15 @@ scrub_item_init_ag(struct scrub_item *sri, xfs_agnumber_t agno)
 	sri->sri_gen = -1U;
 }
 
+static inline void
+scrub_item_init_rtgroup(struct scrub_item *sri, xfs_rgnumber_t rgno)
+{
+	memset(sri, 0, sizeof(*sri));
+	sri->sri_agno = rgno;
+	sri->sri_ino = -1ULL;
+	sri->sri_gen = -1U;
+}
+
 static inline void
 scrub_item_init_fs(struct scrub_item *sri)
 {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 47/52] xfs_scrub: call GETFSMAP for each rt group in parallel
  2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
                     ` (50 preceding siblings ...)
  2023-12-31 23:59   ` [PATCH 46/52] xfs_scrub: scrub realtime allocation group metadata Darrick J. Wong
@ 2023-12-31 23:59   ` Darrick J. Wong
  51 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2023-12-31 23:59 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

If realtime groups are enabled, we should take advantage of the sharding
to speed up the spacemap scans.  Do so by issuing per-rtgroup GETFSMAP
calls.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/spacemap.c |   74 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 8 deletions(-)


diff --git a/scrub/spacemap.c b/scrub/spacemap.c
index f20ecfeac5d..8796cb115ea 100644
--- a/scrub/spacemap.c
+++ b/scrub/spacemap.c
@@ -128,6 +128,47 @@ scan_ag_rmaps(
 	}
 }
 
+/* Iterate all the reverse mappings of a realtime group. */
+static void
+scan_rtg_rmaps(
+	struct workqueue	*wq,
+	xfs_agnumber_t		rgno,
+	void			*arg)
+{
+	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct scan_blocks	*sbx = arg;
+	struct fsmap		keys[2];
+	off64_t			bperrg;
+	int			ret;
+
+	bperrg = (off64_t)ctx->mnt.fsgeom.rgblocks *
+		 (off64_t)ctx->mnt.fsgeom.blocksize;
+
+	memset(keys, 0, sizeof(struct fsmap) * 2);
+	keys->fmr_device = ctx->fsinfo.fs_rtdev;
+	keys->fmr_physical = rgno * bperrg;
+	(keys + 1)->fmr_device = ctx->fsinfo.fs_rtdev;
+	(keys + 1)->fmr_physical = ((rgno + 1) * bperrg) - 1;
+	(keys + 1)->fmr_owner = ULLONG_MAX;
+	(keys + 1)->fmr_offset = ULLONG_MAX;
+	(keys + 1)->fmr_flags = UINT_MAX;
+
+	if (sbx->aborted)
+		return;
+
+	ret = scrub_iterate_fsmap(ctx, keys, sbx->fn, sbx->arg);
+	if (ret) {
+		char		descr[DESCR_BUFSZ];
+
+		snprintf(descr, DESCR_BUFSZ, _("dev %d:%d rtgroup %u fsmap"),
+					major(ctx->fsinfo.fs_datadev),
+					minor(ctx->fsinfo.fs_datadev),
+					rgno);
+		str_liberror(ctx, ret, descr);
+		sbx->aborted = true;
+	}
+}
+
 /* Iterate all the reverse mappings of a standalone device. */
 static void
 scan_dev_rmaps(
@@ -208,14 +249,6 @@ scrub_scan_all_spacemaps(
 		str_liberror(ctx, ret, _("creating fsmap workqueue"));
 		return ret;
 	}
-	if (ctx->fsinfo.fs_rt) {
-		ret = -workqueue_add(&wq, scan_rt_rmaps, 0, &sbx);
-		if (ret) {
-			sbx.aborted = true;
-			str_liberror(ctx, ret, _("queueing rtdev fsmap work"));
-			goto out;
-		}
-	}
 	if (ctx->fsinfo.fs_log) {
 		ret = -workqueue_add(&wq, scan_log_rmaps, 0, &sbx);
 		if (ret) {
@@ -232,6 +265,31 @@ scrub_scan_all_spacemaps(
 			break;
 		}
 	}
+	if (ctx->fsinfo.fs_rt) {
+		for (agno = 0; agno < ctx->mnt.fsgeom.rgcount; agno++) {
+			ret = -workqueue_add(&wq, scan_rtg_rmaps, agno, &sbx);
+			if (ret) {
+				sbx.aborted = true;
+				str_liberror(ctx, ret,
+						_("queueing rtgroup fsmap work"));
+				break;
+			}
+		}
+
+		/*
+		 * If the fs doesn't have any realtime groups, scan the entire
+		 * volume all at once, since the above loop did nothing.
+		 */
+		if (ctx->mnt.fsgeom.rgcount == 0) {
+			ret = -workqueue_add(&wq, scan_rt_rmaps, 0, &sbx);
+			if (ret) {
+				sbx.aborted = true;
+				str_liberror(ctx, ret,
+						_("queueing rtdev fsmap work"));
+				goto out;
+			}
+		}
+	}
 out:
 	ret = -workqueue_terminate(&wq);
 	if (ret) {


^ permalink raw reply related	[flat|nested] 632+ messages in thread

* [PATCH 6/8] xfs_db: enable conversion of rt space units
  2022-12-30 22:19 [PATCHSET v1.0 0/8] xfs_db: debug realtime geometry Darrick J. Wong
@ 2022-12-30 22:19 ` Darrick J. Wong
  0 siblings, 0 replies; 632+ messages in thread
From: Darrick J. Wong @ 2022-12-30 22:19 UTC (permalink / raw)
  To: djwong, cem; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Teach the xfs_db convert function about rt extents, rt block numbers,
and how to compute offsets within the rt bitmap and summary files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/convert.c      |  232 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 man/man8/xfs_db.8 |   51 ++++++++++++
 2 files changed, 260 insertions(+), 23 deletions(-)


diff --git a/db/convert.c b/db/convert.c
index e1466057031..811bac00f71 100644
--- a/db/convert.c
+++ b/db/convert.c
@@ -26,6 +26,10 @@
 	 agino_to_bytes(XFS_INO_TO_AGINO(mp, (x))))
 #define	inoidx_to_bytes(x)	\
 	((uint64_t)(x) << mp->m_sb.sb_inodelog)
+#define rtblock_to_bytes(x)	\
+	((uint64_t)(x) << mp->m_sb.sb_blocklog)
+#define rtx_to_rtblock(x)	\
+	((uint64_t)(x) * mp->m_sb.sb_rextsize)
 
 typedef enum {
 	CT_NONE = -1,
@@ -40,11 +44,12 @@ typedef enum {
 	CT_INO,			/* xfs_ino_t */
 	CT_INOIDX,		/* index of inode in fsblock */
 	CT_INOOFF,		/* byte offset in inode */
+	CT_RTBLOCK,		/* realtime block */
+	CT_RTX,			/* realtime extent */
 	NCTS
 } ctype_t;
 
 typedef struct ctydesc {
-	ctype_t		ctype;
 	int		allowed;
 	const char	**names;
 } ctydesc_t;
@@ -61,12 +66,16 @@ typedef union {
 	xfs_ino_t	ino;
 	int		inoidx;
 	int		inooff;
+	xfs_rtblock_t	rtblock;
+	xfs_rtblock_t	rtx;
 } cval_t;
 
 static uint64_t		bytevalue(ctype_t ctype, cval_t *val);
+static int		rtconvert_f(int argc, char **argv);
 static int		convert_f(int argc, char **argv);
 static int		getvalue(char *s, ctype_t ctype, cval_t *val);
-static ctype_t		lookupcty(char *ctyname);
+static ctype_t		lookupcty(const struct ctydesc *descs,
+				  const char *ctyname);
 
 static const char	*agblock_names[] = { "agblock", "agbno", NULL };
 static const char	*agino_names[] = { "agino", "aginode", NULL };
@@ -74,6 +83,8 @@ static const char	*agnumber_names[] = { "agnumber", "agno", NULL };
 static const char	*bboff_names[] = { "bboff", "daddroff", NULL };
 static const char	*blkoff_names[] = { "blkoff", "fsboff", "agboff",
 					    NULL };
+static const char	*rtblkoff_names[] = { "blkoff", "rtboff",
+					    NULL };
 static const char	*byte_names[] = { "byte", "fsbyte", NULL };
 static const char	*daddr_names[] = { "daddr", "bb", NULL };
 static const char	*fsblock_names[] = { "fsblock", "fsb", "fsbno", NULL };
@@ -81,30 +92,91 @@ static const char	*ino_names[] = { "ino", "inode", NULL };
 static const char	*inoidx_names[] = { "inoidx", "offset", NULL };
 static const char	*inooff_names[] = { "inooff", "inodeoff", NULL };
 
+static const char	*rtblock_names[] = { "rtblock", "rtb", "rtbno", NULL };
+static const char	*rtx_names[] = { "rtx", "rtextent", NULL };
+
 static const ctydesc_t	ctydescs[NCTS] = {
-	{ CT_AGBLOCK, M(AGNUMBER)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
-	  agblock_names },
-	{ CT_AGINO, M(AGNUMBER)|M(INOOFF), agino_names },
-	{ CT_AGNUMBER,
-	  M(AGBLOCK)|M(AGINO)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
-	  agnumber_names },
-	{ CT_BBOFF, M(AGBLOCK)|M(AGNUMBER)|M(DADDR)|M(FSBLOCK), bboff_names },
-	{ CT_BLKOFF, M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK), blkoff_names },
-	{ CT_BYTE, 0, byte_names },
-	{ CT_DADDR, M(BBOFF), daddr_names },
-	{ CT_FSBLOCK, M(BBOFF)|M(BLKOFF)|M(INOIDX), fsblock_names },
-	{ CT_INO, M(INOOFF), ino_names },
-	{ CT_INOIDX, M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK)|M(INOOFF),
-	  inoidx_names },
-	{ CT_INOOFF,
-	  M(AGBLOCK)|M(AGINO)|M(AGNUMBER)|M(FSBLOCK)|M(INO)|M(INOIDX),
-	  inooff_names },
+	[CT_AGBLOCK] = {
+		.allowed = M(AGNUMBER)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
+		.names   = agblock_names,
+	},
+	[CT_AGINO] = {
+		.allowed = M(AGNUMBER)|M(INOOFF),
+		.names   = agino_names,
+	},
+	[CT_AGNUMBER] = {
+		.allowed = M(AGBLOCK)|M(AGINO)|M(BBOFF)|M(BLKOFF)|M(INOIDX)|M(INOOFF),
+		.names   = agnumber_names,
+	},
+	[CT_BBOFF] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(DADDR)|M(FSBLOCK),
+		.names   = bboff_names,
+	},
+	[CT_BLKOFF] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK),
+		.names   = blkoff_names,
+	},
+	[CT_BYTE] = {
+		.allowed = 0,
+		.names   = byte_names,
+	},
+	[CT_DADDR] = {
+		.allowed = M(BBOFF),
+		.names   = daddr_names,
+	},
+	[CT_FSBLOCK] = {
+		.allowed = M(BBOFF)|M(BLKOFF)|M(INOIDX),
+		.names   = fsblock_names,
+	},
+	[CT_INO] = {
+		.allowed = M(INOOFF),
+		.names   = ino_names,
+	},
+	[CT_INOIDX] = {
+		.allowed = M(AGBLOCK)|M(AGNUMBER)|M(FSBLOCK)|M(INOOFF),
+		.names   = inoidx_names,
+	},
+	[CT_INOOFF] = {
+		.allowed = M(AGBLOCK)|M(AGINO)|M(AGNUMBER)|M(FSBLOCK)|M(INO)|M(INOIDX),
+		.names   = inooff_names,
+	},
+};
+
+static const ctydesc_t	ctydescs_rt[NCTS] = {
+	[CT_BBOFF] = {
+		.allowed = M(DADDR)|M(RTBLOCK),
+		.names   = bboff_names,
+	},
+	[CT_BLKOFF] = {
+		.allowed = M(RTBLOCK),
+		.names   = rtblkoff_names,
+	},
+	[CT_BYTE] = {
+		.allowed = 0,
+		.names   = byte_names,
+	},
+	[CT_DADDR] = {
+		.allowed = M(BBOFF),
+		.names   = daddr_names,
+	},
+	[CT_RTBLOCK] = {
+		.allowed = M(BBOFF)|M(BLKOFF),
+		.names   = rtblock_names,
+	},
+	[CT_RTX] = {
+		.allowed = M(BBOFF)|M(BLKOFF),
+		.names   = rtx_names,
+	},
 };
 
 static const cmdinfo_t	convert_cmd =
 	{ "convert", NULL, convert_f, 3, 9, 0, "type num [type num]... type",
 	  "convert from one address form to another", NULL };
 
+static const cmdinfo_t	rtconvert_cmd =
+	{ "rtconvert", NULL, rtconvert_f, 3, 9, 0, "type num [type num]... type",
+	  "convert from one realtime address form to another", NULL };
+
 static uint64_t
 bytevalue(ctype_t ctype, cval_t *val)
 {
@@ -131,6 +203,10 @@ bytevalue(ctype_t ctype, cval_t *val)
 		return inoidx_to_bytes(val->inoidx);
 	case CT_INOOFF:
 		return (uint64_t)val->inooff;
+	case CT_RTBLOCK:
+		return rtblock_to_bytes(val->rtblock);
+	case CT_RTX:
+		return rtblock_to_bytes(rtx_to_rtblock(val->rtx));
 	case CT_NONE:
 	case NCTS:
 		break;
@@ -159,13 +235,13 @@ convert_f(int argc, char **argv)
 			 "arguments\n"), argc);
 		return 0;
 	}
-	if ((wtype = lookupcty(argv[argc - 1])) == CT_NONE) {
+	if ((wtype = lookupcty(ctydescs, argv[argc - 1])) == CT_NONE) {
 		dbprintf(_("unknown conversion type %s\n"), argv[argc - 1]);
 		return 0;
 	}
 
 	for (i = mask = conmask = 0; i < (argc - 1) / 2; i++) {
-		c = lookupcty(argv[i * 2]);
+		c = lookupcty(ctydescs, argv[i * 2]);
 		if (c == CT_NONE) {
 			dbprintf(_("unknown conversion type %s\n"), argv[i * 2]);
 			return 0;
@@ -230,6 +306,107 @@ convert_f(int argc, char **argv)
 	case CT_INOOFF:
 		v &= mp->m_sb.sb_inodesize - 1;
 		break;
+	case CT_RTBLOCK:
+	case CT_RTX:
+		/* shouldn't get here */
+		ASSERT(0);
+		break;
+	case CT_NONE:
+	case NCTS:
+		/* NOTREACHED */
+		break;
+	}
+	dbprintf("0x%llx (%llu)\n", v, v);
+	return 0;
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	return daddr >> mp->m_blkbb_log;
+}
+
+static int
+rtconvert_f(int argc, char **argv)
+{
+	ctype_t		c;
+	int		conmask;
+	cval_t		cvals[NCTS] = {};
+	int		i;
+	int		mask;
+	uint64_t	v;
+	ctype_t		wtype;
+
+	/* move past the "rtconvert" command */
+	argc--;
+	argv++;
+
+	if ((argc % 2) != 1) {
+		dbprintf(_("bad argument count %d to rtconvert, expected 3,5,7,9 "
+			 "arguments\n"), argc);
+		return 0;
+	}
+	if ((wtype = lookupcty(ctydescs_rt, argv[argc - 1])) == CT_NONE) {
+		dbprintf(_("unknown conversion type %s\n"), argv[argc - 1]);
+		return 0;
+	}
+
+	for (i = mask = conmask = 0; i < (argc - 1) / 2; i++) {
+		c = lookupcty(ctydescs_rt, argv[i * 2]);
+		if (c == CT_NONE) {
+			dbprintf(_("unknown conversion type %s\n"), argv[i * 2]);
+			return 0;
+		}
+		if (c == wtype) {
+			dbprintf(_("result type same as argument\n"));
+			return 0;
+		}
+		if (conmask & (1 << c)) {
+			dbprintf(_("conflicting conversion type %s\n"),
+				argv[i * 2]);
+			return 0;
+		}
+		if (!getvalue(argv[i * 2 + 1], c, &cvals[c]))
+			return 0;
+		mask |= 1 << c;
+		conmask |= ~ctydescs_rt[c].allowed;
+	}
+	v = 0;
+	for (c = (ctype_t)0; c < NCTS; c++) {
+		if (!(mask & (1 << c)))
+			continue;
+		v += bytevalue(c, &cvals[c]);
+	}
+	switch (wtype) {
+	case CT_BBOFF:
+		v &= BBMASK;
+		break;
+	case CT_BLKOFF:
+		v &= mp->m_blockmask;
+		break;
+	case CT_BYTE:
+		break;
+	case CT_DADDR:
+		v >>= BBSHIFT;
+		break;
+	case CT_RTBLOCK:
+		v = xfs_daddr_to_rtb(mp, v >> BBSHIFT);
+		break;
+	case CT_RTX:
+		v = xfs_daddr_to_rtb(mp, v >> BBSHIFT) / mp->m_sb.sb_rextsize;
+		break;
+	case CT_AGBLOCK:
+	case CT_AGINO:
+	case CT_AGNUMBER:
+	case CT_FSBLOCK:
+	case CT_INO:
+	case CT_INOIDX:
+	case CT_INOOFF:
+		/* shouldn't get here */
+		ASSERT(0);
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
@@ -243,6 +420,7 @@ void
 convert_init(void)
 {
 	add_command(&convert_cmd);
+	add_command(&rtconvert_cmd);
 }
 
 static int
@@ -290,6 +468,12 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 	case CT_INOOFF:
 		val->inooff = (int)v;
 		break;
+	case CT_RTBLOCK:
+		val->rtblock = (xfs_rtblock_t)v;
+		break;
+	case CT_RTX:
+		val->rtx = (xfs_rtblock_t)v;
+		break;
 	case CT_NONE:
 	case NCTS:
 		/* NOTREACHED */
@@ -299,13 +483,15 @@ getvalue(char *s, ctype_t ctype, cval_t *val)
 }
 
 static ctype_t
-lookupcty(char *ctyname)
+lookupcty(
+	const struct ctydesc	*descs,
+	const char		*ctyname)
 {
 	ctype_t		cty;
 	const char	**name;
 
 	for (cty = (ctype_t)0; cty < NCTS; cty++) {
-		for (name = ctydescs[cty].names; *name; name++) {
+		for (name = descs[cty].names; name && *name; name++) {
 			if (strcmp(ctyname, *name) == 0)
 				return cty;
 		}
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index de30fbc6230..40ff7de335e 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -953,6 +953,57 @@ Show position ring (if no
 argument is given), or move to a specific entry in the position ring given by
 .IR index .
 .TP
+.BI "rtconvert " "type number" " [" "type number" "] ... " type
+Convert from one address form to another for realtime section addresses.
+The known
+.IR type s,
+with alternate names, are:
+.RS 1.0i
+.PD 0
+.HP
+.B bboff
+or
+.B daddroff
+(byte offset in a
+.BR daddr )
+.HP
+.B blkoff
+or
+.B fsboff or
+.B rtboff
+(byte offset in a
+.B rtblock
+or
+.BR rtextent )
+.HP
+.B byte
+or
+.B fsbyte
+(byte address in filesystem)
+.HP
+.B daddr
+or
+.B bb
+(disk address, 512-byte blocks)
+.HP
+.B rtblock
+or
+.B rtb
+or
+.B rtbno
+(realtime filesystem block, see the
+.B fsblock
+command)
+.HP
+.B rtx
+or
+.B rtextent
+(realtime extent)
+.PD
+.RE
+.IP
+Only conversions that "make sense" are allowed.
+.TP
 .BI "sb [" agno ]
 Set current address to SB header in allocation group
 .IR agno .


^ permalink raw reply related	[flat|nested] 632+ messages in thread

end of thread, other threads:[~2024-01-01  1:09 UTC | newest]

Thread overview: 632+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-31 18:23 [NYE PATCHRIVER 3/4] xfs: modernize the realtime volume Darrick J. Wong
2023-12-31 19:34 ` [PATCHSET v2.0 01/15] xfs: hoist inode operations to libxfs Darrick J. Wong
2023-12-31 21:01   ` [PATCH 01/21] xfs: move inode copy-on-write predicates to xfs_inode.[ch] Darrick J. Wong
2023-12-31 21:01   ` [PATCH 02/21] xfs: hoist extent size helpers to libxfs Darrick J. Wong
2023-12-31 21:01   ` [PATCH 03/21] xfs: hoist inode flag conversion functions " Darrick J. Wong
2023-12-31 21:01   ` [PATCH 04/21] xfs: hoist project id get/set " Darrick J. Wong
2023-12-31 21:02   ` [PATCH 05/21] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
2023-12-31 21:02   ` [PATCH 06/21] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
2023-12-31 21:02   ` [PATCH 07/21] xfs: use xfs_trans_ichgtime to set times when allocating inode Darrick J. Wong
2023-12-31 21:02   ` [PATCH 08/21] xfs: split new inode creation into two pieces Darrick J. Wong
2023-12-31 21:03   ` [PATCH 09/21] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
2023-12-31 21:03   ` [PATCH 10/21] xfs: push xfs_icreate_args creation out of xfs_create* Darrick J. Wong
2023-12-31 21:03   ` [PATCH 11/21] xfs: wrap inode creation dqalloc calls Darrick J. Wong
2023-12-31 21:03   ` [PATCH 12/21] xfs: hoist xfs_iunlink to libxfs Darrick J. Wong
2023-12-31 21:04   ` [PATCH 13/21] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
2023-12-31 21:04   ` [PATCH 14/21] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
2023-12-31 21:04   ` [PATCH 15/21] xfs: create libxfs helper to link an existing " Darrick J. Wong
2023-12-31 21:04   ` [PATCH 16/21] xfs: hoist inode free function to libxfs Darrick J. Wong
2023-12-31 21:05   ` [PATCH 17/21] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
2023-12-31 21:05   ` [PATCH 18/21] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
2023-12-31 21:05   ` [PATCH 19/21] xfs: create libxfs helper to rename " Darrick J. Wong
2023-12-31 21:05   ` [PATCH 20/21] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
2023-12-31 21:06   ` [PATCH 21/21] xfs: get rid of trivial rename helpers Darrick J. Wong
2023-12-31 19:35 ` [PATCHSET v2.0 02/15] xfs: metadata inode directories Darrick J. Wong
2023-12-31 21:06   ` [PATCH 01/32] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
2023-12-31 21:06   ` [PATCH 02/32] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
2023-12-31 21:07   ` [PATCH 03/32] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
2023-12-31 21:07   ` [PATCH 04/32] xfs: refactor the v4 group/project inode pointer switch Darrick J. Wong
2023-12-31 21:07   ` [PATCH 05/32] xfs: convert quota file creation to use imeta methods Darrick J. Wong
2023-12-31 21:07   ` [PATCH 06/32] xfs: iget for metadata inodes Darrick J. Wong
2023-12-31 21:08   ` [PATCH 07/32] xfs: define the on-disk format for the metadir feature Darrick J. Wong
2023-12-31 21:08   ` [PATCH 08/32] xfs: update imeta transaction reservations for metadir Darrick J. Wong
2023-12-31 21:08   ` [PATCH 09/32] xfs: load metadata directory root at mount time Darrick J. Wong
2023-12-31 21:08   ` [PATCH 10/32] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
2023-12-31 21:09   ` [PATCH 11/32] xfs: enforce metadata inode flag Darrick J. Wong
2023-12-31 21:09   ` [PATCH 12/32] xfs: allow deletion of metadata directory files Darrick J. Wong
2023-12-31 21:09   ` [PATCH 13/32] xfs: read and write metadata inode directory Darrick J. Wong
2023-12-31 21:09   ` [PATCH 14/32] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
2023-12-31 21:10   ` [PATCH 15/32] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
2023-12-31 21:10   ` [PATCH 16/32] xfs: hide metadata inodes from everyone because they are special Darrick J. Wong
2023-12-31 21:10   ` [PATCH 17/32] xfs: advertise metadata directory feature Darrick J. Wong
2023-12-31 21:10   ` [PATCH 18/32] xfs: allow bulkstat to return metadata directories Darrick J. Wong
2023-12-31 21:11   ` [PATCH 19/32] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
2023-12-31 21:11   ` [PATCH 20/32] xfs: don't count metadata directory files to quota Darrick J. Wong
2023-12-31 21:11   ` [PATCH 21/32] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
2023-12-31 21:11   ` [PATCH 22/32] xfs: record health problems with the metadata directory Darrick J. Wong
2023-12-31 21:12   ` [PATCH 23/32] xfs: do not count metadata directory files when doing online quotacheck Darrick J. Wong
2023-12-31 21:12   ` [PATCH 24/32] xfs: scrub shouldn't tag metadata files with attr forks if metadir and parent pointers are on Darrick J. Wong
2023-12-31 21:12   ` [PATCH 25/32] xfs: scrub metadata directories Darrick J. Wong
2023-12-31 21:13   ` [PATCH 26/32] xfs: teach nlink scrubber to deal with metadata directory roots Darrick J. Wong
2023-12-31 21:13   ` [PATCH 27/32] xfs: don't check secondary super inode pointers when metadir enabled Darrick J. Wong
2023-12-31 21:13   ` [PATCH 28/32] xfs: move repair temporary files to the metadata directory tree Darrick J. Wong
2023-12-31 21:13   ` [PATCH 29/32] xfs: check metadata directory file path connectivity Darrick J. Wong
2023-12-31 21:14   ` [PATCH 30/32] xfs: repair " Darrick J. Wong
2023-12-31 21:14   ` [PATCH 31/32] xfs: fix up repair functions for metadata directory roots Darrick J. Wong
2023-12-31 21:14   ` [PATCH 32/32] xfs: enable metadata directory feature Darrick J. Wong
2023-12-31 19:35 ` [PATCHSET v2.0 03/15] xfs: refactor realtime meta inode locking Darrick J. Wong
2023-12-31 21:14   ` [PATCH 1/4] xfs: refactor realtime scrubbing context management Darrick J. Wong
2023-12-31 21:15   ` [PATCH 2/4] xfs: use separate lock classes for realtime metadata inode ILOCKs Darrick J. Wong
2023-12-31 21:15   ` [PATCH 3/4] xfs: refactor realtime inode locking Darrick J. Wong
2023-12-31 21:15   ` [PATCH 4/4] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
2023-12-31 19:35 ` [PATCHSET v2.0 04/15] xfsprogs: shard the realtime section Darrick J. Wong
2023-12-31 21:15   ` [PATCH 01/24] xfs: create incore realtime group structures Darrick J. Wong
2023-12-31 21:16   ` [PATCH 02/24] xfs: define the format of rt groups Darrick J. Wong
2023-12-31 21:16   ` [PATCH 03/24] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
2023-12-31 21:16   ` [PATCH 04/24] xfs: check the realtime superblock at mount time Darrick J. Wong
2023-12-31 21:16   ` [PATCH 05/24] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
2023-12-31 21:17   ` [PATCH 06/24] xfs: write secondary realtime superblocks to disk Darrick J. Wong
2023-12-31 21:17   ` [PATCH 07/24] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
2023-12-31 21:17   ` [PATCH 08/24] xfs: always update secondary rt supers when we update secondary fs supers Darrick J. Wong
2023-12-31 21:17   ` [PATCH 09/24] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
2023-12-31 21:18   ` [PATCH 10/24] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
2023-12-31 21:18   ` [PATCH 11/24] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
2023-12-31 21:18   ` [PATCH 12/24] xfs: record rt group superblock errors in the health system Darrick J. Wong
2023-12-31 21:19   ` [PATCH 13/24] xfs: define locking primitives for realtime groups Darrick J. Wong
2023-12-31 21:19   ` [PATCH 14/24] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
2023-12-31 21:19   ` [PATCH 15/24] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
2023-12-31 21:19   ` [PATCH 16/24] xfs: encode the rtbitmap in little endian format Darrick J. Wong
2023-12-31 21:20   ` [PATCH 17/24] xfs: add block headers to realtime summary blocks Darrick J. Wong
2023-12-31 21:20   ` [PATCH 18/24] xfs: encode the rtsummary in big endian format Darrick J. Wong
2023-12-31 21:20   ` [PATCH 19/24] xfs: store rtgroup information with a bmap intent Darrick J. Wong
2023-12-31 21:20   ` [PATCH 20/24] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
2023-12-31 21:21   ` [PATCH 21/24] xfs: scrub the realtime group superblock Darrick J. Wong
2023-12-31 21:21   ` [PATCH 22/24] xfs: repair secondary realtime group superblocks Darrick J. Wong
2023-12-31 21:21   ` [PATCH 23/24] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
2023-12-31 21:21   ` [PATCH 24/24] xfs: enable realtime group feature Darrick J. Wong
2023-12-31 19:35 ` [PATCHSET v2.0 05/15] xfs: enable FITRIM for the realtime section Darrick J. Wong
2023-12-31 21:22   ` [PATCH 1/1] xfs: enable FITRIM on the realtime device Darrick J. Wong
2023-12-31 19:36 ` [PATCHSET v2.0 06/15] xfs: refactor btrees to support records in inode root Darrick J. Wong
2023-12-31 21:22   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
2023-12-31 21:22   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
2023-12-31 21:22   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
2023-12-31 21:23   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
2023-12-31 21:23   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
2023-12-31 21:23   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
2023-12-31 21:23   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
2023-12-31 21:24   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
2023-12-31 21:24   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
2023-12-31 21:24   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
2023-12-31 21:24   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
2023-12-31 21:25   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
2023-12-31 21:25   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
2023-12-31 21:25   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
2023-12-31 19:36 ` [PATCHSET v2.0 07/15] xfs: enable in-core block reservation for rt metadata Darrick J. Wong
2023-12-31 21:26   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
2023-12-31 21:26   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
2023-12-31 19:36 ` [PATCHSET v2.0 08/15] xfs: extent free log intent cleanups Darrick J. Wong
2023-12-31 21:26   ` [PATCH 1/9] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
2023-12-31 21:26   ` [PATCH 2/9] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
2023-12-31 21:27   ` [PATCH 3/9] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
2023-12-31 21:27   ` [PATCH 4/9] xfs: add a xefi_entry helper Darrick J. Wong
2023-12-31 21:27   ` [PATCH 5/9] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
2023-12-31 21:27   ` [PATCH 6/9] xfs: factor out a xfs_efd_add_extent helper Darrick J. Wong
2023-12-31 21:28   ` [PATCH 7/9] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
2023-12-31 21:28   ` [PATCH 8/9] xfs: remove xfs_defer_agfl_block Darrick J. Wong
2023-12-31 21:28   ` [PATCH 9/9] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
2023-12-31 19:36 ` [PATCHSET v2.0 09/15] xfs: widen EFI format to support rt Darrick J. Wong
2023-12-31 21:28   ` [PATCH 1/2] xfs: support logging EFIs for realtime extents Darrick J. Wong
2023-12-31 21:29   ` [PATCH 2/2] xfs: support error injection when freeing rt extents Darrick J. Wong
2023-12-31 19:37 ` [PATCHSET v2.0 10/15] xfs: rmap log intent cleanups Darrick J. Wong
2023-12-31 21:29   ` [PATCH 01/10] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
2023-12-31 21:29   ` [PATCH 02/10] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
2023-12-31 21:29   ` [PATCH 03/10] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
2023-12-31 21:30   ` [PATCH 04/10] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
2023-12-31 21:30   ` [PATCH 05/10] xfs: remove xfs_trans_set_rmap_flags Darrick J. Wong
2023-12-31 21:30   ` [PATCH 06/10] xfs: add a ri_entry helper Darrick J. Wong
2023-12-31 21:31   ` [PATCH 07/10] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
2023-12-31 21:31   ` [PATCH 08/10] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
2023-12-31 21:31   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
2023-12-31 21:31   ` [PATCH 10/10] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
2023-12-31 19:37 ` [PATCHSET v2.0 11/15] xfs: realtime reverse-mapping support Darrick J. Wong
2023-12-31 21:32   ` [PATCH 01/39] xfs: prepare rmap btree cursor tracepoints for realtime Darrick J. Wong
2023-12-31 21:32   ` [PATCH 02/39] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
2023-12-31 21:32   ` [PATCH 03/39] xfs: introduce realtime rmap btree definitions Darrick J. Wong
2023-12-31 21:32   ` [PATCH 04/39] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
2023-12-31 21:33   ` [PATCH 05/39] xfs: realtime rmap btree transaction reservations Darrick J. Wong
2023-12-31 21:33   ` [PATCH 06/39] xfs: add realtime rmap btree operations Darrick J. Wong
2023-12-31 21:33   ` [PATCH 07/39] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
2023-12-31 21:33   ` [PATCH 08/39] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
2023-12-31 21:34   ` [PATCH 09/39] xfs: support recovering rmap intent items targetting realtime extents Darrick J. Wong
2023-12-31 21:34   ` [PATCH 10/39] xfs: add realtime rmap btree block detection to log recovery Darrick J. Wong
2023-12-31 21:34   ` [PATCH 11/39] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
2023-12-31 21:34   ` [PATCH 12/39] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
2023-12-31 21:35   ` [PATCH 13/39] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
2023-12-31 21:35   ` [PATCH 14/39] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
2023-12-31 21:35   ` [PATCH 15/39] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
2023-12-31 21:35   ` [PATCH 16/39] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
2023-12-31 21:36   ` [PATCH 17/39] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
2023-12-31 21:36   ` [PATCH 18/39] xfs: rearrange xfs_fsmap.c a little bit Darrick J. Wong
2023-12-31 21:36   ` [PATCH 19/39] xfs: wire up getfsmap to the realtime reverse mapping btree Darrick J. Wong
2023-12-31 21:37   ` [PATCH 20/39] xfs: check that the rtrmapbt maxlevels doesn't increase when growing fs Darrick J. Wong
2023-12-31 21:37   ` [PATCH 21/39] xfs: add realtime rmap btree when adding rt volume Darrick J. Wong
2023-12-31 21:37   ` [PATCH 22/39] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
2023-12-31 21:37   ` [PATCH 23/39] xfs: fix scrub tracepoints when inode-rooted btrees are involved Darrick J. Wong
2023-12-31 21:38   ` [PATCH 24/39] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
2023-12-31 21:38   ` [PATCH 25/39] xfs: scrub the realtime rmapbt Darrick J. Wong
2023-12-31 21:38   ` [PATCH 26/39] xfs: cross-reference realtime bitmap to realtime rmapbt scrubber Darrick J. Wong
2023-12-31 21:38   ` [PATCH 27/39] xfs: cross-reference the realtime rmapbt Darrick J. Wong
2023-12-31 21:39   ` [PATCH 28/39] xfs: scan rt rmap when we're doing an intense rmap check of bmbt mappings Darrick J. Wong
2023-12-31 21:39   ` [PATCH 29/39] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
2023-12-31 21:39   ` [PATCH 30/39] xfs: walk the rt reverse mapping tree when rebuilding rmap Darrick J. Wong
2023-12-31 21:39   ` [PATCH 31/39] xfs: online repair of realtime file bmaps Darrick J. Wong
2023-12-31 21:40   ` [PATCH 32/39] xfs: repair inodes that have realtime extents Darrick J. Wong
2023-12-31 21:40   ` [PATCH 33/39] xfs: repair rmap btree inodes Darrick J. Wong
2023-12-31 21:40   ` [PATCH 34/39] xfs: online repair of realtime bitmaps for a realtime group Darrick J. Wong
2023-12-31 21:40   ` [PATCH 35/39] xfs: support repairing metadata btrees rooted in metadir inodes Darrick J. Wong
2023-12-31 21:41   ` [PATCH 36/39] xfs: online repair of the realtime rmap btree Darrick J. Wong
2023-12-31 21:41   ` [PATCH 37/39] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
2023-12-31 21:41   ` [PATCH 38/39] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
2023-12-31 21:41   ` [PATCH 39/39] xfs: enable realtime rmap btree Darrick J. Wong
2023-12-31 19:37 ` [PATCHSET v2.0 12/15] xfs: refcount log intent cleanups Darrick J. Wong
2023-12-31 21:42   ` [PATCH 01/10] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
2023-12-31 21:42   ` [PATCH 02/10] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
2023-12-31 21:42   ` [PATCH 03/10] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
2023-12-31 21:43   ` [PATCH 04/10] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
2023-12-31 21:43   ` [PATCH 05/10] xfs: remove xfs_trans_set_refcount_flags Darrick J. Wong
2023-12-31 21:43   ` [PATCH 06/10] xfs: add a ci_entry helper Darrick J. Wong
2023-12-31 21:43   ` [PATCH 07/10] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
2023-12-31 21:44   ` [PATCH 08/10] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
2023-12-31 21:44   ` [PATCH 09/10] xfs: simplify usage of the rcur local variable " Darrick J. Wong
2023-12-31 21:44   ` [PATCH 10/10] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
2023-12-31 19:37 ` [PATCHSET v2.0 13/15] xfs: reflink on the realtime device Darrick J. Wong
2023-12-31 21:44   ` [PATCH 01/44] xfs: prepare refcount btree cursor tracepoints for realtime Darrick J. Wong
2023-12-31 21:45   ` [PATCH 02/44] xfs: introduce realtime refcount btree definitions Darrick J. Wong
2023-12-31 21:45   ` [PATCH 03/44] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
2023-12-31 21:45   ` [PATCH 04/44] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
2023-12-31 21:45   ` [PATCH 05/44] xfs: realtime refcount btree transaction reservations Darrick J. Wong
2023-12-31 21:46   ` [PATCH 06/44] xfs: add realtime refcount btree operations Darrick J. Wong
2023-12-31 21:46   ` [PATCH 07/44] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
2023-12-31 21:46   ` [PATCH 08/44] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
2023-12-31 21:46   ` [PATCH 09/44] xfs: support recovering refcount intent items targetting realtime extents Darrick J. Wong
2023-12-31 21:47   ` [PATCH 10/44] xfs: add realtime refcount btree block detection to log recovery Darrick J. Wong
2023-12-31 21:47   ` [PATCH 11/44] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
2023-12-31 21:47   ` [PATCH 12/44] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
2023-12-31 21:47   ` [PATCH 13/44] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
2023-12-31 21:48   ` [PATCH 14/44] xfs: wire up realtime refcount btree cursors Darrick J. Wong
2023-12-31 21:48   ` [PATCH 15/44] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
2023-12-31 21:48   ` [PATCH 16/44] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
2023-12-31 21:49   ` [PATCH 17/44] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
2023-12-31 21:49   ` [PATCH 18/44] xfs: refactor reflink quota updates Darrick J. Wong
2023-12-31 21:49   ` [PATCH 19/44] xfs: enable CoW for realtime data Darrick J. Wong
2023-12-31 21:49   ` [PATCH 20/44] xfs: enable sharing of realtime file blocks Darrick J. Wong
2023-12-31 21:50   ` [PATCH 21/44] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
2023-12-31 21:50   ` [PATCH 22/44] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
2023-12-31 21:50   ` [PATCH 23/44] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
2023-12-31 21:50   ` [PATCH 24/44] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
2023-12-31 21:51   ` [PATCH 25/44] xfs: enable extent size hints for CoW operations Darrick J. Wong
2023-12-31 21:51   ` [PATCH 26/44] xfs: check that the rtrefcount maxlevels doesn't increase when growing fs Darrick J. Wong
2023-12-31 21:51   ` [PATCH 27/44] xfs: add realtime refcount btree when adding rt volume Darrick J. Wong
2023-12-31 21:51   ` [PATCH 28/44] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
2023-12-31 21:52   ` [PATCH 29/44] xfs: scrub the realtime refcount btree Darrick J. Wong
2023-12-31 21:52   ` [PATCH 30/44] xfs: cross-reference checks with the rt " Darrick J. Wong
2023-12-31 21:52   ` [PATCH 31/44] xfs: allow overlapping rtrmapbt records for shared data extents Darrick J. Wong
2023-12-31 21:52   ` [PATCH 32/44] xfs: check reference counts of gaps between rt refcount records Darrick J. Wong
2023-12-31 21:53   ` [PATCH 33/44] xfs: allow dquot rt block count to exceed rt blocks on reflink fs Darrick J. Wong
2023-12-31 21:53   ` [PATCH 34/44] xfs: detect and repair misaligned rtinherit directory cowextsize hints Darrick J. Wong
2023-12-31 21:53   ` [PATCH 35/44] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
2023-12-31 21:53   ` [PATCH 36/44] xfs: don't flag quota rt block usage on rtreflink filesystems Darrick J. Wong
2023-12-31 21:54   ` [PATCH 37/44] xfs: check new rtbitmap records against rt refcount btree Darrick J. Wong
2023-12-31 21:54   ` [PATCH 38/44] xfs: walk the rt reference count tree when rebuilding rmap Darrick J. Wong
2023-12-31 21:54   ` [PATCH 39/44] xfs: capture realtime CoW staging extents when rebuilding rt rmapbt Darrick J. Wong
2023-12-31 21:54   ` [PATCH 40/44] xfs: online repair of the realtime refcount btree Darrick J. Wong
2023-12-31 21:55   ` [PATCH 41/44] xfs: repair inodes that have a refcount btree in the data fork Darrick J. Wong
2023-12-31 21:55   ` [PATCH 42/44] xfs: check for shared rt extents when rebuilding rt file's " Darrick J. Wong
2023-12-31 21:55   ` [PATCH 43/44] xfs: fix CoW forks for realtime files Darrick J. Wong
2023-12-31 21:56   ` [PATCH 44/44] xfs: enable realtime reflink Darrick J. Wong
2023-12-31 19:38 ` [PATCHSET v2.0 14/15] xfs: reflink with large realtime extents Darrick J. Wong
2023-12-31 21:56   ` [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function Darrick J. Wong
2023-12-31 21:56   ` [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1 block Darrick J. Wong
2023-12-31 21:56   ` [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt extent before sharing Darrick J. Wong
2023-12-31 21:57   ` [PATCH 4/9] xfs: add some tracepoints for writeback Darrick J. Wong
2023-12-31 21:57   ` [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly Darrick J. Wong
2023-12-31 21:57   ` [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
2023-12-31 21:57   ` [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block Darrick J. Wong
2023-12-31 21:58   ` [PATCH 8/9] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
2023-12-31 21:58   ` [PATCH 9/9] xfs: support realtime reflink with an extent size that isn't a power of 2 Darrick J. Wong
2023-12-31 19:38 ` [PATCHSET v2.0 15/15] xfs: enable quota for realtime voluems Darrick J. Wong
2023-12-31 21:58   ` [PATCH 1/6] xfs: attach dquots to rt metadata files when starting quota Darrick J. Wong
2023-12-31 21:58   ` [PATCH 2/6] xfs: fix chown with rt quota Darrick J. Wong
2023-12-31 21:59   ` [PATCH 3/6] xfs: fix rt growfs quota accounting Darrick J. Wong
2023-12-31 21:59   ` [PATCH 4/6] xfs: advertise realtime quota support in the xqm stat files Darrick J. Wong
2023-12-31 21:59   ` [PATCH 5/6] xfs: report realtime block quota limits on realtime directories Darrick J. Wong
2023-12-31 21:59   ` [PATCH 6/6] xfs: enable realtime quota again Darrick J. Wong
2023-12-31 19:51 ` [PATCHSET v2.0 01/17] xfsprogs: hoist inode operations to libxfs Darrick J. Wong
2023-12-31 23:22   ` [PATCH 01/28] xfs: hoist extent size helpers " Darrick J. Wong
2023-12-31 23:22   ` [PATCH 02/28] xfs: hoist inode flag conversion functions " Darrick J. Wong
2023-12-31 23:23   ` [PATCH 03/28] xfs: hoist project id get/set " Darrick J. Wong
2023-12-31 23:23   ` [PATCH 04/28] libxfs: put all the inode functions in a single file Darrick J. Wong
2023-12-31 23:23   ` [PATCH 05/28] libxfs: pass IGET flags through to xfs_iread Darrick J. Wong
2023-12-31 23:23   ` [PATCH 06/28] xfs: pack icreate initialization parameters into a separate structure Darrick J. Wong
2023-12-31 23:24   ` [PATCH 07/28] xfs: implement atime updates in xfs_trans_ichgtime Darrick J. Wong
2023-12-31 23:24   ` [PATCH 08/28] libxfs: rearrange libxfs_trans_ichgtime call when creating inodes Darrick J. Wong
2023-12-31 23:24   ` [PATCH 09/28] libxfs: set access time when creating files Darrick J. Wong
2023-12-31 23:24   ` [PATCH 10/28] libxfs: when creating a file in a directory, set the project id based on the parent Darrick J. Wong
2023-12-31 23:25   ` [PATCH 11/28] libxfs: pass flags2 from parent to child when creating files Darrick J. Wong
2023-12-31 23:25   ` [PATCH 12/28] xfs: split new inode creation into two pieces Darrick J. Wong
2023-12-31 23:25   ` [PATCH 13/28] libxfs: backport inode init code from the kernel Darrick J. Wong
2023-12-31 23:25   ` [PATCH 14/28] libxfs: remove libxfs_dir_ialloc Darrick J. Wong
2023-12-31 23:26   ` [PATCH 15/28] libxfs: implement get_random_u32 Darrick J. Wong
2023-12-31 23:26   ` [PATCH 16/28] xfs: hoist new inode initialization functions to libxfs Darrick J. Wong
2023-12-31 23:26   ` [PATCH 17/28] xfs: hoist xfs_iunlink " Darrick J. Wong
2023-12-31 23:26   ` [PATCH 18/28] xfs: hoist xfs_{bump,drop}link " Darrick J. Wong
2023-12-31 23:27   ` [PATCH 19/28] xfs: create libxfs helper to link a new inode into a directory Darrick J. Wong
2023-12-31 23:27   ` [PATCH 20/28] xfs: create libxfs helper to link an existing " Darrick J. Wong
2023-12-31 23:27   ` [PATCH 21/28] xfs: hoist inode free function to libxfs Darrick J. Wong
2023-12-31 23:28   ` [PATCH 22/28] xfs: create libxfs helper to remove an existing inode/name from a directory Darrick J. Wong
2023-12-31 23:28   ` [PATCH 23/28] xfs: create libxfs helper to exchange two directory entries Darrick J. Wong
2023-12-31 23:28   ` [PATCH 24/28] xfs: create libxfs helper to rename " Darrick J. Wong
2023-12-31 23:28   ` [PATCH 25/28] xfs: move dirent update hooks to xfs_dir2.c Darrick J. Wong
2023-12-31 23:29   ` [PATCH 26/28] xfs_db: port the iunlink command to use the libxfs iunlink function Darrick J. Wong
2023-12-31 23:29   ` [PATCH 27/28] xfs_repair: use library functions to reset root/rbm/rsum inodes Darrick J. Wong
2023-12-31 23:29   ` [PATCH 28/28] xfs_repair: use library functions for orphanage creation Darrick J. Wong
2023-12-31 19:52 ` [PATCHSET v2.0 02/17] xfsprogs: metadata inode directories Darrick J. Wong
2023-12-31 23:29   ` [PATCH 01/58] xfs: don't use the incore struct xfs_sb for offsets into struct xfs_dsb Darrick J. Wong
2023-12-31 23:30   ` [PATCH 02/58] xfs: create imeta abstractions to get and set metadata inodes Darrick J. Wong
2023-12-31 23:30   ` [PATCH 03/58] xfs: create transaction reservations for metadata inode operations Darrick J. Wong
2023-12-31 23:30   ` [PATCH 04/58] mkfs: clean up the rtinit() function Darrick J. Wong
2023-12-31 23:30   ` [PATCH 05/58] libxfs: convert all users to libxfs_imeta_create Darrick J. Wong
2023-12-31 23:31   ` [PATCH 06/58] mkfs: break up the rest of the rtinit() function Darrick J. Wong
2023-12-31 23:31   ` [PATCH 07/58] xfs: iget for metadata inodes Darrick J. Wong
2023-12-31 23:31   ` [PATCH 08/58] xfs: define the on-disk format for the metadir feature Darrick J. Wong
2023-12-31 23:31   ` [PATCH 09/58] xfs: update imeta transaction reservations for metadir Darrick J. Wong
2023-12-31 23:32   ` [PATCH 10/58] xfs: load metadata directory root at mount time Darrick J. Wong
2023-12-31 23:32   ` [PATCH 11/58] xfs: convert metadata inode lookup keys to use paths Darrick J. Wong
2023-12-31 23:32   ` [PATCH 12/58] xfs: enforce metadata inode flag Darrick J. Wong
2023-12-31 23:32   ` [PATCH 13/58] xfs: allow deletion of metadata directory files Darrick J. Wong
2023-12-31 23:33   ` [PATCH 14/58] xfs: read and write metadata inode directory Darrick J. Wong
2023-12-31 23:33   ` [PATCH 15/58] xfs: ensure metadata directory paths exist before creating files Darrick J. Wong
2023-12-31 23:33   ` [PATCH 16/58] xfs: disable the agi rotor for metadata inodes Darrick J. Wong
2023-12-31 23:34   ` [PATCH 17/58] xfs: advertise metadata directory feature Darrick J. Wong
2023-12-31 23:34   ` [PATCH 18/58] xfs: allow bulkstat to return metadata directories Darrick J. Wong
2023-12-31 23:34   ` [PATCH 19/58] xfs: enable creation of dynamically allocated metadir path structures Darrick J. Wong
2023-12-31 23:34   ` [PATCH 20/58] xfs: adjust xfs_bmap_add_attrfork for metadir Darrick J. Wong
2023-12-31 23:35   ` [PATCH 21/58] xfs: record health problems with the metadata directory Darrick J. Wong
2023-12-31 23:35   ` [PATCH 22/58] xfs: check metadata directory file path connectivity Darrick J. Wong
2023-12-31 23:35   ` [PATCH 23/58] libfrog: report metadata directories in the geometry report Darrick J. Wong
2023-12-31 23:35   ` [PATCH 24/58] libfrog: allow METADIR in xfrog_bulkstat_single5 Darrick J. Wong
2023-12-31 23:36   ` [PATCH 25/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
2023-12-31 23:36   ` [PATCH 26/58] xfs_db: basic xfs_check support for metadir Darrick J. Wong
2023-12-31 23:36   ` [PATCH 27/58] xfs_db: report metadir support for version command Darrick J. Wong
2023-12-31 23:36   ` [PATCH 28/58] xfs_db: don't obfuscate metadata directories and attributes Darrick J. Wong
2023-12-31 23:37   ` [PATCH 29/58] xfs_db: support metadata directories in the path command Darrick J. Wong
2023-12-31 23:37   ` [PATCH 30/58] xfs_db: mask superblock fields when metadir feature is enabled Darrick J. Wong
2023-12-31 23:37   ` [PATCH 31/58] xfs_io: support the bulkstat metadata directory flag Darrick J. Wong
2023-12-31 23:37   ` [PATCH 32/58] xfs_io: support scrubbing metadata directory paths Darrick J. Wong
2023-12-31 23:38   ` [PATCH 33/58] xfs_spaceman: report health of metadir inodes too Darrick J. Wong
2023-12-31 23:38   ` [PATCH 34/58] xfs_scrub: scan metadata directories during phase 3 Darrick J. Wong
2023-12-31 23:38   ` [PATCH 35/58] xfs_scrub: re-run metafile scrubbers during phase 5 Darrick J. Wong
2023-12-31 23:38   ` [PATCH 36/58] xfs_repair: don't zero the incore secondary super when zeroing Darrick J. Wong
2023-12-31 23:39   ` [PATCH 37/58] xfs_repair: refactor metadata inode tagging Darrick J. Wong
2023-12-31 23:39   ` [PATCH 38/58] xfs_repair: reject regular directory dirents that point to metadata fieles Darrick J. Wong
2023-12-31 23:39   ` [PATCH 39/58] xfs_repair: dont check metadata directory dirent inumbers Darrick J. Wong
2023-12-31 23:40   ` [PATCH 40/58] xfs_repair: refactor fixing dotdot Darrick J. Wong
2023-12-31 23:40   ` [PATCH 41/58] xfs_repair: refactor marking of metadata inodes Darrick J. Wong
2023-12-31 23:40   ` [PATCH 42/58] xfs_repair: refactor root directory initialization Darrick J. Wong
2023-12-31 23:40   ` [PATCH 43/58] xfs_repair: refactor grabbing realtime metadata inodes Darrick J. Wong
2023-12-31 23:41   ` [PATCH 44/58] xfs_repair: check metadata inode flag Darrick J. Wong
2023-12-31 23:41   ` [PATCH 45/58] xfs_repair: rebuild the metadata directory Darrick J. Wong
2023-12-31 23:41   ` [PATCH 46/58] xfs_repair: don't let metadata and regular files mix Darrick J. Wong
2023-12-31 23:41   ` [PATCH 47/58] xfs_repair: update incore metadata state whenever we create new files Darrick J. Wong
2023-12-31 23:42   ` [PATCH 48/58] xfs_repair: pass private data pointer to scan_lbtree Darrick J. Wong
2023-12-31 23:42   ` [PATCH 49/58] xfs_repair: mark space used by metadata files Darrick J. Wong
2023-12-31 23:42   ` [PATCH 50/58] xfs_repair: adjust keep_fsinos to handle metadata directories Darrick J. Wong
2023-12-31 23:42   ` [PATCH 51/58] xfs_repair: metadata dirs are never plausible root dirs Darrick J. Wong
2023-12-31 23:43   ` [PATCH 52/58] xfs_repair: reattach quota inodes to metadata directory Darrick J. Wong
2023-12-31 23:43   ` [PATCH 53/58] xfs_repair: drop all the metadata directory files during pass 4 Darrick J. Wong
2023-12-31 23:43   ` [PATCH 54/58] xfs_repair: truncate and unmark orphaned metadata inodes Darrick J. Wong
2023-12-31 23:43   ` [PATCH 55/58] xfs_repair: do not count metadata directory files when doing quotacheck Darrick J. Wong
2023-12-31 23:44   ` [PATCH 56/58] xfs_repair: allow sysadmins to add metadata directories Darrick J. Wong
2023-12-31 23:44   ` [PATCH 57/58] mkfs.xfs: enable " Darrick J. Wong
2023-12-31 23:44   ` [PATCH 58/58] mkfs: add a utility to generate protofiles Darrick J. Wong
2023-12-31 19:52 ` [PATCHSET v2.0 03/17] xfs_db: debug realtime geometry Darrick J. Wong
2023-12-31 23:44   ` [PATCH 1/8] xfs_db: support passing the realtime device to the debugger Darrick J. Wong
2023-12-31 23:45   ` [PATCH 2/8] xfs_db: report the realtime device when associated with each io cursor Darrick J. Wong
2023-12-31 23:45   ` [PATCH 3/8] xfs_db: make the daddr command target the realtime device Darrick J. Wong
2023-12-31 23:45   ` [PATCH 4/8] xfs_db: access realtime file blocks Darrick J. Wong
2023-12-31 23:46   ` [PATCH 5/8] xfs_db: access arbitrary realtime blocks and extents Darrick J. Wong
2023-12-31 23:46   ` [PATCH 6/8] xfs_db: enable conversion of rt space units Darrick J. Wong
2023-12-31 23:46   ` [PATCH 7/8] xfs_db: convert rtbitmap geometry Darrick J. Wong
2023-12-31 23:46   ` [PATCH 8/8] xfs_db: convert rtsummary geometry Darrick J. Wong
2023-12-31 19:52 ` [PATCHSET v2.0 04/17] xfs_metadump: support external devices Darrick J. Wong
2023-12-31 23:47   ` [PATCH 1/1] xfs_db: allow setting current address to log blocks Darrick J. Wong
2023-12-31 19:52 ` [PATCHSET v2.0 05/17] xfsprogs: refactor realtime meta inode locking Darrick J. Wong
2023-12-31 23:47   ` [PATCH 1/2] xfs: refactor realtime " Darrick J. Wong
2023-12-31 23:47   ` [PATCH 2/2] xfs: remove XFS_ILOCK_RT* Darrick J. Wong
2023-12-31 19:53 ` [PATCHSET v2.0 06/17] xfsprogs: shard the realtime section Darrick J. Wong
2023-12-27 13:00   ` [PATCH 48/52] xfs_scrub: trim realtime volumes too Darrick J. Wong
2023-12-27 13:00   ` [PATCH 49/52] xfs_scrub: use histograms to speed up phase 8 on the realtime volume Darrick J. Wong
2023-12-27 13:00   ` [PATCH 50/52] mkfs: add headers to realtime bitmap blocks Darrick J. Wong
2023-12-27 13:00   ` [PATCH 51/52] mkfs: add headers to realtime summary blocks Darrick J. Wong
2023-12-27 13:01   ` [PATCH 52/52] mkfs: format realtime groups Darrick J. Wong
2023-12-31 23:47   ` [PATCH 01/52] xfs: create incore realtime group structures Darrick J. Wong
2023-12-31 23:48   ` [PATCH 02/52] xfs: define the format of rt groups Darrick J. Wong
2023-12-31 23:48   ` [PATCH 03/52] xfs: reduce rt summary file levels for rtgroups filesystems Darrick J. Wong
2023-12-31 23:48   ` [PATCH 04/52] xfs: update primary realtime super every time we update the primary fs super Darrick J. Wong
2023-12-31 23:48   ` [PATCH 05/52] xfs: write secondary realtime superblocks to disk Darrick J. Wong
2023-12-31 23:49   ` [PATCH 06/52] xfs: grow the realtime section when realtime groups are enabled Darrick J. Wong
2023-12-31 23:49   ` [PATCH 07/52] xfs: export realtime group geometry via XFS_FSOP_GEOM Darrick J. Wong
2023-12-31 23:49   ` [PATCH 08/52] xfs: check that rtblock extents do not overlap with the rt group metadata Darrick J. Wong
2023-12-31 23:49   ` [PATCH 09/52] xfs: add frextents to the lazysbcounters when rtgroups enabled Darrick J. Wong
2023-12-31 23:50   ` [PATCH 10/52] xfs: record rt group superblock errors in the health system Darrick J. Wong
2023-12-31 23:50   ` [PATCH 11/52] xfs: define locking primitives for realtime groups Darrick J. Wong
2023-12-31 23:50   ` [PATCH 12/52] xfs: export the geometry of realtime groups to userspace Darrick J. Wong
2023-12-31 23:50   ` [PATCH 13/52] xfs: add block headers to realtime bitmap blocks Darrick J. Wong
2023-12-31 23:51   ` [PATCH 14/52] xfs: encode the rtbitmap in little endian format Darrick J. Wong
2023-12-31 23:51   ` [PATCH 15/52] xfs: add block headers to realtime summary blocks Darrick J. Wong
2023-12-31 23:51   ` [PATCH 16/52] xfs: encode the rtsummary in big endian format Darrick J. Wong
2023-12-31 23:51   ` [PATCH 17/52] xfs: store rtgroup information with a bmap intent Darrick J. Wong
2023-12-31 23:52   ` [PATCH 18/52] xfs: use an incore rtgroup rotor for rtpick Darrick J. Wong
2023-12-31 23:52   ` [PATCH 19/52] xfs: scrub the realtime group superblock Darrick J. Wong
2023-12-31 23:52   ` [PATCH 20/52] xfs: repair secondary realtime group superblocks Darrick J. Wong
2023-12-31 23:53   ` [PATCH 21/52] xfs: scrub each rtgroup's portion of the rtbitmap separately Darrick J. Wong
2023-12-31 23:53   ` [PATCH 22/52] libfrog: report rt groups in output Darrick J. Wong
2023-12-31 23:53   ` [PATCH 23/52] libxfs: implement some sanity checking for enormous rgcount Darrick J. Wong
2023-12-31 23:53   ` [PATCH 24/52] xfs_repair: support realtime groups Darrick J. Wong
2023-12-31 23:54   ` [PATCH 25/52] xfs_repair: improve rtbitmap discrepancy reporting Darrick J. Wong
2023-12-31 23:54   ` [PATCH 26/52] xfs_repair: repair rtbitmap block headers Darrick J. Wong
2023-12-31 23:54   ` [PATCH 27/52] xfs_repair: repair rtsummary " Darrick J. Wong
2023-12-31 23:54   ` [PATCH 28/52] xfs_repair: support adding rtgroups to a filesystem Darrick J. Wong
2023-12-31 23:55   ` [PATCH 29/52] xfs_db: listify the definition of enum typnm Darrick J. Wong
2023-12-31 23:55   ` [PATCH 30/52] xfs_db: support dumping realtime superblocks Darrick J. Wong
2023-12-31 23:55   ` [PATCH 31/52] xfs_db: support changing the label and uuid of rt superblocks Darrick J. Wong
2023-12-31 23:55   ` [PATCH 32/52] xfs_db: listify the definition of dbm_t Darrick J. Wong
2023-12-31 23:56   ` [PATCH 33/52] xfs_db: implement check for rt superblocks Darrick J. Wong
2023-12-31 23:56   ` [PATCH 34/52] xfs_db: enable conversion of rt space units Darrick J. Wong
2023-12-31 23:56   ` [PATCH 35/52] xfs_db: report rtgroups via version command Darrick J. Wong
2023-12-31 23:56   ` [PATCH 36/52] xfs_db: metadump realtime devices Darrick J. Wong
2023-12-31 23:57   ` [PATCH 37/52] xfs_db: dump rt bitmap blocks Darrick J. Wong
2023-12-31 23:57   ` [PATCH 38/52] xfs_db: dump rt summary blocks Darrick J. Wong
2023-12-31 23:57   ` [PATCH 39/52] xfs_mdrestore: restore rt group superblocks to realtime device Darrick J. Wong
2023-12-31 23:57   ` [PATCH 40/52] xfs_io: support scrubbing rtgroup metadata Darrick J. Wong
2023-12-31 23:58   ` [PATCH 41/52] xfs_io: add a command to display allocation group information Darrick J. Wong
2023-12-31 23:58   ` [PATCH 42/52] xfs_io: add a command to display realtime " Darrick J. Wong
2023-12-31 23:58   ` [PATCH 43/52] xfs_io: display rt group in verbose bmap output Darrick J. Wong
2023-12-31 23:59   ` [PATCH 44/52] xfs_io: display rt group in verbose fsmap output Darrick J. Wong
2023-12-31 23:59   ` [PATCH 45/52] xfs_spaceman: report on realtime group health Darrick J. Wong
2023-12-31 23:59   ` [PATCH 46/52] xfs_scrub: scrub realtime allocation group metadata Darrick J. Wong
2023-12-31 23:59   ` [PATCH 47/52] xfs_scrub: call GETFSMAP for each rt group in parallel Darrick J. Wong
2023-12-31 19:53 ` [PATCHSET v2.0 07/17] xfsprogs: refactor btrees to support records in inode root Darrick J. Wong
2023-12-27 13:01   ` [PATCH 01/14] xfs: replace shouty XFS_BM{BT,DR} macros Darrick J. Wong
2023-12-27 13:01   ` [PATCH 02/14] xfs: refactor the allocation and freeing of incore inode fork btree roots Darrick J. Wong
2023-12-27 13:01   ` [PATCH 03/14] xfs: refactor creation of bmap " Darrick J. Wong
2023-12-27 13:02   ` [PATCH 04/14] xfs: fix a sloppy memory handling bug in xfs_iroot_realloc Darrick J. Wong
2023-12-27 13:02   ` [PATCH 05/14] xfs: hoist the code that moves the incore inode fork broot memory Darrick J. Wong
2023-12-27 13:02   ` [PATCH 06/14] xfs: move the zero records logic into xfs_bmap_broot_space_calc Darrick J. Wong
2023-12-27 13:02   ` [PATCH 07/14] xfs: rearrange xfs_iroot_realloc a bit Darrick J. Wong
2023-12-27 13:03   ` [PATCH 08/14] xfs: standardize the btree maxrecs function parameters Darrick J. Wong
2023-12-27 13:03   ` [PATCH 09/14] xfs: generalize the btree root reallocation function Darrick J. Wong
2023-12-27 13:03   ` [PATCH 10/14] xfs: support leaves in the incore btree root block in xfs_iroot_realloc Darrick J. Wong
2023-12-27 13:03   ` [PATCH 11/14] xfs: hoist the node iroot update code out of xfs_btree_new_iroot Darrick J. Wong
2023-12-27 13:04   ` [PATCH 12/14] xfs: hoist the node iroot update code out of xfs_btree_kill_iroot Darrick J. Wong
2023-12-27 13:04   ` [PATCH 13/14] xfs: support storing records in the inode core root Darrick J. Wong
2023-12-27 13:04   ` [PATCH 14/14] xfs: update btree keys correctly when _insrec splits an inode root block Darrick J. Wong
2023-12-31 19:53 ` [PATCHSET v2.0 08/17] xfsprogs: enable in-core block reservation for rt metadata Darrick J. Wong
2023-12-27 13:05   ` [PATCH 1/2] xfs: simplify xfs_ag_resv_free signature Darrick J. Wong
2023-12-27 13:05   ` [PATCH 2/2] xfs: allow inode-based btrees to reserve space in the data device Darrick J. Wong
2023-12-31 19:54 ` [PATCHSET v2.0 09/17] xfsprogs: extent free log intent cleanups Darrick J. Wong
2023-12-27 13:05   ` [PATCH 1/8] xfs: clean up extent free log intent item tracepoint callsites Darrick J. Wong
2023-12-27 13:05   ` [PATCH 2/8] xfs: convert "skip_discard" to a proper flags bitset Darrick J. Wong
2023-12-27 13:06   ` [PATCH 3/8] xfs: pass the fsbno to xfs_perag_intent_get Darrick J. Wong
2023-12-27 13:06   ` [PATCH 4/8] xfs: add a xefi_entry helper Darrick J. Wong
2023-12-27 13:06   ` [PATCH 5/8] xfs: reuse xfs_extent_free_cancel_item Darrick J. Wong
2023-12-27 13:06   ` [PATCH 6/8] xfs: remove duplicate asserts in xfs_defer_extent_free Darrick J. Wong
2023-12-27 13:07   ` [PATCH 7/8] xfs: remove xfs_defer_agfl_block Darrick J. Wong
2023-12-27 13:07   ` [PATCH 8/8] xfs: move xfs_extent_free_defer_add to xfs_extfree_item.c Darrick J. Wong
2023-12-31 19:54 ` [PATCHSET v2.0 10/17] xfsprogs: widen EFI format to support rt Darrick J. Wong
2023-12-27 13:07   ` [PATCH 1/3] xfs: support logging EFIs for realtime extents Darrick J. Wong
2023-12-27 13:07   ` [PATCH 2/3] xfs: support error injection when freeing rt extents Darrick J. Wong
2023-12-27 13:08   ` [PATCH 3/3] xfs_logprint: report realtime EFIs Darrick J. Wong
2023-12-31 19:54 ` [PATCHSET v2.0 11/17] xfsprogs: rmap log intent cleanups Darrick J. Wong
2023-12-27 13:08   ` [PATCH 1/9] xfs: attach rtgroup objects to btree cursors Darrick J. Wong
2023-12-27 13:08   ` [PATCH 2/9] xfs: give rmap btree cursor error tracepoints their own class Darrick J. Wong
2023-12-27 13:08   ` [PATCH 3/9] xfs: prepare rmap btree tracepoints for widening Darrick J. Wong
2023-12-27 13:09   ` [PATCH 4/9] xfs: clean up rmap log intent item tracepoint callsites Darrick J. Wong
2023-12-27 13:09   ` [PATCH 5/9] xfs: add a ri_entry helper Darrick J. Wong
2023-12-27 13:09   ` [PATCH 6/9] xfs: reuse xfs_rmap_update_cancel_item Darrick J. Wong
2023-12-27 13:09   ` [PATCH 7/9] xfs: don't bother calling xfs_rmap_finish_one_cleanup in xfs_rmap_finish_one Darrick J. Wong
2023-12-27 13:10   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
2023-12-27 13:10   ` [PATCH 9/9] xfs: move xfs_rmap_update_defer_add to xfs_rmap_item.c Darrick J. Wong
2023-12-31 19:54 ` [PATCHSET v2.0 12/17] xfsprogs: realtime reverse-mapping support Darrick J. Wong
2023-12-27 13:10   ` [PATCH 01/47] xfs: simplify the xfs_rmap_{alloc,free}_extent calling conventions Darrick J. Wong
2023-12-27 13:11   ` [PATCH 02/47] xfs: introduce realtime rmap btree definitions Darrick J. Wong
2023-12-27 13:11   ` [PATCH 03/47] xfs: define the on-disk realtime rmap btree format Darrick J. Wong
2023-12-27 13:11   ` [PATCH 04/47] xfs: realtime rmap btree transaction reservations Darrick J. Wong
2023-12-27 13:11   ` [PATCH 05/47] xfs: add realtime rmap btree operations Darrick J. Wong
2023-12-27 13:12   ` [PATCH 06/47] xfs: prepare rmap functions to deal with rtrmapbt Darrick J. Wong
2023-12-27 13:12   ` [PATCH 07/47] xfs: add a realtime flag to the rmap update log redo items Darrick J. Wong
2023-12-27 13:12   ` [PATCH 08/47] xfs: add realtime reverse map inode to metadata directory Darrick J. Wong
2023-12-27 13:12   ` [PATCH 09/47] xfs: add metadata reservations for realtime rmap btrees Darrick J. Wong
2023-12-27 13:13   ` [PATCH 10/47] xfs: wire up a new inode fork type for the realtime rmap Darrick J. Wong
2023-12-27 13:13   ` [PATCH 11/47] xfs: allow inodes with zero extents but nonzero nblocks Darrick J. Wong
2023-12-27 13:13   ` [PATCH 12/47] xfs: use realtime EFI to free extents when realtime rmap is enabled Darrick J. Wong
2023-12-27 13:13   ` [PATCH 13/47] xfs: wire up rmap map and unmap to the realtime rmapbt Darrick J. Wong
2023-12-27 13:14   ` [PATCH 14/47] xfs: create routine to allocate and initialize a realtime rmap btree inode Darrick J. Wong
2023-12-27 13:14   ` [PATCH 15/47] xfs: report realtime rmap btree corruption errors to the health system Darrick J. Wong
2023-12-27 13:14   ` [PATCH 16/47] xfs: allow queued realtime intents to drain before scrubbing Darrick J. Wong
2023-12-27 13:14   ` [PATCH 17/47] xfs: scrub the realtime rmapbt Darrick J. Wong
2023-12-27 13:15   ` [PATCH 18/47] xfs: scrub the metadir path of rt rmap btree files Darrick J. Wong
2023-12-27 13:15   ` [PATCH 19/47] xfs: online repair of the realtime rmap btree Darrick J. Wong
2023-12-27 13:15   ` [PATCH 20/47] xfs: create a shadow rmap btree during realtime rmap repair Darrick J. Wong
2023-12-27 13:15   ` [PATCH 21/47] xfs: hook live realtime rmap operations during a repair operation Darrick J. Wong
2023-12-27 13:16   ` [PATCH 22/47] xfs_db: display the realtime rmap btree contents Darrick J. Wong
2023-12-27 13:16   ` [PATCH 23/47] xfs_db: support the realtime rmapbt Darrick J. Wong
2023-12-27 13:16   ` [PATCH 24/47] xfs_db: support rudimentary checks of the rtrmap btree Darrick J. Wong
2023-12-27 13:17   ` [PATCH 25/47] xfs_db: copy the realtime rmap btree Darrick J. Wong
2023-12-27 13:17   ` [PATCH 26/47] xfs_db: make fsmap query the realtime reverse mapping tree Darrick J. Wong
2023-12-27 13:17   ` [PATCH 27/47] xfs_io: support scrubbing rtgroup metadata paths Darrick J. Wong
2023-12-27 13:17   ` [PATCH 28/47] libfrog: enable scrubbng of the realtime rmap Darrick J. Wong
2023-12-27 13:18   ` [PATCH 29/47] xfs_scrub: check rtrmapbt metadata directory connections Darrick J. Wong
2023-12-27 13:18   ` [PATCH 30/47] xfs_scrub: retest metadata across scrub groups after a repair Darrick J. Wong
2023-12-27 13:18   ` [PATCH 31/47] xfs_spaceman: report health status of the realtime rmap btree Darrick J. Wong
2023-12-27 13:18   ` [PATCH 32/47] libxfs: dirty buffers should be marked uptodate too Darrick J. Wong
2023-12-27 13:19   ` [PATCH 33/47] xfs_repair: flag suspect long-format btree blocks Darrick J. Wong
2023-12-27 13:19   ` [PATCH 34/47] xfs_repair: use realtime rmap btree data to check block types Darrick J. Wong
2023-12-27 13:19   ` [PATCH 35/47] xfs_repair: create a new set of incore rmap information for rt groups Darrick J. Wong
2023-12-27 13:19   ` [PATCH 36/47] xfs_repair: collect relatime reverse-mapping data for refcount/rmap tree rebuilding Darrick J. Wong
2023-12-27 13:20   ` [PATCH 37/47] xfs_repair: refactor realtime inode check Darrick J. Wong
2023-12-27 13:20   ` [PATCH 38/47] xfs_repair: find and mark the rtrmapbt inodes Darrick J. Wong
2023-12-27 13:20   ` [PATCH 39/47] xfs_repair: check existing realtime rmapbt entries against observed rmaps Darrick J. Wong
2023-12-27 13:20   ` [PATCH 40/47] xfs_repair: always check realtime file mappings against incore info Darrick J. Wong
2023-12-27 13:21   ` [PATCH 41/47] xfs_repair: rebuild the realtime rmap btree Darrick J. Wong
2023-12-27 13:21   ` [PATCH 42/47] xfs_repair: check for global free space concerns with default btree slack levels Darrick J. Wong
2023-12-27 13:21   ` [PATCH 43/47] xfs_repair: rebuild the bmap btree for realtime files Darrick J. Wong
2023-12-27 13:21   ` [PATCH 44/47] xfs_repair: reserve per-AG space while rebuilding rt metadata Darrick J. Wong
2023-12-27 13:22   ` [PATCH 45/47] xfs_repair: allow sysadmins to add realtime reverse mapping indexes Darrick J. Wong
2023-12-27 13:22   ` [PATCH 46/47] xfs_logprint: report realtime RUIs Darrick J. Wong
2023-12-27 13:22   ` [PATCH 47/47] mkfs: create the realtime rmap inode Darrick J. Wong
2023-12-31 19:55 ` [PATCHSET v2.0 13/17] xfsprogs: file write utility refactoring Darrick J. Wong
2023-12-27 13:23   ` [PATCH 1/4] libxfs: resync libxfs_alloc_file_space interface with the kernel Darrick J. Wong
2023-12-27 13:23   ` [PATCH 2/4] mkfs: use libxfs_alloc_file_space for rtinit Darrick J. Wong
2023-12-27 13:23   ` [PATCH 3/4] xfs_repair: use libxfs_alloc_file_space to reallocate rt metadata Darrick J. Wong
2023-12-27 13:23   ` [PATCH 4/4] mkfs: use file write helper to populate files Darrick J. Wong
2023-12-31 19:55 ` [PATCHSET v2.0 14/17] xfsprogs: refcount log intent cleanups Darrick J. Wong
2023-12-27 13:24   ` [PATCH 1/9] xfs: give refcount btree cursor error tracepoints their own class Darrick J. Wong
2023-12-27 13:24   ` [PATCH 2/9] xfs: create specialized classes for refcount tracepoints Darrick J. Wong
2023-12-27 13:24   ` [PATCH 3/9] xfs: prepare refcount btree tracepoints for widening Darrick J. Wong
2023-12-27 13:24   ` [PATCH 4/9] xfs: clean up refcount log intent item tracepoint callsites Darrick J. Wong
2023-12-27 13:25   ` [PATCH 5/9] xfs: add a ci_entry helper Darrick J. Wong
2023-12-27 13:25   ` [PATCH 6/9] xfs: reuse xfs_refcount_update_cancel_item Darrick J. Wong
2023-12-27 13:25   ` [PATCH 7/9] xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one Darrick J. Wong
2023-12-27 13:25   ` [PATCH 8/9] xfs: simplify usage of the rcur local variable " Darrick J. Wong
2023-12-27 13:26   ` [PATCH 9/9] xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c Darrick J. Wong
2023-12-31 19:55 ` [PATCHSET v2.0 15/17] xfsprogs: reflink on the realtime device Darrick J. Wong
2023-12-27 13:26   ` [PATCH 01/42] xfs: introduce realtime refcount btree definitions Darrick J. Wong
2023-12-27 13:26   ` [PATCH 02/42] xfs: namespace the maximum length/refcount symbols Darrick J. Wong
2023-12-27 13:26   ` [PATCH 03/42] xfs: define the on-disk realtime refcount btree format Darrick J. Wong
2023-12-27 13:27   ` [PATCH 04/42] xfs: realtime refcount btree transaction reservations Darrick J. Wong
2023-12-27 13:27   ` [PATCH 05/42] xfs: add realtime refcount btree operations Darrick J. Wong
2023-12-27 13:27   ` [PATCH 06/42] xfs: prepare refcount functions to deal with rtrefcountbt Darrick J. Wong
2023-12-27 13:27   ` [PATCH 07/42] xfs: add a realtime flag to the refcount update log redo items Darrick J. Wong
2023-12-27 13:28   ` [PATCH 08/42] xfs: add realtime refcount btree inode to metadata directory Darrick J. Wong
2023-12-27 13:28   ` [PATCH 09/42] xfs: add metadata reservations for realtime refcount btree Darrick J. Wong
2023-12-27 13:28   ` [PATCH 10/42] xfs: wire up a new inode fork type for the realtime refcount Darrick J. Wong
2023-12-27 13:29   ` [PATCH 11/42] xfs: wire up realtime refcount btree cursors Darrick J. Wong
2023-12-27 13:29   ` [PATCH 12/42] xfs: create routine to allocate and initialize a realtime refcount btree inode Darrick J. Wong
2023-12-27 13:29   ` [PATCH 13/42] xfs: update rmap to allow cow staging extents in the rt rmap Darrick J. Wong
2023-12-27 13:29   ` [PATCH 14/42] xfs: compute rtrmap btree max levels when reflink enabled Darrick J. Wong
2023-12-27 13:30   ` [PATCH 15/42] xfs: allow inodes to have the realtime and reflink flags Darrick J. Wong
2023-12-27 13:30   ` [PATCH 16/42] xfs: refcover CoW leftovers in the realtime volume Darrick J. Wong
2023-12-27 13:30   ` [PATCH 17/42] xfs: fix xfs_get_extsz_hint behavior with realtime alwayscow files Darrick J. Wong
2023-12-27 13:30   ` [PATCH 18/42] xfs: apply rt extent alignment constraints to CoW extsize hint Darrick J. Wong
2023-12-27 13:31   ` [PATCH 19/42] xfs: enable extent size hints for CoW operations Darrick J. Wong
2023-12-27 13:31   ` [PATCH 20/42] xfs: report realtime refcount btree corruption errors to the health system Darrick J. Wong
2023-12-27 13:31   ` [PATCH 21/42] xfs: scrub the realtime refcount btree Darrick J. Wong
2023-12-27 13:31   ` [PATCH 22/42] xfs: scrub the metadir path of rt refcount btree files Darrick J. Wong
2023-12-27 13:32   ` [PATCH 23/42] libfrog: enable scrubbing of the realtime refcount data Darrick J. Wong
2023-12-27 13:32   ` [PATCH 24/42] xfs_db: display the realtime refcount btree contents Darrick J. Wong
2023-12-27 13:32   ` [PATCH 25/42] xfs_db: support the realtime refcountbt Darrick J. Wong
2023-12-27 13:32   ` [PATCH 26/42] xfs_db: widen block type mask to 64 bits Darrick J. Wong
2023-12-27 13:33   ` [PATCH 27/42] xfs_db: support rudimentary checks of the rtrefcount btree Darrick J. Wong
2023-12-27 13:33   ` [PATCH 28/42] xfs_db: copy the realtime refcount btree Darrick J. Wong
2023-12-27 13:33   ` [PATCH 29/42] xfs_spaceman: report health of " Darrick J. Wong
2023-12-27 13:33   ` [PATCH 30/42] xfs_repair: allow CoW staging extents in the realtime rmap records Darrick J. Wong
2023-12-27 13:34   ` [PATCH 31/42] xfs_repair: use realtime refcount btree data to check block types Darrick J. Wong
2023-12-27 13:34   ` [PATCH 32/42] xfs_repair: find and mark the rtrefcountbt inode Darrick J. Wong
2023-12-27 13:34   ` [PATCH 33/42] xfs_repair: compute refcount data for the realtime groups Darrick J. Wong
2023-12-27 13:35   ` [PATCH 34/42] xfs_repair: check existing realtime refcountbt entries against observed refcounts Darrick J. Wong
2023-12-27 13:35   ` [PATCH 35/42] xfs_repair: reject unwritten shared extents Darrick J. Wong
2023-12-27 13:35   ` [PATCH 36/42] xfs_repair: rebuild the realtime refcount btree Darrick J. Wong
2023-12-27 13:35   ` [PATCH 37/42] xfs_repair: allow realtime files to have the reflink flag set Darrick J. Wong
2023-12-27 13:36   ` [PATCH 38/42] xfs_repair: validate CoW extent size hint on rtinherit directories Darrick J. Wong
2023-12-27 13:36   ` [PATCH 39/42] xfs_repair: allow sysadmins to add realtime reflink Darrick J. Wong
2023-12-27 13:36   ` [PATCH 40/42] xfs_logprint: report realtime CUIs Darrick J. Wong
2023-12-27 13:36   ` [PATCH 41/42] mkfs: validate CoW extent size hint when rtinherit is set Darrick J. Wong
2023-12-27 13:37   ` [PATCH 42/42] mkfs: enable reflink on the realtime device Darrick J. Wong
2023-12-31 19:55 ` [PATCHSET v2.0 16/17] xfsprogs: reflink with large realtime extents Darrick J. Wong
2023-12-27 13:37   ` [PATCH 1/3] xfs: enable extent size hints for CoW when rtextsize > 1 Darrick J. Wong
2023-12-27 13:37   ` [PATCH 2/3] xfs: fix integer overflow when validating extent size hints Darrick J. Wong
2023-12-27 13:37   ` [PATCH 3/3] mkfs: enable reflink with realtime extent sizes > 1 Darrick J. Wong
2023-12-31 19:56 ` [PATCHSET v2.0 17/17] xfsprogs: enable quota for realtime voluems Darrick J. Wong
2023-12-27 13:38   ` [PATCH 1/1] xfs_quota: report warning limits for realtime space quotas Darrick J. Wong
2023-12-31 20:00 ` [PATCHSET v2.0 1/9] fstests: test XFS metadata directories Darrick J. Wong
2023-12-27 13:50   ` [PATCH 01/11] xfs/122: fix metadirino Darrick J. Wong
2023-12-27 13:50   ` [PATCH 02/11] various: fix finding metadata inode numbers when metadir is enabled Darrick J. Wong
2023-12-27 13:50   ` [PATCH 03/11] xfs/{030,033,178}: forcibly disable metadata directory trees Darrick J. Wong
2023-12-27 13:50   ` [PATCH 04/11] common/repair: patch up repair sb inode value complaints Darrick J. Wong
2023-12-27 13:51   ` [PATCH 05/11] xfs/206: update for metadata directory support Darrick J. Wong
2023-12-27 13:51   ` [PATCH 06/11] xfs/{050,144,153,299,330}: update quota reports to handle metadir trees Darrick J. Wong
2023-12-27 13:51   ` [PATCH 07/11] xfs/856: add metadir upgrade to test matrix Darrick J. Wong
2023-12-27 13:51   ` [PATCH 08/11] xfs/509: adjust inumbers accounting for metadata directories Darrick J. Wong
2023-12-27 13:52   ` [PATCH 09/11] xfs: create fuzz tests " Darrick J. Wong
2023-12-27 13:52   ` [PATCH 10/11] xfs: baseline golden output for metadata directory fuzz tests Darrick J. Wong
2023-12-27 13:52   ` [PATCH 11/11] xfs: test metapath repairs Darrick J. Wong
2023-12-31 20:00 ` [PATCHSET v2.0 2/9] xfsprogs: shard the realtime section Darrick J. Wong
2023-12-27 13:53   ` [PATCH 01/17] common/populate: refactor caching of metadumps to a helper Darrick J. Wong
2023-12-27 13:53   ` [PATCH 02/17] common/xfs: wipe external logs during mdrestore operations Darrick J. Wong
2023-12-27 13:53   ` [PATCH 03/17] common/ext4: reformat " Darrick J. Wong
2023-12-27 13:53   ` [PATCH 04/17] xfs: use metadump v2 format by default Darrick J. Wong
2023-12-27 13:54   ` [PATCH 05/17] common/xfs: capture external logs during metadump/mdrestore Darrick J. Wong
2023-12-27 13:54   ` [PATCH 06/17] xfs/122: update for rtgroups Darrick J. Wong
2023-12-27 13:54   ` [PATCH 07/17] punch-alternating: detect xfs realtime files with large allocation units Darrick J. Wong
2023-12-27 13:54   ` [PATCH 08/17] xfs/206: update mkfs filtering for rt groups feature Darrick J. Wong
2023-12-27 13:55   ` [PATCH 09/17] common: pass the realtime device to xfs_db when possible Darrick J. Wong
2023-12-27 13:55   ` [PATCH 10/17] common: filter rtgroups when we're disabling metadir Darrick J. Wong
2023-12-27 13:55   ` [PATCH 11/17] xfs/185: update for rtgroups Darrick J. Wong
2023-12-27 13:55   ` [PATCH 12/17] xfs/449: update test to know about xfs_db -R Darrick J. Wong
2023-12-27 13:56   ` [PATCH 13/17] xfs/122: update for rtbitmap headers Darrick J. Wong
2023-12-27 13:56   ` [PATCH 14/17] xfs/122: udpate test to pick up rtword/suminfo ondisk unions Darrick J. Wong
2023-12-27 13:56   ` [PATCH 15/17] xfs/27[46],xfs/556: fix tests to deal with rtgroups output in bmap/fsmap commands Darrick J. Wong
2023-12-27 13:56   ` [PATCH 16/17] common/xfs: capture realtime devices during metadump/mdrestore Darrick J. Wong
2023-12-27 13:57   ` [PATCH 17/17] common/fuzzy: adapt the scrub stress tests to support rtgroups Darrick J. Wong
2023-12-31 20:00 ` [PATCHSET v2.0 3/9] fstests: enable FITRIM for the realtime section Darrick J. Wong
2023-12-27 13:57   ` [PATCH 1/2] xfs: refactor statfs field extraction Darrick J. Wong
2023-12-27 13:57   ` [PATCH 2/2] common/xfs: FITRIM now supports realtime volumes Darrick J. Wong
2023-12-31 20:00 ` [PATCHSET v2.0 4/9] fstests: fixes for realtime rmap Darrick J. Wong
2023-12-27 13:57   ` [PATCH 01/13] xfs: fix tests that try to access the realtime rmap inode Darrick J. Wong
2023-12-27 13:58   ` [PATCH 02/13] fuzz: for fuzzing the rtrmapbt, find the path to the rt rmap btree file Darrick J. Wong
2023-12-27 13:58   ` [PATCH 03/13] xfs: race fsstress with realtime rmap btree scrub and repair Darrick J. Wong
2023-12-27 13:58   ` [PATCH 04/13] xfs/856: add rtrmapbt upgrade to test matrix Darrick J. Wong
2023-12-27 13:59   ` [PATCH 05/13] xfs/122: update for rtgroups-based realtime rmap btrees Darrick J. Wong
2023-12-27 13:59   ` [PATCH 06/13] xfs: fix various problems with fsmap detecting the data device Darrick J. Wong
2023-12-27 13:59   ` [PATCH 07/13] xfs/341: update test for rtgroup-based rmap Darrick J. Wong
2023-12-27 13:59   ` [PATCH 08/13] xfs/3{43,32}: adapt tests for rt extent size greater than 1 Darrick J. Wong
2023-12-27 14:00   ` [PATCH 09/13] xfs: skip tests if formatting small filesystem fails Darrick J. Wong
2023-12-27 14:00   ` [PATCH 10/13] xfs/443: use file allocation unit, not dbsize Darrick J. Wong
2023-12-27 14:00   ` [PATCH 11/13] populate: adjust rtrmap calculations for rtgroups Darrick J. Wong
2023-12-27 14:00   ` [PATCH 12/13] populate: check that we created a realtime rmap btree of the given height Darrick J. Wong
2023-12-27 14:01   ` [PATCH 13/13] fuzzy: create missing fuzz tests for rt rmap btrees Darrick J. Wong
2023-12-31 20:01 ` [PATCHSET v2.0 5/9] fstests: establish baseline for realtime rmap fuzz tests Darrick J. Wong
2023-12-27 14:01   ` [PATCH 1/1] fuzzy: create known output for rt rmap btree " Darrick J. Wong
2023-12-31 20:01 ` [PATCHSET v2.0 6/9] fstests: reflink on the realtime device Darrick J. Wong
2023-12-27 14:01   ` [PATCH 1/9] xfs/122: update fields for realtime reflink Darrick J. Wong
2023-12-27 14:01   ` [PATCH 2/9] common/populate: create realtime refcount btree Darrick J. Wong
2023-12-27 14:02   ` [PATCH 3/9] xfs: create fuzz tests for the " Darrick J. Wong
2023-12-27 14:02   ` [PATCH 4/9] xfs/27[24]: adapt for checking files on the realtime volume Darrick J. Wong
2023-12-27 14:02   ` [PATCH 5/9] xfs: race fsstress with realtime refcount btree scrub and repair Darrick J. Wong
2023-12-27 14:02   ` [PATCH 6/9] xfs: remove xfs/131 now that we allow reflink on realtime volumes Darrick J. Wong
2023-12-27 14:03   ` [PATCH 7/9] xfs/856: add rtreflink upgrade to test matrix Darrick J. Wong
2023-12-27 14:03   ` [PATCH 8/9] generic/331,xfs/240: support files that skip delayed allocation Darrick J. Wong
2023-12-27 14:03   ` [PATCH 9/9] common/xfs: fix _xfs_get_file_block_size when rtinherit is set and no rt section Darrick J. Wong
2023-12-31 20:01 ` [PATCHSET v2.0 7/9] fstests: establish baseline for realtime reflink fuzz tests Darrick J. Wong
2023-12-27 14:03   ` [PATCH 1/1] xfs: baseline golden output for rt refcount btree " Darrick J. Wong
2023-12-31 20:01 ` [PATCHSET v2.0 8/9] fstests: reflink with large realtime extents Darrick J. Wong
2023-12-27 14:04   ` [PATCH 1/5] xfs: make sure that CoW will write around when rextsize > 1 Darrick J. Wong
2023-12-27 14:04   ` [PATCH 2/5] xfs: skip cowextsize hint fragmentation tests on realtime volumes Darrick J. Wong
2023-12-27 14:04   ` [PATCH 3/5] misc: add more congruent oplen testing Darrick J. Wong
2023-12-27 14:05   ` [PATCH 4/5] xfs: test COWing entire rt extents Darrick J. Wong
2023-12-27 14:05   ` [PATCH 5/5] generic/303: avoid test failures on weird rt extent sizes Darrick J. Wong
2023-12-31 20:02 ` [PATCHSET v2.0 9/9] fstests: functional tests for rt quota Darrick J. Wong
2023-12-27 14:05   ` [PATCH 1/3] common: enable testing of realtime quota when supported Darrick J. Wong
2023-12-27 14:05   ` [PATCH 2/3] xfs: fix quota tests to adapt to realtime quota Darrick J. Wong
2023-12-27 14:06   ` [PATCH 3/3] xfs: regression testing of quota on the realtime device Darrick J. Wong
2023-12-31 20:03 ` [PATCHSET v2.0 1/4] xfs-documentation: document metadata directories Darrick J. Wong
2023-12-27 14:08   ` [PATCH 1/1] design: document the changes required to handle " Darrick J. Wong
2023-12-31 20:03 ` [PATCHSET v2.0 2/4] xfs-documentation: shard the realtime section Darrick J. Wong
2023-12-27 14:08   ` [PATCH 1/2] design: move discussion of realtime volumes to a separate section Darrick J. Wong
2023-12-27 14:08   ` [PATCH 2/2] design: document realtime groups Darrick J. Wong
2023-12-31 20:04 ` [PATCHSET v2.0 3/4] xfs-documentation: realtime reverse-mapping support Darrick J. Wong
2023-12-27 14:08   ` [PATCH 1/1] design: document the revisions to the realtime rmap formats Darrick J. Wong
2023-12-31 20:04 ` [PATCHSET v2.0 4/4] xfs-documentation: reflink on the realtime device Darrick J. Wong
2023-12-27 14:09   ` [PATCH 1/1] design: document changes for the realtime refcount btree Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2022-12-30 22:19 [PATCHSET v1.0 0/8] xfs_db: debug realtime geometry Darrick J. Wong
2022-12-30 22:19 ` [PATCH 6/8] xfs_db: enable conversion of rt space units Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.