From: "Darrick J. Wong" <djwong@kernel.org>
To: cem@kernel.org, djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Subject: [PATCH 4/5] xfs_spaceman: implement clearing free space
Date: Fri, 30 Dec 2022 14:20:48 -0800
Message-ID: <167243884818.740087.2981858571326952921.stgit@magnolia>
In-Reply-To: <167243884763.740087.13414287212519500865.stgit@magnolia>

From: Darrick J. Wong <djwong@kernel.org>

First attempt at evacuating all the used blocks from part of a
filesystem.  The new libfrog code maps the free space of the requested
region into a space capture file, freezes the remaining file data
against overwrites by reflinking it into that file, and then evacuates
the other owners with dedupe, FIEXCHANGE_RANGE, or online repair.
xfs_spaceman gains a new clearfree command to drive all of this.
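
The clearing loop, very roughly (the helpers are described in detail in
the comments below):

	map all free space in the region to the space capture file;
	freeze the remaining file data there with reflink;
	while (the region still has other owners) {
		pick the most-shared extent as the clearing target;
		evacuate its owners via dedupe, exchange, or online repair;
		grab any space that was freed;
	}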

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 Makefile                |    2 
 libfrog/Makefile        |    6 
 libfrog/clearspace.c    | 2723 +++++++++++++++++++++++++++++++++++++++++++++++
 libfrog/clearspace.h    |   72 +
 man/man8/xfs_spaceman.8 |   17 
 spaceman/Makefile       |    6 
 spaceman/clearfree.c    |  164 +++
 spaceman/init.c         |    1 
 spaceman/space.h        |    2 
 9 files changed, 2989 insertions(+), 4 deletions(-)
 create mode 100644 libfrog/clearspace.c
 create mode 100644 libfrog/clearspace.h
 create mode 100644 spaceman/clearfree.c


diff --git a/Makefile b/Makefile
index 0edc2700933..c7c31444446 100644
--- a/Makefile
+++ b/Makefile
@@ -105,7 +105,7 @@ quota: libxcmd
 repair: libxlog libxcmd
 copy: libxlog
 mkfs: libxcmd
-spaceman: libxcmd
+spaceman: libhandle libxcmd
 scrub: libhandle libxcmd
 rtcp: libfrog
 
diff --git a/libfrog/Makefile b/libfrog/Makefile
index 04aecf1abf1..4031f07e422 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -61,6 +61,12 @@ ifeq ($(HAVE_FIEXCHANGE),yes)
 LCFLAGS += -DHAVE_FIEXCHANGE
 endif
 
+ifeq ($(HAVE_GETFSMAP),yes)
+CFILES+=clearspace.c
+HFILES+=clearspace.h
+endif
+
+
 LDIRT = gen_crc32table crc32table.h crc32selftest
 
 default: crc32selftest ltdepend $(LTLIBRARY)
diff --git a/libfrog/clearspace.c b/libfrog/clearspace.c
new file mode 100644
index 00000000000..601257022b8
--- /dev/null
+++ b/libfrog/clearspace.c
@@ -0,0 +1,2723 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include <linux/fsmap.h>
+#include "paths.h"
+#include "fsgeom.h"
+#include "fsrefcounts.h"
+#include "logging.h"
+#include "bulkstat.h"
+#include "bitmap.h"
+#include "fiexchange.h"
+#include "file_exchange.h"
+#include "clearspace.h"
+#include "handle.h"
+
+/* VFS helpers */
+
+#ifndef FALLOC_FL_MAP_FREE_SPACE
+#define FALLOC_FL_MAP_FREE_SPACE	0x8000
+#endif
+
+/* Remap the file range described by @fcr into fd, or return an errno. */
+static inline int
+clonerange(int fd, struct file_clone_range *fcr)
+{
+	int	ret;
+
+	ret = ioctl(fd, FICLONERANGE, fcr);
+	if (ret)
+		return errno;
+
+	return 0;
+}
+
+/* Exchange the file ranges described by @xchg with fd, or return an errno. */
+static inline int
+exchangerange(int fd, struct file_xchg_range *xchg)
+{
+	int	ret;
+
+	ret = ioctl(fd, FIEXCHANGE_RANGE, xchg);
+	if (ret)
+		return errno;
+
+	return 0;
+}
+
+/*
+ * Deduplicate part of fd into the file range described by fdr.  If the
+ * operation succeeded, we set @same to whether or not we deduped the data and
+ * return zero.  If not, return an errno.
+ */
+static inline int
+deduperange(int fd, struct file_dedupe_range *fdr, bool *same)
+{
+	struct file_dedupe_range_info *info = &fdr->info[0];
+	int	ret;
+
+	assert(fdr->dest_count == 1);
+	*same = false;
+
+	ret = ioctl(fd, FIDEDUPERANGE, fdr);
+	if (ret)
+		return errno;
+
+	if (info->status < 0)
+		return -info->status;
+
+	if (info->status == FILE_DEDUPE_RANGE_DIFFERS)
+		return 0;
+
+	/* The kernel should never dedupe more than it was asked. */
+	assert(fdr->src_length >= info->bytes_deduped);
+
+	*same = true;
+	return 0;
+}
+
+/* Space clearing operation control */
+
+#define QUERY_BATCH_SIZE		1024
+
+struct clearspace_tgt {
+	unsigned long long	start;		/* phys byte addr of target */
+	unsigned long long	length;		/* target length, in bytes */
+	unsigned long long	owners;		/* owners, per refcount query */
+	unsigned long long	prio;		/* higher is more appealing */
+	unsigned long long	evacuated;	/* extents evacuated so far */
+	bool			try_again;	/* revisit this target later? */
+};
+
+struct clearspace_req {
+	struct xfs_fd		*xfd;
+
+	/* all the blocks that we've tried to clear */
+	struct bitmap		*visited;
+
+	/* stat buffer of the open file */
+	struct stat		statbuf;
+	struct stat		temp_statbuf;
+	struct stat		space_statbuf;
+
+	/* handle to this filesystem */
+	void			*fshandle;
+	size_t			fshandle_sz;
+
+	/* physical storage that we want to clear */
+	unsigned long long	start;
+	unsigned long long	length;
+	dev_t			dev;
+
+	/* convenience flags */
+	bool			realtime:1;
+	bool			use_reflink:1;
+	bool			can_evac_metadata:1;
+
+	/*
+	 * The "space capture" file.  Each extent in this file must be mapped
+	 * to the same byte offset as the byte address of the physical space.
+	 */
+	int			space_fd;
+
+	/* work file for migrating file data */
+	int			work_fd;
+
+	/* preallocated buffers for queries */
+	struct getbmapx		*bhead;
+	struct fsmap_head	*mhead;
+	struct fsrefs_head	*rhead;
+
+	/* buffer for copying data */
+	char			*buf;
+
+	/* buffer for deduping data */
+	struct file_dedupe_range *fdr;
+
+	/* tracing mask and indent level */
+	unsigned int		trace_mask;
+	unsigned int		trace_indent;
+};
+
+static inline bool
+csp_is_internal_owner(
+	const struct clearspace_req	*req,
+	unsigned long long		owner)
+{
+	return owner == req->temp_statbuf.st_ino ||
+	       owner == req->space_statbuf.st_ino;
+}
+
+/* Debugging stuff */
+
+static const struct csp_errstr {
+	unsigned int		mask;
+	const char		*tag;
+} errtags[] = {
+	{ CSP_TRACE_FREEZE,	"freeze" },
+	{ CSP_TRACE_GRAB,	"grab" },
+	{ CSP_TRACE_PREP,	"prep" },
+	{ CSP_TRACE_TARGET,	"target" },
+	{ CSP_TRACE_DEDUPE,	"dedupe" },
+	{ CSP_TRACE_FIEXCHANGE, "fiexchange" },
+	{ CSP_TRACE_XREBUILD,	"rebuild" },
+	{ CSP_TRACE_EFFICACY,	"efficacy" },
+	{ CSP_TRACE_SETUP,	"setup" },
+	{ CSP_TRACE_DUMPFILE,	"dumpfile" },
+	{ CSP_TRACE_BITMAP,	"bitmap" },
+
+	/* prioritize high level functions over low level queries for tagging */
+	{ CSP_TRACE_FSMAP,	"fsmap" },
+	{ CSP_TRACE_FSREFS,	"fsrefs" },
+	{ CSP_TRACE_BMAPX,	"bmapx" },
+	{ CSP_TRACE_FALLOC,	"falloc" },
+	{ CSP_TRACE_STATUS,	"status" },
+	{ 0, NULL },
+};
+
+static void
+csp_debug(
+	struct clearspace_req	*req,
+	unsigned int		mask,
+	const char		*func,
+	int			line,
+	const char		*format,
+	...)
+{
+	const struct csp_errstr	*et = errtags;
+	bool			debug = (req->trace_mask & ~CSP_TRACE_STATUS);
+	int			indent = req->trace_indent;
+	va_list			args;
+
+	if ((req->trace_mask & mask) != mask)
+		return;
+
+	if (debug) {
+		while (indent > 0) {
+			fprintf(stderr, "  ");
+			indent--;
+		}
+
+		for (; et->tag; et++) {
+			if (et->mask & mask) {
+				fprintf(stderr, "%s: ", et->tag);
+				break;
+			}
+		}
+	}
+
+	va_start(args, format);
+	vfprintf(stderr, format, args);
+	va_end(args);
+
+	if (debug)
+		fprintf(stderr, " (line %d)\n", line);
+	else
+		fprintf(stderr, "\n");
+	fflush(stderr);
+}
+
+#define trace_freeze(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_FREEZE, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_grabfree(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_GRAB, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_fsmap(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_FSMAP, __func__, __LINE__, format, __VA_ARGS__)
+
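+/*
+ * The while/break construct below exists only so that we can skip tracing
+ * records owned by our internal files; the loop body runs at most once.
+ */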
+#define trace_fsmap_rec(req, mask, mrec)	\
+	while (!csp_is_internal_owner((req), (mrec)->fmr_owner)) { \
+		csp_debug((req), (mask) | CSP_TRACE_FSMAP, __func__, __LINE__, \
+"fsmap phys 0x%llx owner 0x%llx offset 0x%llx bytecount 0x%llx flags 0x%x", \
+				(unsigned long long)(mrec)->fmr_physical, \
+				(unsigned long long)(mrec)->fmr_owner, \
+				(unsigned long long)(mrec)->fmr_offset, \
+				(unsigned long long)(mrec)->fmr_length, \
+				(mrec)->fmr_flags); \
+		break; \
+	}
+
+#define trace_fsrefs(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_FSREFS, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_fsrefs_rec(req, mask, rrec)	\
+	csp_debug((req), (mask) | CSP_TRACE_FSREFS, __func__, __LINE__, \
+"fsref phys 0x%llx bytecount 0x%llx owners %llu flags 0x%x", \
+			(unsigned long long)(rrec)->fcr_physical, \
+			(unsigned long long)(rrec)->fcr_length, \
+			(unsigned long long)(rrec)->fcr_owners, \
+			(rrec)->fcr_flags)
+
+#define trace_bmapx(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_BMAPX, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_bmapx_rec(req, mask, brec)	\
+	csp_debug((req), (mask) | CSP_TRACE_BMAPX, __func__, __LINE__, \
+"bmapx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%x", \
+			(unsigned long long)BBTOB((brec)->bmv_offset), \
+			(unsigned long long)BBTOB((brec)->bmv_length), \
+			(unsigned long long)BBTOB((brec)->bmv_block), \
+			(brec)->bmv_oflags)
+
+#define trace_prep(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_PREP, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_target(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_TARGET, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_dedupe(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_DEDUPE, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_falloc(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_FALLOC, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_fiexchange(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_FIEXCHANGE, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_xrebuild(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_XREBUILD, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_setup(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_SETUP, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_status(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_STATUS, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_dumpfile(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_DUMPFILE, __func__, __LINE__, format, __VA_ARGS__)
+
+#define trace_bitmap(req, format, ...)	\
+	csp_debug((req), CSP_TRACE_BITMAP, __func__, __LINE__, format, __VA_ARGS__)
+
+/* VFS Iteration helpers */
+
+static inline void
+start_spacefd_iter(struct clearspace_req *req)
+{
+	req->trace_indent++;
+}
+
+static inline void
+end_spacefd_iter(struct clearspace_req *req)
+{
+	req->trace_indent--;
+}
+
+/*
+ * Iterate each hole in the space-capture file.  Returns 1 if holepos/length
+ * has been set to a hole; 0 if there aren't any holes left, or -1 for error.
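+ *
+ * Callers must start with *length == 0; each call resumes the scan after
+ * the previously returned hole:
+ *
+ *	loff_t pos = 0, len = 0;
+ *	while ((ret = spacefd_hole_iter(req, &pos, &len)) > 0)
+ *		...process the hole at [pos, pos + len)...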
+ */
+static inline int
+spacefd_hole_iter(
+	const struct clearspace_req	*req,
+	loff_t			*holepos,
+	loff_t			*length)
+{
+	loff_t			end = req->start + req->length;
+	loff_t			h;
+	loff_t			d;
+
+	if (*length == 0)
+		d = req->start;
+	else
+		d = *holepos + *length;
+	if (d >= end)
+		return 0;
+
+	h = lseek(req->space_fd, d, SEEK_HOLE);
+	if (h < 0) {
+		perror(_("finding start of hole in space capture file"));
+		return h;
+	}
+	if (h >= end)
+		return 0;
+
+	d = lseek(req->space_fd, h, SEEK_DATA);
+	if (d < 0 && errno == ENXIO)
+		d = end;
+	if (d < 0) {
+		perror(_("finding end of hole in space capture file"));
+		return d;
+	}
+	if (d > end)
+		d = end;
+
+	*holepos = h;
+	*length = d - h;
+	return 1;
+}
+
+/*
+ * Iterate each written region in the space-capture file.  Returns 1 if
+ * datapos/length have been set to a data area; 0 if there isn't any data left,
+ * or -1 for error.
+ */
+static int
+spacefd_data_iter(
+	const struct clearspace_req	*req,
+	loff_t			*datapos,
+	loff_t			*length)
+{
+	loff_t			end = req->start + req->length;
+	loff_t			d;
+	loff_t			h;
+
+	if (*length == 0)
+		h = req->start;
+	else
+		h = *datapos + *length;
+	if (h >= end)
+		return 0;
+
+	d = lseek(req->space_fd, h, SEEK_DATA);
+	if (d < 0 && errno == ENXIO)
+		return 0;
+	if (d < 0) {
+		perror(_("finding start of data in space capture file"));
+		return d;
+	}
+	if (d >= end)
+		return 0;
+
+	h = lseek(req->space_fd, d, SEEK_HOLE);
+	if (h < 0) {
+		perror(_("finding end of data in space capture file"));
+		return h;
+	}
+	if (h > end)
+		h = end;
+
+	*datapos = d;
+	*length = h - d;
+	return 1;
+}
+
+/* Filesystem space usage queries */
+
+/* Allocate the structures needed for a fsmap query. */
+static void
+start_fsmap_query(
+	struct clearspace_req	*req,
+	dev_t			dev,
+	unsigned long long	physical,
+	unsigned long long	length)
+{
+	struct fsmap_head	*mhead = req->mhead;
+
+	assert(req->mhead->fmh_count == 0);
+	memset(mhead, 0, sizeof(struct fsmap_head));
+	mhead->fmh_count = QUERY_BATCH_SIZE;
+	mhead->fmh_keys[0].fmr_device = dev;
+	mhead->fmh_keys[0].fmr_physical = physical;
+	mhead->fmh_keys[1].fmr_device = dev;
+	mhead->fmh_keys[1].fmr_physical = physical + length;
+	mhead->fmh_keys[1].fmr_owner = ULLONG_MAX;
+	mhead->fmh_keys[1].fmr_flags = UINT_MAX;
+	mhead->fmh_keys[1].fmr_offset = ULLONG_MAX;
+
+	trace_fsmap(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx",
+			major(dev), minor(dev),
+			(unsigned long long)physical,
+			(unsigned long long)length,
+			(unsigned long long)mhead->fmh_keys[1].fmr_physical);
+	req->trace_indent++;
+}
+
+static inline void
+end_fsmap_query(
+	struct clearspace_req	*req)
+{
+	req->trace_indent--;
+	req->mhead->fmh_count = 0;
+}
+
+/* Set us up for the next run_fsmap_query, or return false. */
+static inline bool
+advance_fsmap_cursor(struct fsmap_head *mhead)
+{
+	struct fsmap	*mrec;
+
+	mrec = &mhead->fmh_recs[mhead->fmh_entries - 1];
+	if (mrec->fmr_flags & FMR_OF_LAST)
+		return false;
+
+	fsmap_advance(mhead);
+	return true;
+}
+
+/*
+ * Run a GETFSMAP query.  Returns 1 if there are rows, 0 if there are no rows,
+ * or -1 for error.
+ */
+static inline int
+run_fsmap_query(
+	struct clearspace_req	*req)
+{
+	struct fsmap_head	*mhead = req->mhead;
+	int			ret;
+
+	if (mhead->fmh_entries > 0 && !advance_fsmap_cursor(mhead))
+		return 0;
+
+	trace_fsmap(req,
+ "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx",
+			major(mhead->fmh_keys[0].fmr_device),
+			minor(mhead->fmh_keys[0].fmr_device),
+			(unsigned long long)mhead->fmh_keys[0].fmr_physical,
+			(unsigned long long)mhead->fmh_keys[0].fmr_length,
+			(unsigned long long)mhead->fmh_keys[1].fmr_physical);
+
+	ret = ioctl(req->xfd->fd, FS_IOC_GETFSMAP, mhead);
+	if (ret) {
+		perror(_("querying fsmap data"));
+		return -1;
+	}
+
+	if (!(mhead->fmh_oflags & FMH_OF_DEV_T)) {
+		fprintf(stderr, _("fsmap does not return dev_t.\n"));
+		return -1;
+	}
+
+	if (mhead->fmh_entries == 0)
+		return 0;
+
+	return 1;
+}
+
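+/*
+ * Callers iterate the mapping records with the usual query pattern:
+ *
+ *	start_fsmap_query(req, dev, physical, length);
+ *	while ((ret = run_fsmap_query(req)) > 0)
+ *		for_each_fsmap_row(req, mrec)
+ *			...examine mrec...;
+ *	end_fsmap_query(req);
+ */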
+#define for_each_fsmap_row(req, rec) \
+	for ((rec) = (req)->mhead->fmh_recs; \
+	     (rec) < (req)->mhead->fmh_recs + (req)->mhead->fmh_entries; \
+	     (rec)++)
+
+/* Allocate the structures needed for a fsrefcounts query. */
+static void
+start_fsrefs_query(
+	struct clearspace_req	*req,
+	dev_t			dev,
+	unsigned long long	physical,
+	unsigned long long	length)
+{
+	struct fsrefs_head	*rhead = req->rhead;
+
+	assert(req->rhead->fch_count == 0);
+	memset(rhead, 0, sizeof(struct fsrefs_head));
+	rhead->fch_count = QUERY_BATCH_SIZE;
+	rhead->fch_keys[0].fcr_device = dev;
+	rhead->fch_keys[0].fcr_physical = physical;
+	rhead->fch_keys[1].fcr_device = dev;
+	rhead->fch_keys[1].fcr_physical = physical + length;
+	rhead->fch_keys[1].fcr_owners = ULLONG_MAX;
+	rhead->fch_keys[1].fcr_flags = UINT_MAX;
+
+	trace_fsrefs(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx",
+			major(dev), minor(dev),
+			(unsigned long long)physical,
+			(unsigned long long)length,
+			(unsigned long long)rhead->fch_keys[1].fcr_physical);
+	req->trace_indent++;
+}
+
+static inline void
+end_fsrefs_query(
+	struct clearspace_req	*req)
+{
+	req->trace_indent--;
+	req->rhead->fch_count = 0;
+}
+
+/* Set us up for the next run_fsrefs_query, or return false. */
+static inline bool
+advance_fsrefs_query(struct fsrefs_head *rhead)
+{
+	struct fsrefs	*rrec;
+
+	rrec = &rhead->fch_recs[rhead->fch_entries - 1];
+	if (rrec->fcr_flags & FCR_OF_LAST)
+		return false;
+
+	fsrefs_advance(rhead);
+	return true;
+}
+
+/*
+ * Run a GETFSREFCOUNTS query.  Returns 1 if there are rows, 0 if there are
+ * no rows, or -1 for error.
+ */
+static inline int
+run_fsrefs_query(
+	struct clearspace_req	*req)
+{
+	struct fsrefs_head	*rhead = req->rhead;
+	int			ret;
+
+	if (rhead->fch_entries > 0 && !advance_fsrefs_query(rhead))
+		return 0;
+
+	trace_fsrefs(req,
+ "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx",
+			major(rhead->fch_keys[0].fcr_device),
+			minor(rhead->fch_keys[0].fcr_device),
+			(unsigned long long)rhead->fch_keys[0].fcr_physical,
+			(unsigned long long)rhead->fch_keys[0].fcr_length,
+			(unsigned long long)rhead->fch_keys[1].fcr_physical);
+
+	ret = ioctl(req->xfd->fd, FS_IOC_GETFSREFCOUNTS, rhead);
+	if (ret) {
+		perror(_("querying refcount data"));
+		return -1;
+	}
+
+	if (!(rhead->fch_oflags & FCH_OF_DEV_T)) {
+		fprintf(stderr, _("fsrefcounts does not return dev_t.\n"));
+		return -1;
+	}
+
+	if (rhead->fch_entries == 0)
+		return 0;
+
+	return 1;
+}
+
+#define for_each_fsref_row(req, rec) \
+	for ((rec) = (req)->rhead->fch_recs; \
+	     (rec) < (req)->rhead->fch_recs + (req)->rhead->fch_entries; \
+	     (rec)++)
+
+/* Allocate the structures needed for a bmapx query. */
+static void
+start_bmapx_query(
+	struct clearspace_req	*req,
+	unsigned int		fork,
+	unsigned long long	pos,
+	unsigned long long	length)
+{
+	struct getbmapx		*bhead = req->bhead;
+
+	assert(fork == BMV_IF_ATTRFORK || fork == BMV_IF_COWFORK || !fork);
+	assert(req->bhead->bmv_count == 0);
+
+	memset(bhead, 0, sizeof(struct getbmapx));
+	bhead[0].bmv_offset = BTOBB(pos);
+	bhead[0].bmv_length = BTOBB(length);
+	bhead[0].bmv_count = QUERY_BATCH_SIZE + 1;
+	bhead[0].bmv_iflags = fork | BMV_IF_PREALLOC | BMV_IF_DELALLOC;
+
+	trace_bmapx(req, "%s pos 0x%llx bytecount 0x%llx",
+			fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? "attr" : "data",
+			(unsigned long long)BBTOB(bhead[0].bmv_offset),
+			(unsigned long long)BBTOB(bhead[0].bmv_length));
+	req->trace_indent++;
+}
+
+static inline void
+end_bmapx_query(
+	struct clearspace_req	*req)
+{
+	req->trace_indent--;
+	req->bhead->bmv_count = 0;
+}
+
+/* Set us up for the next run_bmapx_query, or return false. */
+static inline bool
+advance_bmapx_query(struct getbmapx *bhead)
+{
+	struct getbmapx		*brec;
+	unsigned long long	next_offset;
+	unsigned long long	end = bhead->bmv_offset + bhead->bmv_length;
+
+	brec = &bhead[bhead->bmv_entries];
+	if (brec->bmv_oflags & BMV_OF_LAST)
+		return false;
+
+	next_offset = brec->bmv_offset + brec->bmv_length;
+	if (next_offset > end)
+		return false;
+
+	bhead->bmv_offset = next_offset;
+	bhead->bmv_length = end - next_offset;
+	return true;
+}
+
+/*
+ * Run a GETBMAPX query.  Returns 1 if there are rows, 0 if there are no rows,
+ * or -1 for error.
+ */
+static inline int
+run_bmapx_query(
+	struct clearspace_req	*req,
+	int			fd)
+{
+	struct getbmapx		*bhead = req->bhead;
+	unsigned int		fork;
+	int			ret;
+
+	if (bhead->bmv_entries > 0 && !advance_bmapx_query(bhead))
+		return 0;
+
+	fork = bhead[0].bmv_iflags & (BMV_IF_COWFORK | BMV_IF_ATTRFORK);
+	trace_bmapx(req, "ioctl %s pos 0x%llx bytecount 0x%llx",
+			fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? "attr" : "data",
+			(unsigned long long)BBTOB(bhead[0].bmv_offset),
+			(unsigned long long)BBTOB(bhead[0].bmv_length));
+
+	ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead);
+	if (ret) {
+		perror(_("querying bmapx data"));
+		return -1;
+	}
+
+	if (bhead->bmv_entries == 0)
+		return 0;
+
+	return 1;
+}
+
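+/* Mapping records start at bhead[1]; bhead[0] holds the query parameters. */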
+#define for_each_bmapx_row(req, rec) \
+	for ((rec) = (req)->bhead + 1; \
+	     (rec) < (req)->bhead + 1 + (req)->bhead->bmv_entries; \
+	     (rec)++)
+
+static inline void
+csp_dump_bmapx_row(
+	struct clearspace_req	*req,
+	unsigned int		nr,
+	const struct getbmapx	*brec)
+{
+	if (brec->bmv_block == -1) {
+		trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx hole",
+				nr,
+				(unsigned long long)BBTOB(brec->bmv_offset),
+				(unsigned long long)BBTOB(brec->bmv_length));
+		return;
+	}
+
+	if (brec->bmv_block == -2) {
+		trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx delalloc",
+				nr,
+				(unsigned long long)BBTOB(brec->bmv_offset),
+				(unsigned long long)BBTOB(brec->bmv_length));
+		return;
+	}
+
+	trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx phys 0x%llx flags 0x%x",
+			nr,
+			(unsigned long long)BBTOB(brec->bmv_offset),
+			(unsigned long long)BBTOB(brec->bmv_length),
+			(unsigned long long)BBTOB(brec->bmv_block),
+			brec->bmv_oflags);
+}
+
+static inline void
+csp_dump_bmapx(
+	struct clearspace_req	*req,
+	int			fd,
+	unsigned int		indent,
+	const char		*tag)
+{
+	unsigned int		nr;
+	int			ret;
+
+	trace_dumpfile(req, "DUMP BMAP OF DATA FORK %s", tag);
+	start_bmapx_query(req, 0, req->start, req->length);
+	nr = 0;
+	while ((ret = run_bmapx_query(req, fd)) > 0) {
+		struct getbmapx	*brec;
+
+		for_each_bmapx_row(req, brec) {
+			csp_dump_bmapx_row(req, nr++, brec);
+			if (nr > 10)
+				goto dump_cow;
+		}
+	}
+
+dump_cow:
+	end_bmapx_query(req);
+	trace_dumpfile(req, "DUMP BMAP OF COW FORK %s", tag);
+	start_bmapx_query(req, BMV_IF_COWFORK, req->start, req->length);
+	nr = 0;
+	while ((ret = run_bmapx_query(req, fd)) > 0) {
+		struct getbmapx	*brec;
+
+		for_each_bmapx_row(req, brec) {
+			csp_dump_bmapx_row(req, nr++, brec);
+			if (nr > 10)
+				goto dump_attr;
+		}
+	}
+
+dump_attr:
+	end_bmapx_query(req);
+	trace_dumpfile(req, "DUMP BMAP OF ATTR FORK %s", tag);
+	start_bmapx_query(req, BMV_IF_ATTRFORK, req->start, req->length);
+	nr = 0;
+	while ((ret = run_bmapx_query(req, fd)) > 0) {
+		struct getbmapx	*brec;
+
+		for_each_bmapx_row(req, brec) {
+			csp_dump_bmapx_row(req, nr++, brec);
+			if (nr > 10)
+				goto stop;
+		}
+	}
+
+stop:
+	end_bmapx_query(req);
+	trace_dumpfile(req, "DONE DUMPING %s", tag);
+}
+
+/* Return the first bmapx for the given file range. */
+static int
+bmapx_one(
+	struct clearspace_req	*req,
+	int			fd,
+	unsigned long long	pos,
+	unsigned long long	length,
+	struct getbmapx		*brec)
+{
+	struct getbmapx		bhead[2];
+	int			ret;
+
+	memset(bhead, 0, sizeof(struct getbmapx) * 2);
+	bhead[0].bmv_offset = BTOBB(pos);
+	bhead[0].bmv_length = BTOBB(length);
+	bhead[0].bmv_count = 2;
+	bhead[0].bmv_iflags = BMV_IF_PREALLOC | BMV_IF_DELALLOC;
+
+	ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead);
+	if (ret) {
+		perror(_("simple bmapx query"));
+		return -1;
+	}
+
+	if (bhead->bmv_entries > 0) {
+		memcpy(brec, &bhead[1], sizeof(struct getbmapx));
+		return 0;
+	}
+
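+	/* Nothing mapped; synthesize a hole record in bmapx (512b) units. */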
+	memset(brec, 0, sizeof(struct getbmapx));
+	brec->bmv_offset = BTOBB(pos);
+	brec->bmv_block = -1;	/* hole */
+	brec->bmv_length = BTOBB(length);
+	return 0;
+}
+
+/* Constrain space map records. */
+static void
+__trim_fsmap(
+	uint64_t		start,
+	uint64_t		length,
+	struct fsmap		*fsmap)
+{
+	unsigned long long	delta, end;
+	bool			need_off;
+
+	/* fmr_offset is only meaningful for regular owner records. */
+	need_off = !(fsmap->fmr_flags & (FMR_OF_EXTENT_MAP |
+					 FMR_OF_SPECIAL_OWNER));
+
+	if (fsmap->fmr_physical < start) {
+		delta = start - fsmap->fmr_physical;
+		fsmap->fmr_physical = start;
+		fsmap->fmr_length -= delta;
+		if (need_off)
+			fsmap->fmr_offset += delta;
+	}
+
+	end = fsmap->fmr_physical + fsmap->fmr_length;
+	if (end > start + length) {
+		delta = end - (start + length);
+		fsmap->fmr_length -= delta;
+	}
+}
+
+static inline void
+trim_target_fsmap(const struct clearspace_tgt *tgt, struct fsmap *fsmap)
+{
+	__trim_fsmap(tgt->start, tgt->length, fsmap);
+}
+
+static inline void
+trim_request_fsmap(const struct clearspace_req *req, struct fsmap *fsmap)
+{
+	__trim_fsmap(req->start, req->length, fsmap);
+}
+
+/* Actual space clearing code */
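+
+/*
+ * The helpers below implement the phases of a clearing request: grabbing
+ * free space into the space capture file, freezing existing data against
+ * overwrites, and evacuating file data and filesystem metadata from the
+ * requested region.
+ */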
+
+/*
+ * Map all the free space in the region that we're clearing to the space
+ * catcher file.
+ */
+static int
+csp_grab_free_space(
+	struct clearspace_req	*req)
+{
+	int			ret;
+
+	trace_grabfree(req, "start 0x%llx length 0x%llx",
+			(unsigned long long)req->start,
+			(unsigned long long)req->length);
+
+	ret = fallocate(req->space_fd, FALLOC_FL_MAP_FREE_SPACE, req->start,
+			req->length);
+	if (ret) {
+		perror(_("map free space to space capture file"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Rank a refcount record.  We prefer to tackle highly shared and longer
+ * extents first.
+ */
+static inline unsigned long long
+csp_space_prio(
+	const struct xfs_fsop_geom	*g,
+	const struct fsrefs		*p)
+{
+	unsigned long long		blocks = p->fcr_length / g->blocksize;
+	unsigned long long		ret = blocks * p->fcr_owners;
+
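+	/* Saturate the priority if the multiplication overflowed. */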
+	if (ret < blocks || ret < p->fcr_owners)
+		return UINT64_MAX;
+	return ret;
+}
+
+/* Make the current refcount record the clearing target if desirable. */
+static void
+csp_adjust_target(
+	struct clearspace_req		*req,
+	struct clearspace_tgt		*target,
+	const struct fsrefs		*rec,
+	unsigned long long		prio)
+{
+	if (prio < target->prio)
+		return;
+	if (prio == target->prio &&
+	    rec->fcr_length <= target->length)
+		return;
+
+	/* Ignore results that go beyond the end of what we wanted. */
+	if (rec->fcr_physical >= req->start + req->length)
+		return;
+
+	/* Ignore regions that we already tried to clear. */
+	if (bitmap_test(req->visited, rec->fcr_physical, rec->fcr_length))
+		return;
+
+	trace_target(req,
+ "set target, prio 0x%llx -> 0x%llx phys 0x%llx bytecount 0x%llx",
+			target->prio, prio,
+			(unsigned long long)rec->fcr_physical,
+			(unsigned long long)rec->fcr_length);
+
+	target->start = rec->fcr_physical;
+	target->length = rec->fcr_length;
+	target->owners = rec->fcr_owners;
+	target->prio = prio;
+}
+
+/*
+ * Decide if this refcount record maps to extents that are sufficiently
+ * interesting to target.
+ */
+static int
+csp_evaluate_refcount(
+	struct clearspace_req		*req,
+	const struct fsrefs		*rrec,
+	struct clearspace_tgt		*target)
+{
+	const struct xfs_fsop_geom	*fsgeom = &req->xfd->fsgeom;
+	unsigned long long		prio = csp_space_prio(fsgeom, rrec);
+	int				ret;
+
+	if (rrec->fcr_device != req->dev)
+		return 0;
+
+	if (prio < target->prio)
+		return 0;
+
+	/*
+	 * XFS only supports sharing data blocks.  If there's more than one
+	 * owner, we know that we can easily move the blocks.
+	 */
+	if (rrec->fcr_owners > 1) {
+		csp_adjust_target(req, target, rrec, prio);
+		return 0;
+	}
+
+	/*
+	 * Otherwise, this extent has a single owner.  Walk the fsmap records
+	 * to figure out if it's movable or not.
+	 */
+	start_fsmap_query(req, rrec->fcr_device, rrec->fcr_physical,
+			rrec->fcr_length);
+	while ((ret = run_fsmap_query(req)) > 0) {
+		struct fsmap	*mrec;
+		uint64_t	next_phys = 0;
+
+		for_each_fsmap_row(req, mrec) {
+			struct fsrefs	fake_rec = { };
+
+			trace_fsmap_rec(req, CSP_TRACE_TARGET, mrec);
+
+			if (mrec->fmr_device != rrec->fcr_device)
+				continue;
+			if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)
+				continue;
+			if (csp_is_internal_owner(req, mrec->fmr_owner))
+				continue;
+
+			/*
+			 * If the space has become shared since the fsrefs
+			 * query, just skip this record.  We might come back to
+			 * it in a later iteration.
+			 */
+			if (mrec->fmr_physical < next_phys)
+				continue;
+
+			/* Fake enough of a fsrefs to calculate the priority. */
+			fake_rec.fcr_physical = mrec->fmr_physical;
+			fake_rec.fcr_length = mrec->fmr_length;
+			fake_rec.fcr_owners = 1;
+			prio = csp_space_prio(fsgeom, &fake_rec);
+
+			/* Target unwritten extents first; they're cheap. */
+			if (mrec->fmr_flags & FMR_OF_PREALLOC)
+				prio |= (1ULL << 63);
+
+			csp_adjust_target(req, target, &fake_rec, prio);
+
+			next_phys = mrec->fmr_physical + mrec->fmr_length;
+		}
+	}
+	end_fsmap_query(req);
+
+	return ret;
+}
+
+/*
+ * Given a range of storage to search, find the most appealing target for space
+ * clearing.  If nothing suitable is found, the target will be zeroed.
+ */
+static int
+csp_find_target(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target)
+{
+	int			ret;
+
+	memset(target, 0, sizeof(struct clearspace_tgt));
+
+	start_fsrefs_query(req, req->dev, req->start, req->length);
+	while ((ret = run_fsrefs_query(req)) > 0) {
+		struct fsrefs	*rrec;
+
+		for_each_fsref_row(req, rrec) {
+			trace_fsrefs_rec(req, CSP_TRACE_TARGET, rrec);
+			ret = csp_evaluate_refcount(req, rrec, target);
+			if (ret) {
+				end_fsrefs_query(req);
+				return ret;
+			}
+		}
+	}
+	end_fsrefs_query(req);
+
+	if (target->length != 0) {
+		/*
+		 * Mark this extent visited so that we won't try again this
+		 * round.
+		 */
+		trace_bitmap(req, "set filedata start 0x%llx length 0x%llx",
+				target->start, target->length);
+		ret = bitmap_set(req->visited, target->start, target->length);
+		if (ret) {
+			perror(_("marking file extent visited"));
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/* Try to evacuate blocks by using online repair. */
+static int
+csp_evac_file_metadata(
+	struct clearspace_req		*req,
+	struct clearspace_tgt		*target,
+	const struct fsmap		*mrec,
+	int				fd,
+	const struct xfs_bulkstat	*bulkstat)
+{
+	struct xfs_scrub_metadata	scrub = {
+		.sm_type		= XFS_SCRUB_TYPE_PROBE,
+		.sm_flags		= XFS_SCRUB_IFLAG_REPAIR |
+					  XFS_SCRUB_IFLAG_FORCE_REBUILD,
+	};
+	struct xfs_fd			*xfd = req->xfd;
+	int				ret;
+
+	trace_xrebuild(req,
+ "ino 0x%llx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%llx",
+				(unsigned long long)mrec->fmr_owner,
+				(unsigned long long)mrec->fmr_offset,
+				(unsigned long long)mrec->fmr_length,
+				(unsigned long long)mrec->fmr_physical,
+				(unsigned long long)mrec->fmr_flags);
+
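+	/* Without an open fd, scrub by handle through the filesystem fd. */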
+	if (fd == -1) {
+		scrub.sm_ino = mrec->fmr_owner;
+		scrub.sm_gen = bulkstat->bs_gen;
+		fd = xfd->fd;
+	}
+
+	if (mrec->fmr_flags & FMR_OF_ATTR_FORK) {
+		if (mrec->fmr_flags & FMR_OF_EXTENT_MAP)
+			scrub.sm_type = XFS_SCRUB_TYPE_BMBTA;
+		else
+			scrub.sm_type = XFS_SCRUB_TYPE_XATTR;
+	} else if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) {
+		scrub.sm_type = XFS_SCRUB_TYPE_BMBTD;
+	} else if (S_ISLNK(bulkstat->bs_mode)) {
+		scrub.sm_type = XFS_SCRUB_TYPE_SYMLINK;
+	} else if (S_ISDIR(bulkstat->bs_mode)) {
+		scrub.sm_type = XFS_SCRUB_TYPE_DIR;
+	}
+
+	if (scrub.sm_type == XFS_SCRUB_TYPE_PROBE)
+		return 0;
+
+	trace_xrebuild(req, "ino 0x%llx gen 0x%x type %u",
+			(unsigned long long)mrec->fmr_owner,
+			(unsigned int)bulkstat->bs_gen,
+			(unsigned int)scrub.sm_type);
+
+	ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &scrub);
+	if (ret) {
+		fprintf(stderr,
+	_("evacuating inode 0x%llx metadata type %u: %s\n"),
+				(unsigned long long)mrec->fmr_owner,
+				scrub.sm_type, strerror(errno));
+		return -1;
+	}
+
+	target->evacuated++;
+	return 0;
+}
+
+/*
+ * Open an inode via handle.  Returns a file descriptor, -2 if the file is
+ * gone, or -1 on error.
+ */
+static int
+csp_open_by_handle(
+	struct clearspace_req	*req,
+	int			oflags,
+	uint64_t		ino,
+	uint32_t		gen)
+{
+	struct xfs_handle	handle = { };
+	struct xfs_fsop_handlereq hreq = {
+		.oflags		= oflags | O_NOATIME | O_NOFOLLOW |
+				  O_NOCTTY | O_LARGEFILE,
+		.ihandle	= &handle,
+		.ihandlen	= sizeof(handle),
+	};
+	int			ret;
+
+	memcpy(&handle.ha_fsid, req->fshandle, sizeof(handle.ha_fsid));
+	handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+			sizeof(handle.ha_fid.fid_len);
+	handle.ha_fid.fid_pad = 0;
+	handle.ha_fid.fid_ino = ino;
+	handle.ha_fid.fid_gen = gen;
+
+	/*
+	 * Since we extracted the fshandle from the open file instead of using
+	 * path_to_fshandle, the fsid cache doesn't know about the fshandle.
+	 * Construct the open by handle request manually.
+	 */
+	ret = ioctl(req->xfd->fd, XFS_IOC_OPEN_BY_HANDLE, &hreq);
+	if (ret < 0) {
+		if (errno == ENOENT || errno == EINVAL)
+			return -2;
+
+		fprintf(stderr, _("open inode 0x%llx: %s\n"),
+				(unsigned long long)ino,
+				strerror(errno));
+		return -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Open a file for evacuation.  Returns a positive errno on error.  On
+ * success, returns zero with an open fd in @fd if the caller is supposed to
+ * do something, or with @fd == -1 if there's nothing further to do.
+ */
+static int
+csp_evac_open(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	const struct fsmap	*mrec,
+	struct xfs_bulkstat	*bulkstat,
+	int			oflags,
+	int			*fd)
+{
+	struct xfs_bulkstat	__bs;
+	int			target_fd;
+	int			ret;
+
+	*fd = -1;
+
+	if (csp_is_internal_owner(req, mrec->fmr_owner) ||
+	    (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER))
+		goto nothing_to_do;
+
+	if (bulkstat == NULL)
+		bulkstat = &__bs;
+
+	/*
+	 * Snapshot this file's state so that we can perform a fresh-only
+	 * exchange; non-regular files skip straight to the online rebuild
+	 * step below.
+	 */
+	ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, bulkstat);
+	if (ret) {
+		if (ret == ENOENT || ret == EINVAL)
+			goto nothing_to_do;
+
+		fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"),
+				(unsigned long long)mrec->fmr_owner,
+				strerror(ret));
+		return ret;
+	}
+
+	/*
+	 * If we get stats for a different inode, the file may have been freed
+	 * out from under us and there's nothing to do.
+	 */
+	if (bulkstat->bs_ino != mrec->fmr_owner)
+		goto nothing_to_do;
+
+	/*
+	 * We're only allowed to open regular files and directories via handle
+	 * so jump to online rebuild for all other file types.
+	 */
+	if (!S_ISREG(bulkstat->bs_mode) && !S_ISDIR(bulkstat->bs_mode))
+		return csp_evac_file_metadata(req, target, mrec, -1,
+				bulkstat);
+
+	if (S_ISDIR(bulkstat->bs_mode))
+		oflags = O_RDONLY;
+
+	target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner,
+			bulkstat->bs_gen);
+	if (target_fd == -2)
+		goto nothing_to_do;
+	if (target_fd < 0)
+		return -target_fd;
+
+	/*
+	 * Exchange only works for regular file data blocks.  If that isn't the
+	 * case, our only recourse is online rebuild.
+	 */
+	if (S_ISDIR(bulkstat->bs_mode) ||
+	    (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) {
+		int	ret2;
+
+		ret = csp_evac_file_metadata(req, target, mrec, target_fd,
+				bulkstat);
+		ret2 = close(target_fd);
+		if (!ret && ret2)
+			ret = ret2;
+		return ret;
+	}
+
+	*fd = target_fd;
+	return 0;
+
+nothing_to_do:
+	target->try_again = true;
+	return 0;
+}
+
+/* Unshare the space in the work file that we're using for deduplication. */
+static int
+csp_unshare_workfile(
+	struct clearspace_req	*req,
+	unsigned long long	start,
+	unsigned long long	length)
+{
+	int			ret;
+
+	trace_falloc(req, "funshare workfd pos 0x%llx bytecount 0x%llx",
+			start, length);
+
+	ret = fallocate(req->work_fd, FALLOC_FL_UNSHARE_RANGE, start, length);
+	if (ret) {
+		perror(_("unsharing work file"));
+		return ret;
+	}
+
+	ret = fsync(req->work_fd);
+	if (ret) {
+		perror(_("syncing work file"));
+		return ret;
+	}
+
+	/* Make sure we didn't get any space within the clearing range. */
+	start_bmapx_query(req, 0, start, length);
+	while ((ret = run_bmapx_query(req, req->work_fd)) > 0) {
+		struct getbmapx	*brec;
+
+		for_each_bmapx_row(req, brec) {
+			unsigned long long	p, l;
+
+			trace_bmapx_rec(req, CSP_TRACE_FALLOC, brec);
+			p = BBTOB(brec->bmv_block);
+			l = BBTOB(brec->bmv_length);
+
+			if (p + l < req->start || p >= req->start + req->length)
+				continue;
+
+			trace_prep(req,
+	"workfd has extent inside clearing range, phys 0x%llx fsbcount 0x%llx",
+					p, l);
+			end_bmapx_query(req);
+			return -1;
+		}
+	}
+	end_bmapx_query(req);
+
+	return 0;
+}
+
+/* Try to deduplicate every block in the fdr request, if we can. */
+static int
+csp_evac_dedupe_loop(
+	struct clearspace_req		*req,
+	struct clearspace_tgt		*target,
+	unsigned long long		ino,
+	int				max_reqlen)
+{
+	struct file_dedupe_range	*fdr = req->fdr;
+	struct file_dedupe_range_info	*info = &fdr->info[0];
+	loff_t				last_unshare_off = -1;
+	int				ret;
+
+	while (fdr->src_length > 0) {
+		struct getbmapx		brec;
+		bool			same;
+		unsigned long long	old_reqlen = fdr->src_length;
+
+		if (max_reqlen && fdr->src_length > max_reqlen)
+			fdr->src_length = max_reqlen;
+
+		trace_dedupe(req, "ino 0x%llx pos 0x%llx bytecount 0x%llx",
+				ino,
+				(unsigned long long)info->dest_offset,
+				(unsigned long long)fdr->src_length);
+
+		ret = bmapx_one(req, req->work_fd, fdr->src_offset,
+				fdr->src_length, &brec);
+		if (ret)
+			return ret;
+
+		trace_dedupe(req, "workfd pos 0x%llx phys 0x%llx",
+				(unsigned long long)fdr->src_offset,
+				(unsigned long long)BBTOB(brec.bmv_block));
+
+		ret = deduperange(req->work_fd, fdr, &same);
+		if (ret == ENOSPC && last_unshare_off < fdr->src_offset) {
+			req->trace_indent++;
+			trace_dedupe(req, "funshare workfd at phys 0x%llx",
+					(unsigned long long)fdr->src_offset);
+			/*
+			 * If we ran out of space, it's possible that we have
+			 * reached the maximum sharing factor of the blocks in
+			 * the work file.  Try unsharing the range of the work
+			 * file to get a singly-owned range and loop again.
+			 */
+			ret = csp_unshare_workfile(req, fdr->src_offset,
+					fdr->src_length);
+			req->trace_indent--;
+			if (ret)
+				return ret;
+
+			ret = fsync(req->work_fd);
+			if (ret) {
+				perror(_("sync after unshare work file"));
+				return ret;
+			}
+
+			last_unshare_off = fdr->src_offset;
+			fdr->src_length = old_reqlen;
+			continue;
+		}
+		if (ret) {
+			fprintf(stderr, _("evacuating inode 0x%llx: %s\n"),
+					ino, strerror(ret));
+			return ret;
+		}
+
+		if (same) {
+			req->trace_indent++;
+			trace_dedupe(req,
+	"evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
+					ino,
+					(unsigned long long)info->dest_offset,
+					(unsigned long long)info->bytes_deduped);
+			req->trace_indent--;
+
+			target->evacuated++;
+		} else {
+			req->trace_indent++;
+			trace_dedupe(req,
+	"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
+					ino,
+					(unsigned long long)info->dest_offset,
+					(unsigned long long)fdr->src_length);
+			req->trace_indent--;
+
+			target->try_again = true;
+
+			/*
+			 * If we aren't single-stepping the deduplication,
+			 * stop early so that the caller goes into single-step
+			 * mode.
+			 */
+			if (!max_reqlen) {
+				fdr->src_length = old_reqlen;
+				return 0;
+			}
+
+			/* Contents changed, move on to the next block. */
+			info->bytes_deduped = fdr->src_length;
+		}
+		fdr->src_length = old_reqlen;
+
+		fdr->src_offset += info->bytes_deduped;
+		info->dest_offset += info->bytes_deduped;
+		fdr->src_length -= info->bytes_deduped;
+	}
+
+	return 0;
+}
+
+/*
+ * Evacuate one fsmapping by using dedupe to remap data stored in the target
+ * range to a copy stored in the work file.
+ */
+static int
+csp_evac_dedupe_fsmap(
+	struct clearspace_req		*req,
+	struct clearspace_tgt		*target,
+	const struct fsmap		*mrec)
+{
+	struct file_dedupe_range	*fdr = req->fdr;
+	struct file_dedupe_range_info	*info = &fdr->info[0];
+	bool				can_single_step;
+	int				target_fd;
+	int				ret, ret2;
+
+	if (mrec->fmr_device != req->dev) {
+		fprintf(stderr, _("wrong fsmap device in results.\n"));
+		return -1;
+	}
+
+	ret = csp_evac_open(req, target, mrec, NULL, O_RDONLY, &target_fd);
+	if (ret || target_fd < 0)
+		return ret;
+
+	/*
+	 * Use dedupe to try to shift the target file's mappings to use the
+	 * copy of the data that's in the work file.
+	 */
+	fdr->src_offset = mrec->fmr_physical;
+	fdr->src_length = mrec->fmr_length;
+	fdr->dest_count = 1;
+	info->dest_fd = target_fd;
+	info->dest_offset = mrec->fmr_offset;
+
+	can_single_step = mrec->fmr_length > req->xfd->fsgeom.blocksize;
+
+	/* First we try to do the entire thing all at once. */
+	ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, 0);
+	if (ret)
+		goto out_fd;
+
+	/* If there's any work left, try again one block at a time. */
+	if (can_single_step && fdr->src_length > 0) {
+		ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner,
+				req->xfd->fsgeom.blocksize);
+		if (ret)
+			goto out_fd;
+	}
+
+out_fd:
+	ret2 = close(target_fd);
+	if (!ret && ret2)
+		ret = ret2;
+	return ret;
+}
+
+/* Use deduplication to remap data extents away from where we're clearing. */
+static int
+csp_evac_dedupe(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target)
+{
+	int			ret;
+
+	start_fsmap_query(req, req->dev, target->start, target->length);
+	while ((ret = run_fsmap_query(req)) > 0) {
+		struct fsmap	*mrec;
+
+		for_each_fsmap_row(req, mrec) {
+			trace_fsmap_rec(req, CSP_TRACE_DEDUPE, mrec);
+			trim_target_fsmap(target, mrec);
+
+			req->trace_indent++;
+			ret = csp_evac_dedupe_fsmap(req, target, mrec);
+			req->trace_indent--;
+			if (ret)
+				goto out;
+
+			ret = csp_grab_free_space(req);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	end_fsmap_query(req);
+	if (ret)
+		trace_dedupe(req, "ret %d", ret);
+	return ret;
+}
+
+#define BUFFERCOPY_BUFSZ		65536
+
+/* Use a memory buffer to copy part of src_fd to dst_fd, or return an errno. */
+static int
+csp_buffercopy(
+	struct clearspace_req	*req,
+	int			src_fd,
+	loff_t			src_off,
+	int			dst_fd,
+	loff_t			dst_off,
+	loff_t			len)
+{
+	int			ret = 0;
+
+	while (len > 0) {
+		size_t count = min(BUFFERCOPY_BUFSZ, len);
+		ssize_t bytes_read, bytes_written;
+
+		bytes_read = pread(src_fd, req->buf, count, src_off);
+		if (bytes_read < 0) {
+			ret = errno;
+			break;
+		}
+		if (bytes_read == 0)
+			break;	/* EOF; nothing left to copy */
+
+		bytes_written = pwrite(dst_fd, req->buf, bytes_read, dst_off);
+		if (bytes_written < 0) {
+			ret = errno;
+			break;
+		}
+
+		src_off += bytes_written;
+		dst_off += bytes_written;
+		len -= bytes_written;
+	}
+
+	return ret;
+}
+
+/*
+ * Prepare the work file to assist in evacuating file data by copying the
+ * contents of the frozen space into the work file.
+ */
+static int
+csp_prepare_for_dedupe(
+	struct clearspace_req	*req)
+{
+	struct file_clone_range	fcr;
+	struct stat		statbuf;
+	loff_t			datapos = 0;
+	loff_t			length = 0;
+	int			ret;
+
+	ret = fstat(req->space_fd, &statbuf);
+	if (ret) {
+		perror(_("space capture file"));
+		return ret;
+	}
+
+	ret = ftruncate(req->work_fd, 0);
+	if (ret) {
+		perror(_("truncate work file"));
+		return ret;
+	}
+
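+	/* Size the work file to mirror the space capture file's layout. */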
+	ret = ftruncate(req->work_fd, statbuf.st_size);
+	if (ret) {
+		perror(_("reset work file"));
+		return ret;
+	}
+
+	/* Make a working copy of the frozen file data. */
+	start_spacefd_iter(req);
+	while ((ret = spacefd_data_iter(req, &datapos, &length)) > 0) {
+		trace_prep(req, "clone spacefd data 0x%llx length 0x%llx",
+				(long long)datapos, (long long)length);
+
+		fcr.src_fd = req->space_fd;
+		fcr.src_offset = datapos;
+		fcr.src_length = length;
+		fcr.dest_offset = datapos;
+
+		ret = clonerange(req->work_fd, &fcr);
+		if (ret == ENOSPC) {
+			req->trace_indent++;
+			trace_prep(req,
+	"falling back to buffered copy at 0x%llx",
+					(long long)datapos);
+			req->trace_indent--;
+			ret = csp_buffercopy(req, req->space_fd, datapos,
+					req->work_fd, datapos, length);
+		}
+		if (ret) {
+			perror(
+	_("copying space capture file contents to work file"));
+			return ret;
+		}
+	}
+	end_spacefd_iter(req);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Unshare the work file so that it contains an identical copy of the
+	 * contents of the space capture file but mapped to different blocks.
+	 * This is key to using dedupe to migrate file space away from the
+	 * requested region.
+	 */
+	req->trace_indent++;
+	ret = csp_unshare_workfile(req, req->start, req->length);
+	req->trace_indent--;
+	return ret;
+}
+
+/*
+ * Evacuate one fsmapping by copying its data to the work file and exchanging
+ * the target file's mappings with the work file's fresh copies.
+ */
+static int
+csp_evac_exchange_fsmap(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	const struct fsmap	*mrec)
+{
+	struct xfs_bulkstat	bulkstat;
+	struct file_xchg_range	xchg = { };
+	struct getbmapx		brec;
+	int			target_fd;
+	int			ret, ret2;
+
+	if (mrec->fmr_device != req->dev) {
+		fprintf(stderr, _("wrong fsmap device in results.\n"));
+		return -1;
+	}
+
+	ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd);
+	if (ret || target_fd < 0)
+		return ret;
+
+	ret = ftruncate(req->work_fd, 0);
+	if (ret) {
+		perror(_("truncating work file"));
+		goto out_fd;
+	}
+
+	/*
+	 * Copy the data from the original file to the work file.  We assume
+	 * that the work file will end up with different data blocks and that
+	 * they're outside of the requested range.
+	 */
+	ret = csp_buffercopy(req, target_fd, mrec->fmr_offset, req->work_fd,
+			mrec->fmr_offset, mrec->fmr_length);
+	if (ret) {
+		fprintf(stderr, _("copying target file to work file: %s\n"),
+				strerror(ret));
+		goto out_fd;
+	}
+
+	ret = fsync(req->work_fd);
+	if (ret) {
+		perror(_("flush work file for fiexchange"));
+		goto out_fd;
+	}
+
+	ret = bmapx_one(req, req->work_fd, mrec->fmr_physical,
+			mrec->fmr_length, &brec);
+	if (ret)
+		goto out_fd;
+
+	trace_fiexchange(req, "workfd pos 0x%llx phys 0x%llx",
+			(unsigned long long)mrec->fmr_physical,
+			(unsigned long long)BBTOB(brec.bmv_block));
+
+	/*
+	 * Exchange the mappings, with the freshness check enabled.  This
+	 * should result in the target file being switched to new blocks unless
+	 * it has changed, in which case we bounce out and find a new target.
+	 */
+	xfrog_file_exchange_prep(NULL, FILE_XCHG_RANGE_NONATOMIC,
+			mrec->fmr_offset, req->work_fd, mrec->fmr_offset,
+			mrec->fmr_length, &xchg);
+	xfrog_file_exchange_require_file2_fresh(&xchg, &bulkstat);
+	ret = exchangerange(target_fd, &xchg);
+	if (ret) {
+		if (ret == EBUSY) {
+			req->trace_indent++;
+			trace_fiexchange(req,
+ "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
+					bulkstat.bs_ino,
+					(unsigned long long)mrec->fmr_offset,
+					(unsigned long long)mrec->fmr_length);
+			req->trace_indent--;
+			target->try_again = true;
+		} else {
+			fprintf(stderr,
+	_("exchanging target and work file contents: %s\n"),
+					strerror(ret));
+		}
+		goto out_fd;
+	}
+
+	req->trace_indent++;
+	trace_fiexchange(req,
+ "evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
+			bulkstat.bs_ino,
+			(unsigned long long)mrec->fmr_offset,
+			(unsigned long long)mrec->fmr_length);
+	req->trace_indent--;
+	target->evacuated++;
+
+out_fd:
+	ret2 = close(target_fd);
+	if (!ret && ret2)
+		ret = ret2;
+	return ret;
+}
+
+/*
+ * Try to evacuate all data blocks in the target region by copying the contents
+ * to a new file and exchanging the extents.
+ */
+static int
+csp_evac_exchange(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target)
+{
+	int			ret;
+
+	start_fsmap_query(req, req->dev, target->start, target->length);
+	while ((ret = run_fsmap_query(req)) > 0) {
+		struct fsmap	*mrec;
+
+		for_each_fsmap_row(req, mrec) {
+			trace_fsmap_rec(req, CSP_TRACE_FIEXCHANGE, mrec);
+			trim_target_fsmap(target, mrec);
+
+			req->trace_indent++;
+			ret = csp_evac_exchange_fsmap(req, target, mrec);
+			req->trace_indent--;
+			if (ret)
+				goto out;
+
+			ret = csp_grab_free_space(req);
+			if (ret)
+				goto out;
+		}
+	}
+out:
+	end_fsmap_query(req);
+	if (ret)
+		trace_fiexchange(req, "ret %d", ret);
+	return ret;
+}
+
+/* Try to evacuate blocks by using online repair to rebuild AG metadata. */
+static int
+csp_evac_ag_metadata(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	uint32_t		agno,
+	uint32_t		mask)
+{
+	struct xfs_scrub_metadata scrub = {
+		.sm_agno	= agno,
+		.sm_flags	= XFS_SCRUB_IFLAG_REPAIR |
+				  XFS_SCRUB_IFLAG_FORCE_REBUILD,
+	};
+	unsigned int		i;
+	int			ret;
+
+	trace_xrebuild(req, "agno 0x%x mask 0x%x",
+			(unsigned int)agno,
+			(unsigned int)mask);
+
+	for (i = XFS_SCRUB_TYPE_AGFL; i <= XFS_SCRUB_TYPE_REFCNTBT; i++) {
+
+		if (!(mask & (1U << i)))
+			continue;
+
+		scrub.sm_type = i;
+
+		req->trace_indent++;
+		trace_xrebuild(req, "agno %u type %u",
+				(unsigned int)agno,
+				(unsigned int)scrub.sm_type);
+		req->trace_indent--;
+
+		ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub);
+		if (ret) {
+			if (errno == ENOENT || errno == ENOSPC)
+				continue;
+			fprintf(stderr, _("rebuilding ag %u type %u: %s\n"),
+					(unsigned int)agno, scrub.sm_type,
+					strerror(errno));
+			return -1;
+		}
+
+		target->evacuated++;
+
+		ret = csp_grab_free_space(req);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* Compute a scrub mask for a fsmap special owner. */
+static uint32_t
+fsmap_owner_to_scrub_mask(__u64 owner)
+{
+	switch (owner) {
+	case XFS_FMR_OWN_FREE:
+	case XFS_FMR_OWN_UNKNOWN:
+	case XFS_FMR_OWN_FS:
+	case XFS_FMR_OWN_LOG:
+		/* can't move these */
+		return 0;
+	case XFS_FMR_OWN_AG:
+		return (1U << XFS_SCRUB_TYPE_BNOBT) |
+		       (1U << XFS_SCRUB_TYPE_CNTBT) |
+		       (1U << XFS_SCRUB_TYPE_AGFL) |
+		       (1U << XFS_SCRUB_TYPE_RMAPBT);
+	case XFS_FMR_OWN_INOBT:
+		return (1U << XFS_SCRUB_TYPE_INOBT) |
+		       (1U << XFS_SCRUB_TYPE_FINOBT);
+	case XFS_FMR_OWN_REFC:
+		return (1U << XFS_SCRUB_TYPE_REFCNTBT);
+	case XFS_FMR_OWN_INODES:
+	case XFS_FMR_OWN_COW:
+		/* don't know how to get rid of these */
+		return 0;
+	case XFS_FMR_OWN_DEFECTIVE:
+		/* good, get rid of it */
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+/* Try to clear all per-AG metadata from the requested range. */
+static int
+csp_evac_fs_metadata(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target,
+	bool			*cleared_anything)
+{
+	uint32_t		curr_agno = -1U;
+	uint32_t		curr_mask = 0;
+	int			ret = 0;
+
+	if (req->realtime)
+		return 0;
+
+	start_fsmap_query(req, req->dev, target->start, target->length);
+	while ((ret = run_fsmap_query(req)) > 0) {
+		struct fsmap	*mrec;
+
+		for_each_fsmap_row(req, mrec) {
+			uint64_t	daddr;
+			uint32_t	agno;
+			uint32_t	mask;
+
+			if (mrec->fmr_device != req->dev)
+				continue;
+			if (!(mrec->fmr_flags & FMR_OF_SPECIAL_OWNER))
+				continue;
+
+			/* Ignore regions that we already tried to clear. */
+			if (bitmap_test(req->visited, mrec->fmr_physical,
+						mrec->fmr_length))
+				continue;
+
+			mask = fsmap_owner_to_scrub_mask(mrec->fmr_owner);
+			if (!mask)
+				continue;
+
+			trace_fsmap_rec(req, CSP_TRACE_XREBUILD, mrec);
+
+			daddr = BTOBB(mrec->fmr_physical);
+			agno = cvt_daddr_to_agno(req->xfd, daddr);
+
+			trace_xrebuild(req,
+	"agno 0x%x -> 0x%x mask 0x%x owner %lld",
+					curr_agno, agno, curr_mask,
+					(unsigned long long)mrec->fmr_owner);
+
+			if (curr_agno == -1U) {
+				curr_agno = agno;
+			} else if (curr_agno != agno) {
+				ret = csp_evac_ag_metadata(req, target,
+						curr_agno, curr_mask);
+				if (ret)
+					goto out;
+
+				*cleared_anything = true;
+				curr_agno = agno;
+				curr_mask = 0;
+			}
+
+			/* Put this on the list and try to clear it once. */
+			curr_mask |= mask;
+			ret = bitmap_set(req->visited, mrec->fmr_physical,
+					mrec->fmr_length);
+			if (ret) {
+				perror(_("marking metadata extent visited"));
+				goto out;
+			}
+		}
+	}
+
+	if (curr_agno != -1U && curr_mask != 0) {
+		ret = csp_evac_ag_metadata(req, target, curr_agno, curr_mask);
+		if (ret)
+			goto out;
+		*cleared_anything = true;
+	}
+
+	if (*cleared_anything)
+		trace_bitmap(req, "set metadata start 0x%llx length 0x%llx",
+				target->start, target->length);
+
+out:
+	end_fsmap_query(req);
+	if (ret)
+		trace_xrebuild(req, "ret %d", ret);
+	return ret;
+}
+
+/*
+ * Check that at least the start of the mapping was frozen into the work file
+ * at the correct offset.  Set @len to the number of bytes that were frozen.
+ * Returns -1 for error, zero if written extents are waiting to be mapped into
+ * the space capture file, or 1 if there's nothing to transfer to the space
+ * capture file.
+ */
+static int
+csp_freeze_check_attempt(
+	struct clearspace_req	*req,
+	const struct fsmap	*mrec,
+	unsigned long long	*len)
+{
+	struct getbmapx		brec;
+	int			ret;
+
+	*len = 0;
+
+	ret = bmapx_one(req, req->work_fd, mrec->fmr_physical,
+			mrec->fmr_length, &brec);
+	if (ret)
+		return ret;
+
+	trace_freeze(req,
+ "does workfd pos 0x%llx len 0x%llx map to phys 0x%llx len 0x%llx?",
+			(unsigned long long)mrec->fmr_physical,
+			(unsigned long long)mrec->fmr_length,
+			(unsigned long long)BBTOB(brec.bmv_block),
+			(unsigned long long)BBTOB(brec.bmv_length));
+
+	/* freeze of an unwritten extent punches a hole in the work file. */
+	if ((mrec->fmr_flags & FMR_OF_PREALLOC) && brec.bmv_block == -1) {
+		*len = BBTOB(brec.bmv_length);
+		return 1;
+	}
+
+	/*
+	 * freeze of a written extent must result in the same physical space
+	 * being mapped into the work file.
+	 */
+	if (!(mrec->fmr_flags & FMR_OF_PREALLOC) &&
+	    BBTOB(brec.bmv_block) == mrec->fmr_physical) {
+		*len = BBTOB(brec.bmv_length);
+		return 0;
+	}
+
+	/*
+	 * We didn't find what we were looking for, which implies that the
+	 * mapping changed out from under us.  Punch out everything that could
+	 * have been mapped into the work file.  Set @len to zero and return so
+	 * that we try again with the next mapping.
+	 */
+
+	trace_falloc(req, "fpunch workfd pos 0x%llx bytecount 0x%llx",
+			(unsigned long long)mrec->fmr_physical,
+			(unsigned long long)mrec->fmr_length);
+
+	ret = fallocate(req->work_fd,
+			FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+			mrec->fmr_physical, mrec->fmr_length);
+	if (ret) {
+		perror(_("resetting work file after failed freeze"));
+		return ret;
+	}
+
+	return 1;
+}
+
+/*
+ * Open a file to try to freeze whatever data is in the requested range.
+ *
+ * Returns nonzero on error.  Returns zero and a file descriptor in @fd if the
+ * caller is supposed to do something; or returns zero and @fd == -1 if there's
+ * nothing to freeze.
+ */
+static int
+csp_freeze_open(
+	struct clearspace_req	*req,
+	const struct fsmap	*mrec,
+	int			*fd)
+{
+	struct xfs_bulkstat	bulkstat;
+	int			target_fd;
+	int			ret;
+
+	*fd = -1;
+
+	ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, &bulkstat);
+	if (ret) {
+		if (ret == ENOENT || ret == EINVAL)
+			return 0;
+
+		fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"),
+				(unsigned long long)mrec->fmr_owner,
+				strerror(ret));
+		return ret;
+	}
+
+	/*
+	 * If we get stats for a different inode, the file may have been freed
+	 * out from under us and there's nothing to do.
+	 */
+	if (bulkstat.bs_ino != mrec->fmr_owner)
+		return 0;
+
+	/* Skip anything we can't freeze. */
+	if (!S_ISREG(bulkstat.bs_mode) && !S_ISDIR(bulkstat.bs_mode))
+		return 0;
+
+	target_fd = csp_open_by_handle(req, O_RDONLY, mrec->fmr_owner,
+			bulkstat.bs_gen);
+	if (target_fd == -2)
+		return 0;
+	if (target_fd < 0)
+		return target_fd;
+
+	/*
+	 * Skip mappings for directories, xattr data, and block mapping btree
+	 * blocks.  We still have to close the file though.
+	 */
+	if (S_ISDIR(bulkstat.bs_mode) ||
+	    (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) {
+		return close(target_fd);
+	}
+
+	*fd = target_fd;
+	return 0;
+}
+
+/*
+ * Given a fsmap, try to reflink the physical space into the space capture
+ * file.
+ */
+static int
+csp_freeze_req_fsmap(
+	struct clearspace_req	*req,
+	unsigned long long	*cursor,
+	const struct fsmap	*mrec)
+{
+	struct fsmap		short_mrec;
+	struct file_clone_range	fcr = { };
+	unsigned long long	frozen_len;
+	int			src_fd;
+	int			ret, ret2;
+
+	if (mrec->fmr_device != req->dev) {
+		fprintf(stderr, _("wrong fsmap device in results.\n"));
+		return -1;
+	}
+
+	/* Ignore mappings for our secret files. */
+	if (csp_is_internal_owner(req, mrec->fmr_owner))
+		return 0;
+
+	/* Ignore mappings before the cursor. */
+	if (mrec->fmr_physical + mrec->fmr_length < *cursor)
+		return 0;
+
+	/* Jump past mappings for metadata. */
+	if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)
+		goto skip;
+
+	/*
+	 * Open this file so that we can try to freeze its data blocks.
+	 * For other types of files we just skip to the evacuation step.
+	 */
+	ret = csp_freeze_open(req, mrec, &src_fd);
+	if (ret)
+		return ret;
+	if (src_fd < 0)
+		goto skip;
+
+	/*
+	 * If the cursor is in the middle of this mapping, increase the start
+	 * of the mapping to start at the cursor.
+	 */
+	if (mrec->fmr_physical < *cursor) {
+		unsigned long long	delta = *cursor - mrec->fmr_physical;
+
+		short_mrec = *mrec;
+		short_mrec.fmr_physical = *cursor;
+		short_mrec.fmr_offset += delta;
+		short_mrec.fmr_length -= delta;
+
+		mrec = &short_mrec;
+	}
+
+	req->trace_indent++;
+	if (mrec->fmr_length == 0) {
+		trace_freeze(req, "skipping zero-length freeze", 0);
+		goto out_fd;
+	}
+
+	/*
+	 * Reflink the mapping from the source file into the work file.  If we
+	 * can't do that, we're sunk.  If the mapping is unwritten, we'll leave
+	 * a hole in the work file.
+	 */
+	fcr.src_fd = src_fd;
+	fcr.src_offset = mrec->fmr_offset;
+	fcr.src_length = mrec->fmr_length;
+	fcr.dest_offset = mrec->fmr_physical;
+
+	trace_freeze(req, "freeze to workfd pos 0x%llx",
+			(unsigned long long)fcr.dest_offset);
+
+	ret = clonerange(req->work_fd, &fcr);
+	if (ret) {
+		fprintf(stderr, _("freezing space to work file: %s\n"),
+				strerror(ret));
+		goto out_fd;
+	}
+
+	req->trace_indent++;
+	ret = csp_freeze_check_attempt(req, mrec, &frozen_len);
+	req->trace_indent--;
+	if (ret < 0)
+		goto out_fd;
+	if (ret == 1) {
+		ret = 0;
+		goto advance;
+	}
+
+	/*
+	 * We've frozen the mapping by reflinking it into the work file and
+	 * confirmed that the work file has the space we wanted.  Now we need
+	 * to map the same extent into the space capture file.  If reflink
+	 * fails because we're out of space, fall back to FIEXCHANGE.  The end
+	 * goal is to populate the space capture file; we don't care about
+	 * the contents of the work file.
+	 */
+	fcr.src_fd = req->work_fd;
+	fcr.src_offset = mrec->fmr_physical;
+	fcr.dest_offset = mrec->fmr_physical;
+	fcr.src_length = frozen_len;
+
+	trace_freeze(req, "link phys 0x%llx len 0x%llx to spacefd",
+			(unsigned long long)mrec->fmr_physical,
+			(unsigned long long)frozen_len);
+
+	ret = clonerange(req->space_fd, &fcr);
+	if (ret == ENOSPC) {
+		struct file_xchg_range	xchg;
+
+		xfrog_file_exchange_prep(NULL, FILE_XCHG_RANGE_NONATOMIC,
+				mrec->fmr_physical, req->work_fd,
+				mrec->fmr_physical, frozen_len, &xchg);
+		ret = exchangerange(req->space_fd, &xchg);
+	}
+	if (ret) {
+		fprintf(stderr, _("freezing space to space capture file: %s\n"),
+				strerror(ret));
+		goto out_fd;
+	}
+
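+	/*
+	 * Only advance the cursor past the blocks that we know were frozen;
+	 * later passes will revisit whatever remains of this mapping.
+	 */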
+advance:
+	*cursor += frozen_len;
+out_fd:
+	ret2 = close(src_fd);
+	if (!ret && ret2)
+		ret = ret2;
+	req->trace_indent--;
+	if (ret)
+		trace_freeze(req, "ret %d", ret);
+	return ret;
+skip:
+	*cursor += mrec->fmr_length;
+	return 0;
+}
+
+/*
+ * Try to freeze all the space in the requested range against overwrites.
+ *
+ * For each file data fsmap within each hole in the part of the space capture
+ * file corresponding to the requested range, try to reflink the space into the
+ * space capture file so that any subsequent writes to the original owner are
+ * CoW and nobody else can allocate the space.  If we cannot use reflink to
+ * freeze all the space, we cannot proceed with the clearing.
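+ *
+ * In outline, the loop below does the following:
+ *
+ *	for each hole H in the space capture file:
+ *		for each fsmap record M overlapping H:
+ *			reflink M's blocks into the work file;
+ *			reflink (or exchange) the frozen blocks into
+ *			the space capture file;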
+ */
+static int
+csp_freeze_req_range(
+	struct clearspace_req	*req)
+{
+	unsigned long long	cursor = req->start;
+	loff_t			holepos = 0;
+	loff_t			length = 0;
+	int			ret;
+
+	ret = ftruncate(req->space_fd, req->start + req->length);
+	if (ret) {
+		perror(_("setting up space capture file"));
+		return ret;
+	}
+
+	if (!req->use_reflink)
+		return 0;
+
+	start_spacefd_iter(req);
+	while ((ret = spacefd_hole_iter(req, &holepos, &length)) > 0) {
+		trace_freeze(req, "spacefd hole 0x%llx length 0x%llx",
+				(long long)holepos, (long long)length);
+
+		start_fsmap_query(req, req->dev, holepos, length);
+		while ((ret = run_fsmap_query(req)) > 0) {
+			struct fsmap	*mrec;
+
+			for_each_fsmap_row(req, mrec) {
+				trace_fsmap_rec(req, CSP_TRACE_FREEZE, mrec);
+				trim_request_fsmap(req, mrec);
+				ret = csp_freeze_req_fsmap(req, &cursor, mrec);
+				if (ret) {
+					end_fsmap_query(req);
+					goto out;
+				}
+			}
+		}
+		end_fsmap_query(req);
+	}
+out:
+	end_spacefd_iter(req);
+	return ret;
+}
+
+/*
+ * Dump all speculative preallocations, COW staging blocks, and inactive inodes
+ * to try to free up as much space as we can.
+ */
+static int
+csp_collect_garbage(
+	struct clearspace_req	*req)
+{
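+	/* Ask for a synchronous scan so the garbage is gone when we return. */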
+	struct xfs_fs_eofblocks	eofb = {
+		.eof_version	= XFS_EOFBLOCKS_VERSION,
+		.eof_flags	= XFS_EOF_FLAGS_SYNC,
+	};
+	int			ret;
+
+	ret = ioctl(req->xfd->fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
+	if (ret) {
+		perror(_("xfs garbage collector"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Set up the target to clear all metadata from the given range. */
+static inline void
+csp_target_metadata(
+	struct clearspace_req	*req,
+	struct clearspace_tgt	*target)
+{
+	target->start = req->start;
+	target->length = req->length;
+	target->prio = 0;
+	target->evacuated = 0;
+	target->owners = 0;
+	target->try_again = false;
+}
+
+/*
+ * Loop through the space to find the most appealing part of the device to
+ * clear, then try to evacuate everything within.
+ */
+int
+clearspace_run(
+	struct clearspace_req	*req)
+{
+	struct clearspace_tgt	target;
+	const struct csp_errstr	*es;
+	bool			cleared_anything;
+	int			ret;
+
+	if (req->trace_mask) {
+		fprintf(stderr, "debug flags 0x%x:", req->trace_mask);
+		for (es = errtags; es->tag; es++) {
+			if (req->trace_mask & es->mask)
+				fprintf(stderr, " %s", es->tag);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	req->trace_indent = 0;
+	trace_status(req,
+ _("Clearing dev %u:%u physical 0x%llx bytecount 0x%llx."),
+			major(req->dev), minor(req->dev),
+			req->start, req->length);
+
+	if (req->trace_mask & ~CSP_TRACE_STATUS)
+		trace_status(req, "reflink? %d evac_metadata? %d",
+				req->use_reflink, req->can_evac_metadata);
+
+	ret = bitmap_alloc(&req->visited);
+	if (ret) {
+		perror(_("allocating visited bitmap"));
+		return ret;
+	}
+
+	/*
+	 * Empty out CoW forks and speculative post-EOF preallocations before
+	 * starting the clearing process.  This may be somewhat overkill.
+	 */
+	ret = syncfs(req->xfd->fd);
+	if (ret) {
+		perror(_("syncing filesystem"));
+		goto out_bitmap;
+	}
+
+	ret = csp_collect_garbage(req);
+	if (ret)
+		goto out_bitmap;
+
+	/*
+	 * Try to freeze as much of the requested range as we can, grab the
+	 * free space in that range, and run freeze again to pick up anything
+	 * that may have been allocated while all that was going on.
+	 */
+	ret = csp_freeze_req_range(req);
+	if (ret)
+		goto out_bitmap;
+
+	ret = csp_grab_free_space(req);
+	if (ret)
+		goto out_bitmap;
+
+	ret = csp_freeze_req_range(req);
+	if (ret)
+		goto out_bitmap;
+
+	/*
+	 * If reflink is enabled, our strategy is to use dedupe to free blocks
+	 * in the area that we're clearing without making any user-visible
+	 * changes to the file contents.  For all the written file data blocks
+	 * in the area we're clearing, make an identical copy in the work file
+	 * backed by blocks that are not in the clearing area.
+	 */
+	if (req->use_reflink) {
+		ret = csp_prepare_for_dedupe(req);
+		if (ret)
+			goto out_bitmap;
+	}
+
+	/* Evacuate as many file blocks as we can. */
+	do {
+		ret = csp_find_target(req, &target);
+		if (ret)
+			goto out_bitmap;
+
+		if (target.length == 0)
+			break;
+
+		trace_target(req,
+	"phys 0x%llx len 0x%llx owners 0x%llx prio 0x%llx",
+				target.start, target.length,
+				target.owners, target.prio);
+
+		if (req->use_reflink)
+			ret = csp_evac_dedupe(req, &target);
+		else
+			ret = csp_evac_exchange(req, &target);
+		if (ret)
+			goto out_bitmap;
+
+		trace_status(req, _("Evacuated %llu file items."),
+				target.evacuated);
+	} while (target.evacuated > 0 || target.try_again);
+
+	if (!req->can_evac_metadata)
+		goto out_bitmap;
+
+	/* Evacuate as many AG metadata blocks as we can. */
+	do {
+		csp_target_metadata(req, &target);
+
+		ret = csp_evac_fs_metadata(req, &target, &cleared_anything);
+		if (ret)
+			goto out_bitmap;
+
+		trace_status(req, _("Evacuated %llu metadata items."),
+				target.evacuated);
+	} while (target.evacuated > 0 && cleared_anything);
+
+out_bitmap:
+	bitmap_free(&req->visited);
+	return ret;
+}
+
+/* How much space did we actually clear? */
+int
+clearspace_efficacy(
+	struct clearspace_req	*req,
+	unsigned long long	*cleared_bytes)
+{
+	unsigned long long	cleared = 0;
+	int			ret;
+
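+	/*
+	 * Blocks captured from the target range are always mapped into the
+	 * space capture file at the file offset that matches their physical
+	 * address, so totalling the data mappings tells us how many bytes
+	 * were cleared.
+	 */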
+	start_bmapx_query(req, 0, req->start, req->length);
+	while ((ret = run_bmapx_query(req, req->space_fd)) > 0) {
+		struct getbmapx	*brec;
+
+		for_each_bmapx_row(req, brec) {
+			if (brec->bmv_block == -1)
+				continue;
+
+			trace_bmapx_rec(req, CSP_TRACE_EFFICACY, brec);
+
+			if (brec->bmv_offset != brec->bmv_block) {
+				fprintf(stderr,
+	_("space capture file mapped incorrectly\n"));
+				end_bmapx_query(req);
+				return -1;
+			}
+			cleared += BBTOB(brec->bmv_length);
+		}
+	}
+	end_bmapx_query(req);
+	if (ret)
+		return ret;
+
+	*cleared_bytes = cleared;
+	return 0;
+}
+
+/*
+ * Create a temporary file on the same volume (data/rt) that we're trying to
+ * clear free space on.
+ */
+static int
+csp_open_tempfile(
+	struct clearspace_req	*req,
+	struct stat		*statbuf)
+{
+	struct fsxattr		fsx;
+	int			fd, ret;
+
+	fd = openat(req->xfd->fd, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
+	if (fd < 0) {
+		perror(_("opening temp file"));
+		return -1;
+	}
+
+	/* Make sure we got the same filesystem as the open file. */
+	ret = fstat(fd, statbuf);
+	if (ret) {
+		perror(_("stat temp file"));
+		goto fail;
+	}
+	if (statbuf->st_dev != req->statbuf.st_dev) {
+		fprintf(stderr,
+	_("Temp file is not on the same filesystem as the open file.\n"));
+		goto fail;
+	}
+
+	/* Ensure this file targets the correct data/rt device. */
+	ret = ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
+	if (ret) {
+		perror(_("FSGETXATTR temp file"));
+		goto fail;
+	}
+
+	if (!!(fsx.fsx_xflags & FS_XFLAG_REALTIME) != req->realtime) {
+		if (req->realtime)
+			fsx.fsx_xflags |= FS_XFLAG_REALTIME;
+		else
+			fsx.fsx_xflags &= ~FS_XFLAG_REALTIME;
+
+		ret = ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
+		if (ret) {
+			perror(_("FSSETXATTR temp file"));
+			goto fail;
+		}
+	}
+
+	trace_setup(req, "opening temp inode 0x%llx as fd %d",
+			(unsigned long long)statbuf->st_ino, fd);
+
+	return fd;
+fail:
+	close(fd);
+	return -1;
+}
+
+/* Attach the open file to the request and extract its filesystem handle. */
+static int
+csp_install_file(
+	struct clearspace_req	*req,
+	struct xfs_fd		*xfd)
+{
+	void			*handle;
+	size_t			handle_sz;
+	int			ret;
+
+	ret = fstat(xfd->fd, &req->statbuf);
+	if (ret)
+		return ret;
+
+	if (!S_ISDIR(req->statbuf.st_mode)) {
+		errno = ENOTDIR;
+		return -1;
+	}
+
+	ret = fd_to_handle(xfd->fd, &handle, &handle_sz);
+	if (ret)
+		return ret;
+
+	ret = handle_to_fshandle(handle, handle_sz, &req->fshandle,
+			&req->fshandle_sz);
+	if (ret)
+		return ret;
+
+	free_handle(handle, handle_sz);
+	req->xfd = xfd;
+	return 0;
+}
+
+/* Decide if we can use online repair to evacuate metadata. */
+static void
+csp_detect_evac_metadata(
+	struct clearspace_req		*req)
+{
+	struct xfs_scrub_metadata	scrub = {
+		.sm_type		= XFS_SCRUB_TYPE_PROBE,
+		.sm_flags		= XFS_SCRUB_IFLAG_REPAIR |
+					  XFS_SCRUB_IFLAG_FORCE_REBUILD,
+	};
+	int				ret;
+
+	ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub);
+	if (ret)
+		return;
+
+	/*
+	 * We'll try to evacuate metadata if the probe works.  This doesn't
+	 * guarantee success; it merely means that the kernel call exists.
+	 */
+	req->can_evac_metadata = true;
+}
+
+/* Detect FALLOC_FL_MAP_FREE_SPACE; this is critical for grabbing free space! */
+static int
+csp_detect_fallocate_map_free(
+	struct clearspace_req	*req)
+{
+	int			ret;
+
+	/*
+	 * A single-byte fallocate request will succeed without doing anything
+	 * to the filesystem.
+	 */
+	ret = fallocate(req->work_fd, FALLOC_FL_MAP_FREE_SPACE, 0, 1);
+	if (!ret)
+		return 0;
+
+	if (errno == EOPNOTSUPP) {
+		fprintf(stderr,
+	_("Filesystem does not support FALLOC_FL_MAP_FREE_SPACE\n"));
+		return -1;
+	}
+
+	perror(_("test FALLOC_FL_MAP_FREE_SPACE on work file"));
+	return -1;
+}
+
+/*
+ * Assemble operation information to clear the physical space in part of a
+ * filesystem.
+ */
+int
+clearspace_init(
+	struct clearspace_req		**reqp,
+	const struct clearspace_init	*attrs)
+{
+	struct clearspace_req		*req;
+	int				ret;
+
+	req = calloc(1, sizeof(struct clearspace_req));
+	if (!req) {
+		perror(_("allocating clearspace request"));
+		return -1;
+	}
+
+	req->work_fd = -1;
+	req->space_fd = -1;
+	req->trace_mask = attrs->trace_mask;
+
+	req->realtime = attrs->is_realtime;
+	req->dev = attrs->dev;
+	req->start = attrs->start;
+	req->length = attrs->length;
+
+	ret = csp_install_file(req, attrs->xfd);
+	if (ret) {
+		perror(attrs->fname);
+		goto fail;
+	}
+
+	csp_detect_evac_metadata(req);
+
+	req->work_fd = csp_open_tempfile(req, &req->temp_statbuf);
+	if (req->work_fd < 0)
+		goto fail;
+
+	req->space_fd = csp_open_tempfile(req, &req->space_statbuf);
+	if (req->space_fd < 0)
+		goto fail;
+
+	ret = csp_detect_fallocate_map_free(req);
+	if (ret)
+		goto fail;
+
+	req->mhead = calloc(1, fsmap_sizeof(QUERY_BATCH_SIZE));
+	if (!req->mhead) {
+		perror(_("allocating fs mapping query"));
+		goto fail;
+	}
+
+	req->rhead = calloc(1, fsrefs_sizeof(QUERY_BATCH_SIZE));
+	if (!req->rhead) {
+		perror(_("allocating refcount query"));
+		goto fail;
+	}
+
+	req->bhead = calloc(QUERY_BATCH_SIZE + 1, sizeof(struct getbmapx));
+	if (!req->bhead) {
+		perror(_("allocating file mapping query"));
+		goto fail;
+	}
+
+	req->buf = malloc(BUFFERCOPY_BUFSZ);
+	if (!req->buf) {
+		perror(_("allocating file copy buffer"));
+		goto fail;
+	}
+
+	req->fdr = calloc(1, sizeof(struct file_dedupe_range) +
+			     sizeof(struct file_dedupe_range_info));
+	if (!req->fdr) {
+		perror(_("allocating dedupe control buffer"));
+		goto fail;
+	}
+
+	req->use_reflink = req->xfd->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_REFLINK;
+
+	*reqp = req;
+	return 0;
+fail:
+	clearspace_free(&req);
+	return -1;
+}
+
+/* Free all resources associated with a space clearing request. */
+int
+clearspace_free(
+	struct clearspace_req	**reqp)
+{
+	struct clearspace_req	*req = *reqp;
+	int			ret = 0;
+
+	if (!req)
+		return 0;
+
+	*reqp = NULL;
+	free(req->fdr);
+	free(req->buf);
+	free(req->bhead);
+	free(req->rhead);
+	free(req->mhead);
+
+	if (req->space_fd >= 0) {
+		ret = close(req->space_fd);
+		if (ret)
+			perror(_("closing space capture file"));
+	}
+
+	if (req->work_fd >= 0) {
+		int	ret2 = close(req->work_fd);
+
+		if (ret2) {
+			perror(_("closing work file"));
+			if (!ret)
+				ret = ret2;
+		}
+	}
+
+	if (req->fshandle)
+		free_handle(req->fshandle, req->fshandle_sz);
+	free(req);
+	return ret;
+}
diff --git a/libfrog/clearspace.h b/libfrog/clearspace.h
new file mode 100644
index 00000000000..4b12c9b0475
--- /dev/null
+++ b/libfrog/clearspace.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBFROG_CLEARSPACE_H__
+#define __LIBFROG_CLEARSPACE_H__
+
+struct clearspace_req;
+
+struct clearspace_init {
+	/* Open file and its pathname */
+	struct xfs_fd		*xfd;
+	const char		*fname;
+
+	/* Which device do we want? */
+	bool			is_realtime;
+	dev_t			dev;
+
+	/* Range of device to clear. */
+	unsigned long long	start;
+	unsigned long long	length;
+
+	unsigned int		trace_mask;
+};
+
+int clearspace_init(struct clearspace_req **reqp,
+		const struct clearspace_init *init);
+int clearspace_free(struct clearspace_req **reqp);
+
+int clearspace_run(struct clearspace_req *req);
+
+int clearspace_efficacy(struct clearspace_req *req,
+		unsigned long long *cleared_bytes);
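+
+/*
+ * Typical calling sequence (error handling elided):
+ *
+ *	struct clearspace_init	attrs = { ... };
+ *	struct clearspace_req	*req;
+ *
+ *	clearspace_init(&req, &attrs);
+ *	clearspace_run(req);
+ *	clearspace_efficacy(req, &cleared);
+ *	clearspace_free(&req);
+ */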
+
+/* Debugging levels */
+
+#define CSP_TRACE_FREEZE	(0x01)
+#define CSP_TRACE_GRAB		(0x02)
+#define CSP_TRACE_FSMAP		(0x04)
+#define CSP_TRACE_FSREFS	(0x08)
+#define CSP_TRACE_BMAPX		(0x10)
+#define CSP_TRACE_PREP		(0x20)
+#define CSP_TRACE_TARGET	(0x40)
+#define CSP_TRACE_DEDUPE	(0x80)
+#define CSP_TRACE_FALLOC	(0x100)
+#define CSP_TRACE_FIEXCHANGE	(0x200)
+#define CSP_TRACE_XREBUILD	(0x400)
+#define CSP_TRACE_EFFICACY	(0x800)
+#define CSP_TRACE_SETUP		(0x1000)
+#define CSP_TRACE_STATUS	(0x2000)
+#define CSP_TRACE_DUMPFILE	(0x4000)
+#define CSP_TRACE_BITMAP	(0x8000)
+
+#define CSP_TRACE_ALL		(CSP_TRACE_FREEZE | \
+				 CSP_TRACE_GRAB | \
+				 CSP_TRACE_FSMAP | \
+				 CSP_TRACE_FSREFS | \
+				 CSP_TRACE_BMAPX | \
+				 CSP_TRACE_PREP	 | \
+				 CSP_TRACE_TARGET | \
+				 CSP_TRACE_DEDUPE | \
+				 CSP_TRACE_FALLOC | \
+				 CSP_TRACE_FIEXCHANGE | \
+				 CSP_TRACE_XREBUILD | \
+				 CSP_TRACE_EFFICACY | \
+				 CSP_TRACE_SETUP | \
+				 CSP_TRACE_STATUS | \
+				 CSP_TRACE_DUMPFILE | \
+				 CSP_TRACE_BITMAP)
+
+#endif /* __LIBFROG_CLEARSPACE_H__ */
diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8
index 837fc497f27..7c11953d16b 100644
--- a/man/man8/xfs_spaceman.8
+++ b/man/man8/xfs_spaceman.8
@@ -25,6 +25,23 @@ then the program exits.
 
 .SH COMMANDS
 .TP
+.BI "clearfree [ \-n nr ] [ \-r ] [ \-v mask ] " start " " length
+Try to clear the specified physical range in the filesystem.
+The
+.B start
+and
+.B length
+arguments must be given in units of bytes.
+If the
+.B \-n
+option is given, run the clearing algorithm this many times.
+If the
+.B \-r
+option is given, clear space on the realtime device instead of the data
+device.
+If the
+.B \-v
+option is given, print each step of the clearing process; the mask
+selects which debugging categories are reported.
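+.PP
+For example,
+.B clearfree 1048576 16777216
+attempts to evacuate the 16MiB of physical space that starts 1MiB into
+the data device.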
+.TP
 .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]"
 With no arguments,
 .B freesp
diff --git a/spaceman/Makefile b/spaceman/Makefile
index 1f048d54a4d..75df4ce86c2 100644
--- a/spaceman/Makefile
+++ b/spaceman/Makefile
@@ -10,8 +10,8 @@ HFILES = init.h space.h
 CFILES = info.c init.c file.c health.c prealloc.c trim.c
 LSRCFILES = xfs_info.sh
 
-LLDLIBS = $(LIBXCMD) $(LIBFROG)
-LTDEPENDENCIES = $(LIBXCMD) $(LIBFROG)
+LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG)
+LTDEPENDENCIES = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG)
 LLDFLAGS = -static
 
 ifeq ($(ENABLE_EDITLINE),yes)
@@ -19,7 +19,7 @@ LLDLIBS += $(LIBEDITLINE) $(LIBTERMCAP)
 endif
 
 ifeq ($(HAVE_GETFSMAP),yes)
-CFILES += freesp.c
+CFILES += freesp.c clearfree.c
 endif
 
 default: depend $(LTCOMMAND)
diff --git a/spaceman/clearfree.c b/spaceman/clearfree.c
new file mode 100644
index 00000000000..e944a07b887
--- /dev/null
+++ b/spaceman/clearfree.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "platform_defs.h"
+#include "command.h"
+#include "init.h"
+#include "libfrog/paths.h"
+#include "input.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/clearspace.h"
+#include "handle.h"
+#include "space.h"
+
+static void
+clearfree_help(void)
+{
+	printf(_(
+"Evacuate the contents of the given range of physical storage in the filesystem"
+"\n"
+" -n -- Run the space clearing algorithm this many times.\n"
+" -r -- clear space on the realtime device.\n"
+" -v -- verbosity level, or \"all\" to print everything.\n"
+"\n"
+"The start and length arguments are required, and must be specified in units\n"
+"of bytes.\n"
+"\n"));
+}
+
+static int
+clearfree_f(
+	int			argc,
+	char			**argv)
+{
+	struct clearspace_init	attrs = {
+		.xfd		= &file->xfd,
+		.fname		= file->name,
+	};
+	struct clearspace_req	*req = NULL;
+	unsigned long long	cleared;
+	unsigned long		arg;
+	long long		lnum;
+	unsigned int		i, nr = 1;
+	int			c, ret;
+
+	while ((c = getopt(argc, argv, "n:rv:")) != EOF) {
+		switch (c) {
+		case 'n':
+			errno = 0;
+			arg = strtoul(optarg, NULL, 0);
+			if (errno) {
+				perror(optarg);
+				return 1;
+			}
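+			/* Clamp runs to what fits in an unsigned int. */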
+			if (arg > UINT_MAX)
+				arg = UINT_MAX;
+			nr = arg;
+			break;
+		case 'r':	/* rt device */
+			attrs.is_realtime = true;
+			break;
+		case 'v':	/* Verbose output */
+			if (!strcmp(optarg, "all")) {
+				attrs.trace_mask = CSP_TRACE_ALL;
+			} else {
+				errno = 0;
+				attrs.trace_mask = strtoul(optarg, NULL, 0);
+				if (errno) {
+					perror(optarg);
+					return 1;
+				}
+			}
+			break;
+		default:
+			exitcode = 1;
+			clearfree_help();
+			return 0;
+		}
+	}
+
+	if (attrs.trace_mask)
+		attrs.trace_mask |= CSP_TRACE_STATUS;
+
+	if (argc != optind + 2) {
+		clearfree_help();
+		goto fail;
+	}
+
+	if (attrs.is_realtime) {
+		if (file->xfd.fsgeom.rtblocks == 0) {
+			fprintf(stderr, _("No realtime volume present.\n"));
+			goto fail;
+		}
+		attrs.dev = file->fs_path.fs_rtdev;
+	} else {
+		attrs.dev = file->fs_path.fs_datadev;
+	}
+
+	lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize,
+			argv[optind]);
+	if (lnum < 0) {
+		fprintf(stderr, _("Bad clearfree start %s.\n"),
+				argv[optind]);
+		goto fail;
+	}
+	attrs.start = lnum;
+
+	lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize,
+			argv[optind + 1]);
+	if (lnum < 0) {
+		fprintf(stderr, _("Bad clearfree length %s.\n"),
+				argv[optind + 1]);
+		goto fail;
+	}
+	attrs.length = lnum;
+
+	ret = clearspace_init(&req, &attrs);
+	if (ret)
+		goto fail;
+
+	for (i = 0; i < nr; i++) {
+		ret = clearspace_run(req);
+		if (ret)
+			goto fail;
+	}
+
+	ret = clearspace_efficacy(req, &cleared);
+	if (ret)
+		goto fail;
+
+	printf(_("Cleared 0x%llx bytes (%.1f%%) from 0x%llx to 0x%llx.\n"),
+			cleared, 100.0 * cleared / attrs.length, attrs.start,
+			attrs.start + attrs.length);
+
+	ret = clearspace_free(&req);
+	if (ret)
+		goto fail;
+
+	fshandle_destroy();
+	return 0;
+fail:
+	fshandle_destroy();
+	exitcode = 1;
+	return 1;
+}
+
+static struct cmdinfo clearfree_cmd = {
+	.name		= "clearfree",
+	.cfunc		= clearfree_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.flags		= CMD_FLAG_ONESHOT,
+	.args		= "[-n runs] [-r] [-v mask] start length",
+	.help		= clearfree_help,
+};
+
+void
+clearfree_init(void)
+{
+	clearfree_cmd.oneline = _("clear free space in the filesystem");
+
+	add_command(&clearfree_cmd);
+}
diff --git a/spaceman/init.c b/spaceman/init.c
index cf1ff3cbb0e..bce62dec47f 100644
--- a/spaceman/init.c
+++ b/spaceman/init.c
@@ -35,6 +35,7 @@ init_commands(void)
 	trim_init();
 	freesp_init();
 	health_init();
+	clearfree_init();
 }
 
 static int
diff --git a/spaceman/space.h b/spaceman/space.h
index 723209edd99..be4a7426ebf 100644
--- a/spaceman/space.h
+++ b/spaceman/space.h
@@ -28,8 +28,10 @@ extern void	quit_init(void);
 extern void	trim_init(void);
 #ifdef HAVE_GETFSMAP
 extern void	freesp_init(void);
+extern void	clearfree_init(void);
 #else
 # define freesp_init()	do { } while (0)
+# define clearfree_init()	do { } while (0)
 #endif
 extern void	info_init(void);
 extern void	health_init(void);

