From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: Kent Overstreet <kent.overstreet@linux.dev>,
	Dave Chinner <dchinner@redhat.com>,
	linux-xfs@vger.kernel.org, willy@infradead.org,
	linux-fsdevel@vger.kernel.org
Subject: [PATCH 2/7] xfs: enable sorting of xfile-backed arrays
Date: Thu, 27 Jul 2023 15:25:50 -0700	[thread overview]
Message-ID: <169049623600.921478.515800900911021005.stgit@frogsfrogsfrogs> (raw)
In-Reply-To: <169049623563.921478.13811535720302490179.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

The btree bulk loading code requires that records be provided in the
correct record sort order for the given btree type.  In general, repair
code cannot be required to collect records in order, and it is not
feasible to insert new records in the middle of an array to maintain
sort order.

Implement a sorting algorithm so that we can sort the records just prior
to bulk loading.  In principle, an xfarray could consume many gigabytes
of memory and its backing pages can be sent out to disk at any time.
This means that we cannot map the entire array into memory at once, so
we must find a way to divide the work into smaller portions (e.g. a
page) that /can/ be mapped into memory.
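
From the caller's perspective, none of that paging is visible: the new
interface is a single call that sorts the whole array given a
comparison function.  A minimal sketch of the intended call pattern
follows; struct xrep_rec and xrep_rec_cmp are hypothetical names used
only for illustration, while xfarray_append(), xfarray_sort(), and
XFARRAY_SORT_KILLABLE come from this patchset:

	/* Hypothetical record and comparator, for illustration only. */
	struct xrep_rec {
		uint64_t		key;
	};

	static int
	xrep_rec_cmp(
		const void		*a,
		const void		*b)
	{
		const struct xrep_rec	*ra = a;
		const struct xrep_rec	*rb = b;

		if (ra->key < rb->key)
			return -1;
		if (ra->key > rb->key)
			return 1;
		return 0;
	}

	/* Append records in whatever order the scan finds them... */
	error = xfarray_append(array, &rec);

	/* ...then sort everything once, just before bulk loading. */
	error = xfarray_sort(array, xrep_rec_cmp, XFARRAY_SORT_KILLABLE);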

Quicksort seems like a reasonable fit for this purpose, since its
divide and conquer strategy keeps the average runtime at O(n log n)
and the recursion depth logarithmic.  The solution presented here is a
port of the glibc implementation, which itself is derived from the
median-of-three and tail call recursion strategies outlined by
Sedgewick.
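
As a toy model of those two strategies (ordinary user-space C on a
plain in-memory int array; this is not the code in this patch),
consider the sketch below.  The xfarray version in this patch has the
same shape, except that every a[x] access becomes a load or store
through the xfile, the stack is sized from ilog2(nr) at allocation
time, and small partitions are handed to an insertion sort.

	/*
	 * Toy model only: median-of-three pivot selection plus an
	 * explicit partition stack, with the smaller partition always
	 * sorted first so that stack use stays near log2(n) entries.
	 * Signed indices keep "hi = j" sane when j walks past index 0.
	 */
	static void swap_int(int *x, int *y)
	{
		int t = *x;

		*x = *y;
		*y = t;
	}

	static void quicksort_sketch(int *a, long n)
	{
		struct { long lo, hi; } stack[64];
		int top = 0;

		if (n < 2)
			return;
		stack[0].lo = 0;
		stack[0].hi = n - 1;

		while (top >= 0) {
			long lo = stack[top].lo;
			long hi = stack[top].hi;
			long mid = lo + (hi - lo) / 2;
			long i = lo, j = hi;
			int pivot;

			top--;
			if (lo >= hi)
				continue;

			/* Median of three: a[lo] <= a[mid] <= a[hi]. */
			if (a[mid] < a[lo])
				swap_int(&a[mid], &a[lo]);
			if (a[hi] < a[mid]) {
				swap_int(&a[hi], &a[mid]);
				if (a[mid] < a[lo])
					swap_int(&a[mid], &a[lo]);
			}
			pivot = a[mid];

			/* Partition around the pivot value. */
			while (i <= j) {
				while (a[i] < pivot)
					i++;
				while (a[j] > pivot)
					j--;
				if (i <= j)
					swap_int(&a[i++], &a[j--]);
			}

			/* Push the larger half first; smaller sorts next. */
			if (j - lo >= hi - i) {
				stack[++top].lo = lo;
				stack[top].hi = j;
				stack[++top].lo = i;
				stack[top].hi = hi;
			} else {
				stack[++top].lo = i;
				stack[top].hi = hi;
				stack[++top].lo = lo;
				stack[top].hi = j;
			}
		}
	}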

Subsequent patches will optimize the implementation further by using
the kernel's heapsort on directly-mapped memory whenever possible, and
by improving the quicksort pivot selection algorithm to try to avoid
O(n^2) collapses.
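
Until then, the sort never touches array contents through long-lived
pointers; every record access bounces through a small buffer via the
load/store helpers added in this patch.  As a sketch, a swap of two
records would look like this (the swap helper itself is illustrative
and not part of the patch; a and b are scratch buffers of
array->obj_size bytes each):

	static int
	xfarray_sort_swap(
		struct xfarray_sortinfo	*si,
		xfarray_idx_t		i,
		xfarray_idx_t		j,
		void			*a,
		void			*b)
	{
		int			error;

		error = xfarray_sort_load(si, i, a);
		if (error)
			return error;
		error = xfarray_sort_load(si, j, b);
		if (error)
			return error;
		error = xfarray_sort_store(si, i, b);
		if (error)
			return error;
		return xfarray_sort_store(si, j, a);
	}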

Note: The sorting functionality gets its own patch because the basic big
array mechanisms were plenty for a single code patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/trace.h   |  114 ++++++++++
 fs/xfs/scrub/xfarray.c |  569 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/xfarray.h |   67 ++++++
 3 files changed, 750 insertions(+)


diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 0b9e781840f37..2fbee6389e2a0 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -18,6 +18,7 @@
 
 struct xfile;
 struct xfarray;
+struct xfarray_sortinfo;
 
 /*
  * ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -846,6 +847,119 @@ TRACE_EVENT(xfarray_create,
 		  __entry->obj_size_log)
 );
 
+TRACE_EVENT(xfarray_isort,
+	TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+	TP_ARGS(si, lo, hi),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long long, lo)
+		__field(unsigned long long, hi)
+	),
+	TP_fast_assign(
+		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
+		__entry->lo = lo;
+		__entry->hi = hi;
+	),
+	TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+		  __entry->ino,
+		  __entry->lo,
+		  __entry->hi,
+		  __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_qsort,
+	TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+	TP_ARGS(si, lo, hi),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long long, lo)
+		__field(unsigned long long, hi)
+		__field(int, stack_depth)
+		__field(int, max_stack_depth)
+	),
+	TP_fast_assign(
+		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
+		__entry->lo = lo;
+		__entry->hi = hi;
+		__entry->stack_depth = si->stack_depth;
+		__entry->max_stack_depth = si->max_stack_depth;
+	),
+	TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d",
+		  __entry->ino,
+		  __entry->lo,
+		  __entry->hi,
+		  __entry->hi - __entry->lo,
+		  __entry->stack_depth,
+		  __entry->max_stack_depth)
+);
+
+TRACE_EVENT(xfarray_sort,
+	TP_PROTO(struct xfarray_sortinfo *si, size_t bytes),
+	TP_ARGS(si, bytes),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long long, nr)
+		__field(size_t, obj_size)
+		__field(size_t, bytes)
+		__field(unsigned int, max_stack_depth)
+	),
+	TP_fast_assign(
+		__entry->nr = si->array->nr;
+		__entry->obj_size = si->array->obj_size;
+		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
+		__entry->bytes = bytes;
+		__entry->max_stack_depth = si->max_stack_depth;
+	),
+	TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu",
+		  __entry->ino,
+		  __entry->nr,
+		  __entry->obj_size,
+		  __entry->max_stack_depth,
+		  __entry->bytes)
+);
+
+TRACE_EVENT(xfarray_sort_stats,
+	TP_PROTO(struct xfarray_sortinfo *si, int error),
+	TP_ARGS(si, error),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+#ifdef DEBUG
+		__field(unsigned long long, loads)
+		__field(unsigned long long, stores)
+		__field(unsigned long long, compares)
+#endif
+		__field(unsigned int, max_stack_depth)
+		__field(unsigned int, max_stack_used)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
+#ifdef DEBUG
+		__entry->loads = si->loads;
+		__entry->stores = si->stores;
+		__entry->compares = si->compares;
+#endif
+		__entry->max_stack_depth = si->max_stack_depth;
+		__entry->max_stack_used = si->max_stack_used;
+		__entry->error = error;
+	),
+	TP_printk(
+#ifdef DEBUG
+		  "xfino 0x%lx loads %llu stores %llu compares %llu stack_depth %u/%u error %d",
+#else
+		  "xfino 0x%lx stack_depth %u/%u error %d",
+#endif
+		  __entry->ino,
+#ifdef DEBUG
+		  __entry->loads,
+		  __entry->stores,
+		  __entry->compares,
+#endif
+		  __entry->max_stack_used,
+		  __entry->max_stack_depth,
+		  __entry->error)
+);
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index ca4a4a307010f..226488d85d6d6 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -367,3 +367,572 @@ xfarray_load_next(
 	*idx = cur;
 	return 0;
 }
+
+/* Sorting functions */
+
+#ifdef DEBUG
+# define xfarray_sort_bump_loads(si)	do { (si)->loads++; } while (0)
+# define xfarray_sort_bump_stores(si)	do { (si)->stores++; } while (0)
+# define xfarray_sort_bump_compares(si)	do { (si)->compares++; } while (0)
+#else
+# define xfarray_sort_bump_loads(si)
+# define xfarray_sort_bump_stores(si)
+# define xfarray_sort_bump_compares(si)
+#endif /* DEBUG */
+
+/* Load an array element for sorting. */
+static inline int
+xfarray_sort_load(
+	struct xfarray_sortinfo	*si,
+	xfarray_idx_t		idx,
+	void			*ptr)
+{
+	xfarray_sort_bump_loads(si);
+	return xfarray_load(si->array, idx, ptr);
+}
+
+/* Store an array element for sorting. */
+static inline int
+xfarray_sort_store(
+	struct xfarray_sortinfo	*si,
+	xfarray_idx_t		idx,
+	void			*ptr)
+{
+	xfarray_sort_bump_stores(si);
+	return xfarray_store(si->array, idx, ptr);
+}
+
+/* Compare an array element for sorting. */
+static inline int
+xfarray_sort_cmp(
+	struct xfarray_sortinfo	*si,
+	const void		*a,
+	const void		*b)
+{
+	xfarray_sort_bump_compares(si);
+	return si->cmp_fn(a, b);
+}
+
+/* Return a pointer to the low index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
+{
+	return (xfarray_idx_t *)(si + 1);
+}
+
+/* Return a pointer to the high index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
+{
+	return xfarray_sortinfo_lo(si) + si->max_stack_depth;
+}
+
+/* Allocate memory to handle the sort. */
+static inline int
+xfarray_sortinfo_alloc(
+	struct xfarray		*array,
+	xfarray_cmp_fn		cmp_fn,
+	unsigned int		flags,
+	struct xfarray_sortinfo	**infop)
+{
+	struct xfarray_sortinfo	*si;
+	size_t			nr_bytes = sizeof(struct xfarray_sortinfo);
+	int			max_stack_depth;
+
+	/*
+	 * Tail-call recursion during the partitioning phase means that
+	 * quicksort will never recurse more than log2(nr) times.  We need one
+	 * extra level of stack to hold the initial parameters.
+	 */
+	max_stack_depth = ilog2(array->nr) + 1;
+
+	/* Each level of quicksort uses a lo and a hi index */
+	nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
+
+	/* One record for the pivot */
+	nr_bytes += array->obj_size;
+
+	si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
+	if (!si)
+		return -ENOMEM;
+
+	si->array = array;
+	si->cmp_fn = cmp_fn;
+	si->flags = flags;
+	si->max_stack_depth = max_stack_depth;
+	si->max_stack_used = 1;
+
+	xfarray_sortinfo_lo(si)[0] = 0;
+	xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+
+	trace_xfarray_sort(si, nr_bytes);
+	*infop = si;
+	return 0;
+}
+
+/* Should this sort be terminated by a fatal signal? */
+static inline bool
+xfarray_sort_terminated(
+	struct xfarray_sortinfo	*si,
+	int			*error)
+{
+	/*
+	 * If preemption is disabled, we need to yield to the scheduler every
+	 * few seconds so that we don't run afoul of the soft lockup watchdog
+	 * or RCU stall detector.
+	 */
+	cond_resched();
+
+	if ((si->flags & XFARRAY_SORT_KILLABLE) &&
+	    fatal_signal_pending(current)) {
+		if (*error == 0)
+			*error = -EINTR;
+		return true;
+	}
+	return false;
+}
+
+/* Do we want an insertion sort? */
+static inline bool
+xfarray_want_isort(
+	struct xfarray_sortinfo *si,
+	xfarray_idx_t		start,
+	xfarray_idx_t		end)
+{
+	/*
+	 * For array subsets of 8 elements or fewer, it's slightly faster to
+	 * use insertion sort than quicksort's stack machine.
+	 */
+	return (end - start) < 8;
+}
+
+/* Return the scratch space within the sortinfo structure. */
+static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
+{
+	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/*
+ * Perform an insertion sort on a subset of the array.
+ * Though insertion sort is an O(n^2) algorithm, for small set sizes it's
+ * faster than quicksort's stack machine, so we let it handle those subsets.
+ * This ought to be replaced with something more efficient.
+ */
+STATIC int
+xfarray_isort(
+	struct xfarray_sortinfo	*si,
+	xfarray_idx_t		lo,
+	xfarray_idx_t		hi)
+{
+	void			*a = xfarray_sortinfo_isort_scratch(si);
+	void			*b = xfarray_scratch(si->array);
+	xfarray_idx_t		tmp;
+	xfarray_idx_t		i;
+	xfarray_idx_t		run;
+	int			error;
+
+	trace_xfarray_isort(si, lo, hi);
+
+	/*
+	 * Move the smallest element in a[lo..hi] to a[lo].  This
+	 * simplifies the loop control logic below.
+	 */
+	tmp = lo;
+	error = xfarray_sort_load(si, tmp, b);
+	if (error)
+		return error;
+	for (run = lo + 1; run <= hi; run++) {
+		/* if a[run] < a[tmp], tmp = run */
+		error = xfarray_sort_load(si, run, a);
+		if (error)
+			return error;
+		if (xfarray_sort_cmp(si, a, b) < 0) {
+			tmp = run;
+			memcpy(b, a, si->array->obj_size);
+		}
+
+		if (xfarray_sort_terminated(si, &error))
+			return error;
+	}
+
+	/*
+	 * The smallest element is a[tmp]; swap with a[lo] if tmp != lo.
+	 * Recall that a[tmp] is already in *b.
+	 */
+	if (tmp != lo) {
+		error = xfarray_sort_load(si, lo, a);
+		if (error)
+			return error;
+		error = xfarray_sort_store(si, tmp, a);
+		if (error)
+			return error;
+		error = xfarray_sort_store(si, lo, b);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Perform an insertion sort on a[lo+1..hi].  We already made sure
+	 * that the smallest value in the original range is now in a[lo],
+	 * so the inner loop should never underflow.
+	 *
+	 * For each a[lo+2..hi], make sure it's in the correct position
+	 * with respect to the elements that came before it.
+	 */
+	for (run = lo + 2; run <= hi; run++) {
+		error = xfarray_sort_load(si, run, a);
+		if (error)
+			return error;
+
+		/*
+		 * Find the correct place for a[run] by walking leftwards
+		 * towards the start of the range until a[tmp] is no longer
+		 * greater than a[run].
+		 */
+		tmp = run - 1;
+		error = xfarray_sort_load(si, tmp, b);
+		if (error)
+			return error;
+		while (xfarray_sort_cmp(si, a, b) < 0) {
+			tmp--;
+			error = xfarray_sort_load(si, tmp, b);
+			if (error)
+				return error;
+
+			if (xfarray_sort_terminated(si, &error))
+				return error;
+		}
+		tmp++;
+
+		/*
+		 * If tmp != run, then a[tmp..run-1] are all less than a[run],
+		 * so right barrel roll a[tmp..run] to get this range in
+		 * sorted order.
+		 */
+		if (tmp == run)
+			continue;
+
+		for (i = run; i >= tmp; i--) {
+			error = xfarray_sort_load(si, i - 1, b);
+			if (error)
+				return error;
+			error = xfarray_sort_store(si, i, b);
+			if (error)
+				return error;
+
+			if (xfarray_sort_terminated(si, &error))
+				return error;
+		}
+		error = xfarray_sort_store(si, tmp, a);
+		if (error)
+			return error;
+
+		if (xfarray_sort_terminated(si, &error))
+			return error;
+	}
+
+	return 0;
+}
+
+/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
+static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
+{
+	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/*
+ * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
+ * the cached pivot record for the next step.
+ *
+ * Select the median value from a[lo], a[mid], and a[hi].  Put the median in
+ * a[lo], the lowest in a[mid], and the highest in a[hi].  Using the median of
+ * the three reduces the chances that we pick the worst case pivot value, since
+ * it's likely that our array values are nearly sorted.
+ */
+STATIC int
+xfarray_qsort_pivot(
+	struct xfarray_sortinfo	*si,
+	xfarray_idx_t		lo,
+	xfarray_idx_t		hi)
+{
+	void			*a = xfarray_sortinfo_pivot(si);
+	void			*b = xfarray_scratch(si->array);
+	xfarray_idx_t		mid = lo + ((hi - lo) / 2);
+	int			error;
+
+	/* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+	error = xfarray_sort_load(si, mid, a);
+	if (error)
+		return error;
+	error = xfarray_sort_load(si, lo, b);
+	if (error)
+		return error;
+	if (xfarray_sort_cmp(si, a, b) < 0) {
+		error = xfarray_sort_store(si, lo, a);
+		if (error)
+			return error;
+		error = xfarray_sort_store(si, mid, b);
+		if (error)
+			return error;
+	}
+
+	/* if a[hi] < a[mid], swap a[mid] and a[hi]. */
+	error = xfarray_sort_load(si, hi, a);
+	if (error)
+		return error;
+	error = xfarray_sort_load(si, mid, b);
+	if (error)
+		return error;
+	if (xfarray_sort_cmp(si, a, b) < 0) {
+		error = xfarray_sort_store(si, mid, a);
+		if (error)
+			return error;
+		error = xfarray_sort_store(si, hi, b);
+		if (error)
+			return error;
+	} else {
+		goto move_front;
+	}
+
+	/* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+	error = xfarray_sort_load(si, mid, a);
+	if (error)
+		return error;
+	error = xfarray_sort_load(si, lo, b);
+	if (error)
+		return error;
+	if (xfarray_sort_cmp(si, a, b) < 0) {
+		error = xfarray_sort_store(si, lo, a);
+		if (error)
+			return error;
+		error = xfarray_sort_store(si, mid, b);
+		if (error)
+			return error;
+	}
+
+move_front:
+	/*
+	 * Move our selected pivot to a[lo].  Recall that a == si->pivot, so
+	 * this leaves us with the pivot cached in the sortinfo structure.
+	 */
+	error = xfarray_sort_load(si, lo, b);
+	if (error)
+		return error;
+	error = xfarray_sort_load(si, mid, a);
+	if (error)
+		return error;
+	error = xfarray_sort_store(si, mid, b);
+	if (error)
+		return error;
+	return xfarray_sort_store(si, lo, a);
+}
+
+/*
+ * Set up the pointers for the next iteration.  We push onto the stack all of
+ * the unsorted values between a[lo + 1] and a[si_hi[depth]], and we tweak the
+ * current stack frame to point to the unsorted values between a[si_lo[depth]]
+ * and a[lo - 1] so that those values will be sorted when we pop the stack.
+ */
+static inline int
+xfarray_qsort_push(
+	struct xfarray_sortinfo	*si,
+	xfarray_idx_t		*si_lo,
+	xfarray_idx_t		*si_hi,
+	xfarray_idx_t		lo,
+	xfarray_idx_t		hi)
+{
+	/* Check for stack overflows */
+	if (si->stack_depth >= si->max_stack_depth - 1) {
+		ASSERT(si->stack_depth < si->max_stack_depth - 1);
+		return -EFSCORRUPTED;
+	}
+
+	si->max_stack_used = max_t(uint8_t, si->max_stack_used,
+					    si->stack_depth + 2);
+
+	si_lo[si->stack_depth + 1] = lo + 1;
+	si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
+	si_hi[si->stack_depth++] = lo - 1;
+
+	/*
+	 * Always start with the smaller of the two partitions to keep the
+	 * amount of recursion in check.
+	 */
+	if (si_hi[si->stack_depth]     - si_lo[si->stack_depth] >
+	    si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
+		swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
+		swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
+	}
+
+	return 0;
+}
+
+/*
+ * Sort the array elements via quicksort.  This implementation incorporates
+ * four optimizations discussed in Sedgewick:
+ *
+ * 1. Use an explicit stack of array indices to store the next array partition
+ *    to sort.  This helps us to avoid recursion in the call stack, which is
+ *    particularly expensive in the kernel.
+ *
+ * 2. For arrays with records in arbitrary or user-controlled order, choose the
+ *    pivot element using a median-of-three decision tree.  This reduces the
+ *    probability of selecting a bad pivot value which causes worst case
+ *    behavior (i.e. partition sizes of 1).
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ *    the next level of recursion, and the larger sub-partition replaces the
+ *    current stack frame.  This guarantees that we won't need more than
+ *    log2(nr) stack space.
+ *
+ * 4. Use insertion sort for small sets, since insertion sort is faster
+ *    for small, mostly sorted array segments.  In the author's experience,
+ *    substituting insertion sort for subsets of 8 or fewer elements yields
+ *    a ~10% reduction in runtime.
+ */
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS		(1ULL << 63)
+
+int
+xfarray_sort(
+	struct xfarray		*array,
+	xfarray_cmp_fn		cmp_fn,
+	unsigned int		flags)
+{
+	struct xfarray_sortinfo	*si;
+	xfarray_idx_t		*si_lo, *si_hi;
+	void			*pivot;
+	void			*scratch = xfarray_scratch(array);
+	xfarray_idx_t		lo, hi;
+	int			error = 0;
+
+	if (array->nr < 2)
+		return 0;
+	if (array->nr >= QSORT_MAX_RECS)
+		return -E2BIG;
+
+	error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
+	if (error)
+		return error;
+	si_lo = xfarray_sortinfo_lo(si);
+	si_hi = xfarray_sortinfo_hi(si);
+	pivot = xfarray_sortinfo_pivot(si);
+
+	while (si->stack_depth >= 0) {
+		lo = si_lo[si->stack_depth];
+		hi = si_hi[si->stack_depth];
+
+		trace_xfarray_qsort(si, lo, hi);
+
+		/* Nothing left in this partition to sort; pop stack. */
+		if (lo >= hi) {
+			si->stack_depth--;
+			continue;
+		}
+
+		/* If insertion sort can solve our problems, we're done. */
+		if (xfarray_want_isort(si, lo, hi)) {
+			error = xfarray_isort(si, lo, hi);
+			if (error)
+				goto out_free;
+			si->stack_depth--;
+			continue;
+		}
+
+		/* Pick a pivot, move it to a[lo] and stash it. */
+		error = xfarray_qsort_pivot(si, lo, hi);
+		if (error)
+			goto out_free;
+
+		/*
+		 * Rearrange a[lo..hi] such that everything smaller than the
+		 * pivot is on the left side of the range and everything larger
+		 * than the pivot is on the right side of the range.
+		 */
+		while (lo < hi) {
+			/*
+			 * Decrement hi until it finds an a[hi] less than the
+			 * pivot value.
+			 */
+			error = xfarray_sort_load(si, hi, scratch);
+			if (error)
+				goto out_free;
+			while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
+								lo < hi) {
+				if (xfarray_sort_terminated(si, &error))
+					goto out_free;
+
+				hi--;
+				error = xfarray_sort_load(si, hi, scratch);
+				if (error)
+					goto out_free;
+			}
+
+			if (xfarray_sort_terminated(si, &error))
+				goto out_free;
+
+			/* Copy that item (a[hi]) to a[lo]. */
+			if (lo < hi) {
+				error = xfarray_sort_store(si, lo++, scratch);
+				if (error)
+					goto out_free;
+			}
+
+			/*
+			 * Increment lo until it finds an a[lo] greater than
+			 * the pivot value.
+			 */
+			error = xfarray_sort_load(si, lo, scratch);
+			if (error)
+				goto out_free;
+			while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
+								lo < hi) {
+				if (xfarray_sort_terminated(si, &error))
+					goto out_free;
+
+				lo++;
+				error = xfarray_sort_load(si, lo, scratch);
+				if (error)
+					goto out_free;
+			}
+
+			if (xfarray_sort_terminated(si, &error))
+				goto out_free;
+
+			/* Copy that item (a[lo]) to a[hi]. */
+			if (lo < hi) {
+				error = xfarray_sort_store(si, hi--, scratch);
+				if (error)
+					goto out_free;
+			}
+
+			if (xfarray_sort_terminated(si, &error))
+				goto out_free;
+		}
+
+		/*
+		 * Put our pivot value in the correct place at a[lo].  All
+		 * values from the start of this partition to a[lo - 1] should
+		 * be less than the pivot; and all values from a[lo + 1] to the
+		 * end of this partition should be greater than the pivot.
+		 */
+		error = xfarray_sort_store(si, lo, pivot);
+		if (error)
+			goto out_free;
+
+		/* Set up the stack frame to process the two partitions. */
+		error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
+		if (error)
+			goto out_free;
+
+		if (xfarray_sort_terminated(si, &error))
+			goto out_free;
+	}
+
+out_free:
+	trace_xfarray_sort_stats(si, error);
+	kvfree(si);
+	return error;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 3ef7911b104b8..86c09897a4126 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -54,4 +54,71 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr)
 uint64_t xfarray_length(struct xfarray *array);
 int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
 
+/* Declarations for xfile array sort functionality. */
+
+typedef cmp_func_t xfarray_cmp_fn;
+
+struct xfarray_sortinfo {
+	struct xfarray		*array;
+
+	/* Comparison function for the sort. */
+	xfarray_cmp_fn		cmp_fn;
+
+	/* Maximum height of the partition stack. */
+	uint8_t			max_stack_depth;
+
+	/* Current height of the partition stack. */
+	int8_t			stack_depth;
+
+	/* Maximum stack depth ever used. */
+	uint8_t			max_stack_used;
+
+	/* XFARRAY_SORT_* flags; see below. */
+	unsigned int		flags;
+
+#ifdef DEBUG
+	/* Performance statistics. */
+	uint64_t		loads;
+	uint64_t		stores;
+	uint64_t		compares;
+#endif
+
+	/*
+	 * Extra bytes are allocated beyond the end of the structure to store
+	 * quicksort information.  C does not permit multiple VLAs per struct,
+	 * so we document all of this in a comment.
+	 *
+	 * Pretend that we have a typedef for array records:
+	 *
+	 * typedef char[array->obj_size]	xfarray_rec_t;
+	 *
+	 * First comes the quicksort partition stack:
+	 *
+	 * xfarray_idx_t	lo[max_stack_depth];
+	 * xfarray_idx_t	hi[max_stack_depth];
+	 *
+	 * union {
+	 *
+	 * If for a given subset we decide to use an insertion sort, we use the
+	 * scratchpad record after the xfarray and a second scratchpad record
+	 * here to compare items:
+	 *
+	 * 	xfarray_rec_t	scratch;
+	 *
+	 * Otherwise, we are partitioning the records for quicksort.
+	 * We store the chosen pivot record here and use the xfarray scratchpad
+	 * to rearrange the array around the pivot:
+	 *
+	 * 	xfarray_rec_t	pivot;
+	 *
+	 * }
+	 */
+};
+
+/* Sort can be interrupted by a fatal signal. */
+#define XFARRAY_SORT_KILLABLE	(1U << 0)
+
+int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn,
+		unsigned int flags);
+
 #endif /* __XFS_SCRUB_XFARRAY_H__ */

