linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/5] lib/sort: Make swap functions more generic
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
@ 2019-02-21  6:30 ` George Spelvin
  2019-02-21  8:21 ` [PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-02-21  6:30 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Rather than having special-case swap functions for 4- and 8-byte objects,
special-case aligned multiples of 4 or 8 bytes.  This speeds up most
users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)

The functions also got renamed to reflect the fact that they support
multiple words.  In the great tradition of bikeshedding, the names were
by far the most contentious issue during review of this patch series.

x86-64 code size 872 -> 886 bytes (+14)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Feedback-from: Geert Uytterhoeven <geert@linux-m68k.org>
---
 lib/sort.c | 135 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 105 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..ec79eac85e21 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,108 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
 }
 
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, int size)
 {
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-	char t;
+	size_t n = (unsigned int)size;
 
 	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, int size)
+{
+	size_t n = (unsigned int)size;
+
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, int size)
+{
+	size_t n = (unsigned int)size;
+
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
 }
 
 /**
@@ -50,8 +123,10 @@ static void generic_swap(void *a, void *b, int size)
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
@@ -67,12 +142,12 @@ void sort(void *base, size_t num, size_t size,
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
 	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
+		if (is_aligned(base, size, 8))
+			swap_func = swap_words_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = swap_words_32;
 		else
-			swap_func = generic_swap;
+			swap_func = swap_bytes;
 	}
 
 	/* heapify */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-02-21  6:30 ` [PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
  2019-02-21  8:21 ` [PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
@ 2019-02-21  8:21 ` George Spelvin
  2019-03-05  3:06 ` [PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-02-21  8:21 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

This uses fewer comparisons than the previous code (approaching half
as many for large random inputs), but produces identical results;
it actually performs the exact same series of swap operations.

Specifically, it reduces the average number of compares from
2*n*log2(n) - 3*n + o(n)  to  n*log2(n) + 0.37*n + o(n).

This is still 1.63*n worse than glibc qsort() which manages
n*log2(n) - 1.26*n, but at least the leading coefficient is correct.

Standard heapsort, when sifting down, performs two comparisons
per level: one to find the greater child, and a second to see
if the current node should be exchanged with that child.

Bottom-up heapsort observes that it's better to postpone the second
comparison and search for the leaf where -infinity would be sent
to, then search back *up* for the current node's destination.

Since sifting down usually proceeds to the leaf level (that's where
half the nodes are), this does O(1) second comparisons rather
than log2(n).  That saves a lot of (expensive since Spectre)
indirect function calls.

The one time it's worse than the previous code is if there are
large numbers of duplicate keys, when the top-down algorithm is
O(n) and bottom-up is O(n log n).  For distinct keys, it's provably
always better, doing 1.5*n*log2(n) + O(n) in the worst case.

(The code is not significantly more complex.  This patch also
merges the heap-building and -extracting sift-down loops,
resulting in a net code size savings.)

x86-64 code size 885 -> 767 bytes (-118)

(I see the checkpatch complaint about "else if (n -= size)".
The alternative is significantly uglier.)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
---
 lib/sort.c | 110 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index ec79eac85e21..0d24d0c5c0fc 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
  *
- * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -15,7 +20,7 @@
  * is_aligned - is this pointer & size okay for word-wide copying?
  * @base: pointer to data
  * @size: size of each element
- * @align: required aignment (typically 4 or 8)
+ * @align: required alignment (typically 4 or 8)
  *
  * Returns true if elements can be copied using word loads and stores.
  * The size must be a multiple of the alignment, and the base address must
@@ -115,6 +120,32 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
 /**
  * sort - sort an array of elements
  * @base: pointer to data to sort
@@ -129,17 +160,20 @@ static void swap_bytes(void *a, void *b, int size)
  * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
+	size_t n = num * size, a = (num/2) * size;
+	const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
@@ -150,32 +184,48 @@ void sort(void *base, size_t num, size_t size,
 			swap_func = swap_bytes;
 	}
 
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
 
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			swap_func(base, base + n, size);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = cmp_func(base + c, base + d) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && cmp_func(base + a, base + b) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			swap_func(base + b, base + c, size);
 		}
 	}
 }
-
 EXPORT_SYMBOL(sort);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-02-21  6:30 ` [PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
@ 2019-02-21  8:21 ` George Spelvin
  2019-02-21  8:21 ` [PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-02-21  8:21 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Similar to what's being done in the net code, this takes advantage of
the fact that most invocations use only a few common swap functions, and
replaces indirect calls to them with (highly predictable) conditional
branches.  (The downside, of course, is that if you *do* use a custom
swap function, there are a few extra predicted branches on the code path.)

This actually *shrinks* the x86-64 code, because it inlines the various
swap functions inside do_swap, eliding function prologues & epilogues.

x86-64 code size 767 -> 703 bytes (-64)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
---
 lib/sort.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index 0d24d0c5c0fc..50855ea8c262 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -54,10 +54,8 @@ static bool is_aligned(const void *base, size_t size, unsigned char align)
  * subtract (since the intervening mov instructions don't alter the flags).
  * Gcc 8.1.0 doesn't have that problem.
  */
-static void swap_words_32(void *a, void *b, int size)
+static void swap_words_32(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		u32 t = *(u32 *)(a + (n -= 4));
 		*(u32 *)(a + n) = *(u32 *)(b + n);
@@ -80,10 +78,8 @@ static void swap_words_32(void *a, void *b, int size)
  * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
  * x32 ABI).  Are there any cases the kernel needs to worry about?
  */
-static void swap_words_64(void *a, void *b, int size)
+static void swap_words_64(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 #ifdef CONFIG_64BIT
 		u64 t = *(u64 *)(a + (n -= 8));
@@ -109,10 +105,8 @@ static void swap_words_64(void *a, void *b, int size)
  *
  * This is the fallback if alignment doesn't allow using larger chunks.
  */
-static void swap_bytes(void *a, void *b, int size)
+static void swap_bytes(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		char t = ((char *)a)[--n];
 		((char *)a)[n] = ((char *)b)[n];
@@ -120,6 +114,33 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought.  Non-zero.
@@ -157,7 +178,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
  * This function does a heapsort on the given array.  You may provide
  * a swap_func function if you need to do something more than a memory
  * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
- * isn't usually a bottleneck.
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * quicksort is slightly faster on average, it suffers from exploitable
@@ -177,11 +198,11 @@ void sort(void *base, size_t num, size_t size,
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
-			swap_func = swap_words_64;
+			swap_func = SWAP_WORDS_64;
 		else if (is_aligned(base, size, 4))
-			swap_func = swap_words_32;
+			swap_func = SWAP_WORDS_32;
 		else
-			swap_func = swap_bytes;
+			swap_func = SWAP_BYTES;
 	}
 
 	/*
@@ -197,7 +218,7 @@ void sort(void *base, size_t num, size_t size,
 		if (a)			/* Building heap: sift down --a */
 			a -= size;
 		else if (n -= size)	/* Sorting: Extract root to --n */
-			swap_func(base, base + n, size);
+			do_swap(base, base + n, size, swap_func);
 		else			/* Sort complete */
 			break;
 
@@ -224,7 +245,7 @@ void sort(void *base, size_t num, size_t size,
 		c = b;			/* Where "a" belongs */
 		while (b != a) {	/* Shift it into place */
 			b = parent(b, lsbit, size);
-			swap_func(base + b, base + c, size);
+			do_swap(base + b, base + c, size, swap_func);
 		}
 	}
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (2 preceding siblings ...)
  2019-02-21  8:21 ` [PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
@ 2019-03-05  3:06 ` George Spelvin
  2019-03-05  5:58 ` [PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  5 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-05  3:06 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Rather than a fixed-size array of pending sorted runs, use the ->prev
links to keep track of things.  This reduces stack usage, eliminates
some ugly overflow handling, and reduces the code size.

Also:
* merge() no longer needs to handle NULL inputs, so simplify.
* The same applies to merge_and_restore_back_links(), which is renamed
  to the less ponderous merge_final().  (It's a static helper function,
  so we don't need a super-descriptive name; comments will do.)
* Document the actual return value requirements on the (*cmp)()
  function; some callers are already using this feature.

x86-64 code size 1086 -> 739 bytes (-347)

(Yes, I see checkpatch complaining about no space after comma in
"__attribute__((nonnull(2,3,4,5)))".  Checkpatch is wrong.)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Feedback-from: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Feedback-from: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 169 ++++++++++++++++++++++++--------------
 2 files changed, 109 insertions(+), 61 deletions(-)

diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956e848d..20f178c24e9d 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,6 +6,7 @@
 
 struct list_head;
 
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 	       int (*cmp)(void *priv, struct list_head *a,
 			  struct list_head *b));
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 85759928215b..fc807dd60a51 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,33 +7,47 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-#define MAX_LIST_LENGTH_BITS 20
+/*
+ * By declaring the compare function with the __pure attribute, we give
+ * the compiler more opportunity to optimize.  Ideally, we'd use this in
+ * the prototype of list_sort(), but that would involve a lot of churn
+ * at all call sites, so just cast the function pointer passed in.
+ */
+typedef int __pure __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+		struct list_head const *, struct list_head const *);
 
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
-static struct list_head *merge(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
 				struct list_head *a, struct list_head *b)
 {
-	struct list_head head, *tail = &head;
+	struct list_head *head, **tail = &head;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
-			tail->next = a;
+		if (cmp(priv, a, b) <= 0) {
+			*tail = a;
+			tail = &a->next;
 			a = a->next;
+			if (!a) {
+				*tail = b;
+				break;
+			}
 		} else {
-			tail->next = b;
+			*tail = b;
+			tail = &b->next;
 			b = b->next;
+			if (!b) {
+				*tail = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a?:b;
-	return head.next;
+	return head;
 }
 
 /*
@@ -43,44 +57,52 @@ static struct list_head *merge(void *priv,
  * prev-link restoration pass, or maintaining the prev links
  * throughout.
  */
-static void merge_and_restore_back_links(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
-				struct list_head *head,
-				struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
 	u8 count = 0;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
+		if (cmp(priv, a, b) <= 0) {
 			tail->next = a;
 			a->prev = tail;
+			tail = a;
 			a = a->next;
+			if (!a)
+				break;
 		} else {
 			tail->next = b;
 			b->prev = tail;
+			tail = b;
 			b = b->next;
+			if (!b) {
+				b = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a ? : b;
 
+	/* Finish linking remainder of list b on to tail */
+	tail->next = b;
 	do {
 		/*
-		 * In worst cases this loop may run many iterations.
+		 * If the merge is highly unbalanced (e.g. the input is
+		 * already sorted), this loop may run many iterations.
 		 * Continue callbacks to the client even though no
 		 * element comparison is needed, so the client's cmp()
 		 * routine can invoke cond_resched() periodically.
 		 */
-		if (unlikely(!(++count)))
-			(*cmp)(priv, tail->next, tail->next);
-
-		tail->next->prev = tail;
-		tail = tail->next;
-	} while (tail->next);
+		if (unlikely(!++count))
+			cmp(priv, b, b);
+		b->prev = tail;
+		tail = b;
+		b = b->next;
+	} while (b);
 
+	/* And the final links to make a circular doubly-linked list */
 	tail->next = head;
 	head->prev = tail;
 }
@@ -91,55 +113,80 @@ static void merge_and_restore_back_links(void *priv,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * This function implements a bottom-up merge sort, which has O(nlog(n))
+ * complexity.  We use depth-first order to take advantage of cacheing.
+ * (E.g. when we get to the fourth element, we immediately merge the
+ * first two 2-element lists.)
  *
- * The comparison function @cmp must return a negative value if @a
- * should sort before @b, and a positive value if @a should sort after
- * @b. If @a and @b are equivalent, and their original relative
- * ordering is to be preserved, @cmp must return 0.
+ * The comparison funtion @cmp must return > 0 if @a should sort after
+ * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
+ * sort before @b *or* their original order should be preserved.  It is
+ * always called with the element that came first in the input in @a,
+ * and list_sort is a stable sort, so it is not necessary to distinguish
+ * the @a < @b and @a == @b cases.
+ *
+ * This is compatible with two styles of @cmp function:
+ * - The traditional style which returns <0 / =0 / >0, or
+ * - Returning a boolean 0/1.
+ * The latter offers a chance to save a few cycles in the comparison
+ * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
+ *
+ * A good way to write a multi-word comparison is
+ *	if (a->high != b->high)
+ *		return a->high > b->high;
+ *	if (a->middle != b->middle)
+ *		return a->middle > b->middle;
+ *	return a->low > b->low;
  */
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 		int (*cmp)(void *priv, struct list_head *a,
 			struct list_head *b))
 {
-	struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
-						-- last slot is a sentinel */
-	int lev;  /* index into part[] */
-	int max_lev = 0;
-	struct list_head *list;
+	struct list_head *list = head->next, *pending = NULL;
+	size_t count = 0;	/* Count of pending */
 
-	if (list_empty(head))
+	if (list == head->prev)	/* Zero or one elements */
 		return;
 
-	memset(part, 0, sizeof(part));
-
+	/* Convert to a null-terminated singly-linked list. */
 	head->prev->next = NULL;
-	list = head->next;
 
-	while (list) {
+	/*
+	 * Data structure invariants:
+	 * - All lists are singly linked and null-terminated; prev
+	 *   pointers are not maintained.
+	 * - pending is a prev-linked "list of lists" of sorted
+	 *   sublists awaiting further merging.
+	 * - Each of the sorted sublists is power-of-two in size,
+	 *   corresponding to bits set in "count".
+	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 */
+	do {
+		size_t bits;
 		struct list_head *cur = list;
+
+		/* Extract the head of "list" as a single-element list "cur" */
 		list = list->next;
 		cur->next = NULL;
 
-		for (lev = 0; part[lev]; lev++) {
-			cur = merge(priv, cmp, part[lev], cur);
-			part[lev] = NULL;
+		/* Do merges corresponding to set lsbits in count */
+		for (bits = count; bits & 1; bits >>= 1) {
+			cur = merge(priv, (cmp_func)cmp, pending, cur);
+			pending = pending->prev;  /* Untouched by merge() */
 		}
-		if (lev > max_lev) {
-			if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
-				printk_once(KERN_DEBUG "list too long for efficiency\n");
-				lev--;
-			}
-			max_lev = lev;
-		}
-		part[lev] = cur;
+		/* And place the result at the head of "pending" */
+		cur->prev = pending;
+		pending = cur;
+		count++;
+	} while (list->next);
+
+	/* Now merge together last element with all pending lists */
+	while (pending->prev) {
+		list = merge(priv, (cmp_func)cmp, pending, list);
+		pending = pending->prev;
 	}
-
-	for (lev = 0; lev < max_lev; lev++)
-		if (part[lev])
-			list = merge(priv, cmp, part[lev], list);
-
-	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+	/* The final merge, rebuilding prev links */
+	merge_final(priv, (cmp_func)cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (3 preceding siblings ...)
  2019-03-05  3:06 ` [PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
@ 2019-03-05  5:58 ` George Spelvin
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  5 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-05  5:58 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

CONFIG_RETPOLINE has severely degraded indirect function call
performance, so it's worth putting some effort into reducing
the number of times cmp() is called.

This patch avoids badly unbalanced merges on unlucky input sizes.
It slightly increases the code size, but saves an average of 0.2*n
calls to cmp().

x86-64 code size 739 -> 803 bytes (+64)

Unfortunately, there's not a lot of low-hanging fruit in a merge
sort; it already performs only n*log2(n) - K*n + O(1) compares.
The leading coefficient is already at the theoretical limit (log2(n!)
corresponds to K=1.4427), so we're fighting over the linear term, and
the best mergesort can do is K=1.2645, achieved when n is a power of 2.

The differences between mergesort variants appear when n is *not*
a power of 2; K is a function of the fractional part of log2(n).
Top-down mergesort does best of all, achieving a minimum K=1.2408, and
an average (over all sizes) K=1.248.  However, that requires knowing
the number of entries to be sorted ahead of time, and making a full
pass over the input to count it conflicts with a second performance
goal, which is cache blocking.

Obviously, we have to read the entire list into L1 cache at some point,
and performance is best if it fits.  But if it doesn't fit, each full
pass over the input causes a cache miss per element, which is undesirable.

While textbooks explain bottom-up mergesort as a succession of merging
passes, practical implementations do merging in depth-first order:
as soon as two lists of the same size are available, they are merged.
This allows as many merge passes as possible to fit into L1; only the
final few merges force cache misses.

This cache-friendly depth-first merge order depends on us merging the
beginning of the input as much as possible before we've even seen the
end of the input (and thus know its size).

The simple eager merge pattern causes bad performance when n is just
over a power of 2.  If n=1028, the final merge is between 1024- and
4-element lists, which is wasteful of comparisons.  (This is actually
worse on average than n=1025, because a 1204:1 merge will, on average,
end after 512 compares, while 1024:4 will walk 4/5 of the list.)

Because of this, bottom-up mergesort achieves K < 0.5 for such sizes,
and has an average (over all sizes) K of around 1.  (My experiments
show K=1.01, while theory predicts K=0.965.)

There are "worst-case optimal" variants of bottom-up mergesort which
avoid this bad performance, but the algorithms given in the literature,
such as queue-mergesort and boustrodephonic mergesort, depend on the
breadth-first multi-pass structure that we are trying to avoid.

This implementation is as eager as possible while ensuring that all merge
passes are at worst 1:2 unbalanced.  This achieves the same average
K=1.207 as queue-mergesort, which is 0.2*n better then bottom-up, and
only 0.04*n behind top-down mergesort.

Specifically, defers merging two lists of size 2^k until it is known
that there are 2^k additional inputs following.  This ensures that the
final uneven merges triggered by reaching the end of the input will be
at worst 2:1.  This will avoid cache misses as long as 3*2^k elements
fit into the cache.

(I confess to being more than a little bit proud of how clean this
code turned out.  It took a lot of thinking, but the resultant inner
loop is very simple and efficient.)

Refs:
  Bottom-up Mergesort: A Detailed Analysis
  Wolfgang Panny, Helmut Prodinger
  Algorithmica 14(4):340--354, October 1995
  https://doi.org/10.1007/BF01294131
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260

  The cost distribution of queue-mergesort, optimal mergesorts, and
  power-of-two rules
  Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen
  Journal of Algorithms 30(2); Pages 423--448, February 1999
  https://doi.org/10.1006/jagm.1998.0986
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380

  Queue-Mergesort
  Mordecai J. Golin, Robert Sedgewick
  Information Processing Letters, 48(5):253--259, 10 December 1993
  https://doi.org/10.1016/0020-0190(93)90088-q
  https://sci-hub.tw/10.1016/0020-0190(93)90088-Q

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
 lib/list_sort.c | 115 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 23 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index fc807dd60a51..623a9158ac8a 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -113,11 +113,6 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements a bottom-up merge sort, which has O(nlog(n))
- * complexity.  We use depth-first order to take advantage of cacheing.
- * (E.g. when we get to the fourth element, we immediately merge the
- * first two 2-element lists.)
- *
  * The comparison funtion @cmp must return > 0 if @a should sort after
  * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
  * sort before @b *or* their original order should be preserved.  It is
@@ -137,6 +132,60 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  *	if (a->middle != b->middle)
  *		return a->middle > b->middle;
  *	return a->low > b->low;
+ *
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache.  Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists.  This is beautiully simple code, but rather subtle.
+ *
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0.  Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ *   is count >= 2^(k+1)).
+ * There are six states we distinguish.  "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
+ * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
+ * bit k-1 is set while the more significant bits are non-zero) and
+ * merge them away in the 5->2 transition.  Note in particular that just
+ * before the 5->2 transition, all lower-order bits are 11 (state 3),
+ * so there is one list of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest.  If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
 __attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
@@ -158,33 +207,53 @@ void list_sort(void *priv, struct list_head *head,
 	 *   pointers are not maintained.
 	 * - pending is a prev-linked "list of lists" of sorted
 	 *   sublists awaiting further merging.
-	 * - Each of the sorted sublists is power-of-two in size,
-	 *   corresponding to bits set in "count".
+	 * - Each of the sorted sublists is power-of-two in size.
 	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 * - There are zero to two sublists of each size.
+	 * - A pair of pending sublists are merged as soon as the number
+	 *   of following pending elements equals their size (i.e.
+	 *   each time count reaches an odd multiple of that size).
+	 *   That ensures each later final merge will be at worst 2:1.
+	 * - Each round consists of:
+	 *   - Merging the two sublists selected by the highest bit
+	 *     which flips when count is incremented, and
+	 *   - Adding an element from the input as a size-1 sublist.
 	 */
 	do {
 		size_t bits;
-		struct list_head *cur = list;
+		struct list_head **tail = &pending;
 
-		/* Extract the head of "list" as a single-element list "cur" */
-		list = list->next;
-		cur->next = NULL;
+		/* Find the least-significant clear bit in count */
+		for (bits = count; bits & 1; bits >>= 1)
+			tail = &(*tail)->prev;
+		/* Do the indicated merge */
+		if (likely(bits)) {
+			struct list_head *a = *tail, *b = a->prev;
 
-		/* Do merges corresponding to set lsbits in count */
-		for (bits = count; bits & 1; bits >>= 1) {
-			cur = merge(priv, (cmp_func)cmp, pending, cur);
-			pending = pending->prev;  /* Untouched by merge() */
+			a = merge(priv, (cmp_func)cmp, b, a);
+			/* Install the merged result in place of the inputs */
+			a->prev = b->prev;
+			*tail = a;
 		}
-		/* And place the result at the head of "pending" */
-		cur->prev = pending;
-		pending = cur;
-		count++;
-	} while (list->next);
 
-	/* Now merge together last element with all pending lists */
-	while (pending->prev) {
+		/* Move one element from input list to pending */
+		list->prev = pending;
+		pending = list;
+		list = list->next;
+		pending->next = NULL;
+		count++;
+	} while (list);
+
+	/* End of input; merge together all the pending lists. */
+	list = pending;
+	pending = pending->prev;
+	for (;;) {
+		struct list_head *next = pending->prev;
+
+		if (!next)
+			break;
 		list = merge(priv, (cmp_func)cmp, pending, list);
-		pending = pending->prev;
+		pending = next;
 	}
 	/* The final merge, rebuilding prev links */
 	merge_final(priv, (cmp_func)cmp, head, pending, list);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller
@ 2019-03-16  2:43 George Spelvin
  2019-02-21  6:30 ` [PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
                   ` (5 more replies)
  0 siblings, 6 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-16  2:43 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

v1->v2:	Various spelling, naming and code style cleanups.
	Generally positive and no negative responses to the
	goals and algorithms used.

	I'm running these patches, with CONFIG_TEST_SORT and
	CONFIG_TEST_LIST_SORT, on the machine I'm sending this from.
	I have tweaked the comments further, but I have verified
	the compiled object code is identical to a snapshot I took
	when I rebooted.

	As far as I'm concerned, this is ready to be merged.
	As there is no owner in MAINTAINERS, I was thinking of
	sending it via AKPM, like the recent lib/lzo changes.
	Andrew, is that okay with you?

Because CONFIG_RETPOLINE has made indirect calls much more expensive,
I thought I'd try to reduce the number made by the library sort
functions.

The first three patches apply to lib/sort.c.

Patch #1 is a simple optimization.  The built-in swap has special cases
for aligned 4- and 8-byte objects.  But those are almost never used;
most calls to sort() work on larger structures, which fall back to the
byte-at-a-time loop.  This generalizes them to aligned *multiples* of 4
and 8 bytes.  (If nothing else, it saves an awful lot of energy by not
thrashing the store buffers as much.)

Patch #2 grabs a juicy piece of low-hanging fruit.  I agree that
nice simple solid heapsort is preferable to more complex algorithms
(sorry, Andrey), but it's possible to implement heapsort with far fewer
comparisons (50% asymptotically, 25-40% reduction for realistic sizes)
than the way it's been done up to now.  And with some care, the code
ends up smaller, as well.  This is the "big win" patch.

Patch #3 adds the same sort of indirect call bypass that has been added
to the net code of late.  The great majority of the callers use the
builtin swap functions, so replace the indirect call to sort_func with a
(highly preditable) series of if() statements.  Rather surprisingly,
this decreased code size, as the swap functions were inlined and their
prologue & epilogue code eliminated.

lib/list_sort.c is a bit trickier, as merge sort is already close to
optimal, and we don't want to introduce triumphs of theory over
practicality like the Ford-Johnson merge-insertion sort.

Patch #4, without changing the algorithm, chops 32% off the code size and
removes the part[MAX_LIST_LENGTH+1] pointer array (and the corresponding
upper limit on efficiently sortable input size).

Patch #5 improves the algorithm.  The previous code is already optimal
for power-of-two (or slightly smaller) size inputs, but when the input
size is just over a power of 2, there's a very unbalanced final merge.

There are, in the literature, several algorithms which solve this, but
they all depend on the "breadth-first" merge order which was replaced
by commit 835cc0c8477f with a more cache-friendly "depth-first" order.
Some hard thinking came up with a depth-first algorithm which defers
merges as little as possible while avoiding bad merges.  This saves
0.2*n compares, averaged over all sizes.

The code size increase is minimal (64 bytes on x86-64, reducing the net
savings to 26%), but the comments expanded significantly to document
the clever algorithm.


TESTING NOTES: I have some ugly user-space benchmarking code
which I used for testing before moving this code into the kernel.
Shout if you want a copy.

I'm running this code right now, with CONFIG_TEST_SORT and
CONFIG_TEST_LIST_SORT, but I confess I haven't rebooted since
the last round of minor edits to quell checkpatch.  I figure there
will be at least one round of comments and final testing.

George Spelvin (5):
  lib/sort: Make swap functions more generic
  lib/sort: Use more efficient bottom-up heapsort variant
  lib/sort: Avoid indirect calls to built-in swap
  lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  lib/list_sort: Optimize number of calls to comparison function

 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 244 +++++++++++++++++++++++++---------
 lib/sort.c                | 266 +++++++++++++++++++++++++++++---------
 3 files changed, 387 insertions(+), 124 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller
  2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (4 preceding siblings ...)
  2019-03-05  5:58 ` [PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
@ 2019-03-19  8:15 ` George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
                     ` (6 more replies)
  5 siblings, 7 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:15 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

(Resend because earlier send had GIT_AUTHOR_DATE in the e-mail
headers and got filed with last month's archives.  And probably
tripped a few spam filters.)

v1->v2:	Various spelling, naming and code style cleanups.
	Generally positive and no negative responses to the
	goals and algorithms used.

	I'm running these patches, with CONFIG_TEST_SORT and
	CONFIG_TEST_LIST_SORT, on the machine I'm sending this from.
	I have tweaked the comments further, but I have verified
	the compiled object code is identical to a snapshot I took
	when I rebooted.

	As far as I'm concerned, this is ready to be merged.
	As there is no owner in MAINTAINERS, I was thinking of
	sending it via AKPM, like the recent lib/lzo changes.
	Andrew, is that okay with you?

Because CONFIG_RETPOLINE has made indirect calls much more expensive,
I thought I'd try to reduce the number made by the library sort
functions.

The first three patches apply to lib/sort.c.

Patch #1 is a simple optimization.  The built-in swap has special cases
for aligned 4- and 8-byte objects.  But those are almost never used;
most calls to sort() work on larger structures, which fall back to the
byte-at-a-time loop.  This generalizes them to aligned *multiples* of 4
and 8 bytes.  (If nothing else, it saves an awful lot of energy by not
thrashing the store buffers as much.)

Patch #2 grabs a juicy piece of low-hanging fruit.  I agree that
nice simple solid heapsort is preferable to more complex algorithms
(sorry, Andrey), but it's possible to implement heapsort with far fewer
comparisons (50% asymptotically, 25-40% reduction for realistic sizes)
than the way it's been done up to now.  And with some care, the code
ends up smaller, as well.  This is the "big win" patch.

Patch #3 adds the same sort of indirect call bypass that has been added
to the net code of late.  The great majority of the callers use the
builtin swap functions, so replace the indirect call to sort_func with a
(highly preditable) series of if() statements.  Rather surprisingly,
this decreased code size, as the swap functions were inlined and their
prologue & epilogue code eliminated.

lib/list_sort.c is a bit trickier, as merge sort is already close to
optimal, and we don't want to introduce triumphs of theory over
practicality like the Ford-Johnson merge-insertion sort.

Patch #4, without changing the algorithm, chops 32% off the code size and
removes the part[MAX_LIST_LENGTH+1] pointer array (and the corresponding
upper limit on efficiently sortable input size).

Patch #5 improves the algorithm.  The previous code is already optimal
for power-of-two (or slightly smaller) size inputs, but when the input
size is just over a power of 2, there's a very unbalanced final merge.

There are, in the literature, several algorithms which solve this, but
they all depend on the "breadth-first" merge order which was replaced
by commit 835cc0c8477f with a more cache-friendly "depth-first" order.
Some hard thinking came up with a depth-first algorithm which defers
merges as little as possible while avoiding bad merges.  This saves
0.2*n compares, averaged over all sizes.

The code size increase is minimal (64 bytes on x86-64, reducing the net
savings to 26%), but the comments expanded significantly to document
the clever algorithm.


TESTING NOTES: I have some ugly user-space benchmarking code
which I used for testing before moving this code into the kernel.
Shout if you want a copy.

I'm running this code right now, with CONFIG_TEST_SORT and
CONFIG_TEST_LIST_SORT, but I confess I haven't rebooted since
the last round of minor edits to quell checkpatch.  I figure there
will be at least one round of comments and final testing.

George Spelvin (5):
  lib/sort: Make swap functions more generic
  lib/sort: Use more efficient bottom-up heapsort variant
  lib/sort: Avoid indirect calls to built-in swap
  lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  lib/list_sort: Optimize number of calls to comparison function

 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 244 +++++++++++++++++++++++++---------
 lib/sort.c                | 266 +++++++++++++++++++++++++++++---------
 3 files changed, 387 insertions(+), 124 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
@ 2019-03-19  8:15   ` George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:15 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Rather than having special-case swap functions for 4- and 8-byte objects,
special-case aligned multiples of 4 or 8 bytes.  This speeds up most
users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)

The functions also got renamed to reflect the fact that they support
multiple words.  In the great tradition of bikeshedding, the names were
by far the most contentious issue during review of this patch series.

x86-64 code size 872 -> 886 bytes (+14)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Feedback-from: Geert Uytterhoeven <geert@linux-m68k.org>
---
 lib/sort.c | 135 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 105 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..ec79eac85e21 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,108 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
 }
 
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, int size)
 {
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-	char t;
+	size_t n = (unsigned int)size;
 
 	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, int size)
+{
+	size_t n = (unsigned int)size;
+
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, int size)
+{
+	size_t n = (unsigned int)size;
+
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
 }
 
 /**
@@ -50,8 +123,10 @@ static void generic_swap(void *a, void *b, int size)
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
@@ -67,12 +142,12 @@ void sort(void *base, size_t num, size_t size,
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
 	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
+		if (is_aligned(base, size, 8))
+			swap_func = swap_words_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = swap_words_32;
 		else
-			swap_func = generic_swap;
+			swap_func = swap_bytes;
 	}
 
 	/* heapify */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
@ 2019-03-19  8:15   ` George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:15 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

This uses fewer comparisons than the previous code (approaching half
as many for large random inputs), but produces identical results;
it actually performs the exact same series of swap operations.

Specifically, it reduces the average number of compares from
2*n*log2(n) - 3*n + o(n)  to  n*log2(n) + 0.37*n + o(n).

This is still 1.63*n worse than glibc qsort() which manages
n*log2(n) - 1.26*n, but at least the leading coefficient is correct.

Standard heapsort, when sifting down, performs two comparisons
per level: one to find the greater child, and a second to see
if the current node should be exchanged with that child.

Bottom-up heapsort observes that it's better to postpone the second
comparison and search for the leaf where -infinity would be sent
to, then search back *up* for the current node's destination.

Since sifting down usually proceeds to the leaf level (that's where
half the nodes are), this does O(1) second comparisons rather
than log2(n).  That saves a lot of (expensive since Spectre)
indirect function calls.

The one time it's worse than the previous code is if there are
large numbers of duplicate keys, when the top-down algorithm is
O(n) and bottom-up is O(n log n).  For distinct keys, it's provably
always better, doing 1.5*n*log2(n) + O(n) in the worst case.

(The code is not significantly more complex.  This patch also
merges the heap-building and -extracting sift-down loops,
resulting in a net code size savings.)

x86-64 code size 885 -> 767 bytes (-118)

(I see the checkpatch complaint about "else if (n -= size)".
The alternative is significantly uglier.)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
---
 lib/sort.c | 110 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index ec79eac85e21..0d24d0c5c0fc 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
  *
- * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -15,7 +20,7 @@
  * is_aligned - is this pointer & size okay for word-wide copying?
  * @base: pointer to data
  * @size: size of each element
- * @align: required aignment (typically 4 or 8)
+ * @align: required alignment (typically 4 or 8)
  *
  * Returns true if elements can be copied using word loads and stores.
  * The size must be a multiple of the alignment, and the base address must
@@ -115,6 +120,32 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
 /**
  * sort - sort an array of elements
  * @base: pointer to data to sort
@@ -129,17 +160,20 @@ static void swap_bytes(void *a, void *b, int size)
  * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
+	size_t n = num * size, a = (num/2) * size;
+	const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
@@ -150,32 +184,48 @@ void sort(void *base, size_t num, size_t size,
 			swap_func = swap_bytes;
 	}
 
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
 
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			swap_func(base, base + n, size);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = cmp_func(base + c, base + d) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && cmp_func(base + a, base + b) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			swap_func(base + b, base + c, size);
 		}
 	}
 }
-
 EXPORT_SYMBOL(sort);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
  2019-03-19  8:15   ` [RESEND PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
@ 2019-03-19  8:15   ` George Spelvin
  2019-03-19  8:16   ` [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:15 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Similar to what's being done in the net code, this takes advantage of
the fact that most invocations use only a few common swap functions, and
replaces indirect calls to them with (highly predictable) conditional
branches.  (The downside, of course, is that if you *do* use a custom
swap function, there are a few extra predicted branches on the code path.)

This actually *shrinks* the x86-64 code, because it inlines the various
swap functions inside do_swap, eliding function prologues & epilogues.

x86-64 code size 767 -> 703 bytes (-64)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
---
 lib/sort.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index 0d24d0c5c0fc..50855ea8c262 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -54,10 +54,8 @@ static bool is_aligned(const void *base, size_t size, unsigned char align)
  * subtract (since the intervening mov instructions don't alter the flags).
  * Gcc 8.1.0 doesn't have that problem.
  */
-static void swap_words_32(void *a, void *b, int size)
+static void swap_words_32(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		u32 t = *(u32 *)(a + (n -= 4));
 		*(u32 *)(a + n) = *(u32 *)(b + n);
@@ -80,10 +78,8 @@ static void swap_words_32(void *a, void *b, int size)
  * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
  * x32 ABI).  Are there any cases the kernel needs to worry about?
  */
-static void swap_words_64(void *a, void *b, int size)
+static void swap_words_64(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 #ifdef CONFIG_64BIT
 		u64 t = *(u64 *)(a + (n -= 8));
@@ -109,10 +105,8 @@ static void swap_words_64(void *a, void *b, int size)
  *
  * This is the fallback if alignment doesn't allow using larger chunks.
  */
-static void swap_bytes(void *a, void *b, int size)
+static void swap_bytes(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		char t = ((char *)a)[--n];
 		((char *)a)[n] = ((char *)b)[n];
@@ -120,6 +114,33 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought.  Non-zero.
@@ -157,7 +178,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
  * This function does a heapsort on the given array.  You may provide
  * a swap_func function if you need to do something more than a memory
  * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
- * isn't usually a bottleneck.
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * quicksort is slightly faster on average, it suffers from exploitable
@@ -177,11 +198,11 @@ void sort(void *base, size_t num, size_t size,
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
-			swap_func = swap_words_64;
+			swap_func = SWAP_WORDS_64;
 		else if (is_aligned(base, size, 4))
-			swap_func = swap_words_32;
+			swap_func = SWAP_WORDS_32;
 		else
-			swap_func = swap_bytes;
+			swap_func = SWAP_BYTES;
 	}
 
 	/*
@@ -197,7 +218,7 @@ void sort(void *base, size_t num, size_t size,
 		if (a)			/* Building heap: sift down --a */
 			a -= size;
 		else if (n -= size)	/* Sorting: Extract root to --n */
-			swap_func(base, base + n, size);
+			do_swap(base, base + n, size, swap_func);
 		else			/* Sort complete */
 			break;
 
@@ -224,7 +245,7 @@ void sort(void *base, size_t num, size_t size,
 		c = b;			/* Where "a" belongs */
 		while (b != a) {	/* Shift it into place */
 			b = parent(b, lsbit, size);
-			swap_func(base + b, base + c, size);
+			do_swap(base + b, base + c, size, swap_func);
 		}
 	}
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                     ` (2 preceding siblings ...)
  2019-03-19  8:15   ` [RESEND PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
@ 2019-03-19  8:16   ` George Spelvin
  2019-03-28 22:08     ` Andrew Morton
  2019-03-29  4:31     ` [PATCH 6/5] lib/list_sort: Fix GCC warning George Spelvin
  2019-03-19  8:16   ` [RESEND PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
                     ` (2 subsequent siblings)
  6 siblings, 2 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:16 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

Rather than a fixed-size array of pending sorted runs, use the ->prev
links to keep track of things.  This reduces stack usage, eliminates
some ugly overflow handling, and reduces the code size.

Also:
* merge() no longer needs to handle NULL inputs, so simplify.
* The same applies to merge_and_restore_back_links(), which is renamed
  to the less ponderous merge_final().  (It's a static helper function,
  so we don't need a super-descriptive name; comments will do.)
* Document the actual return value requirements on the (*cmp)()
  function; some callers are already using this feature.

x86-64 code size 1086 -> 739 bytes (-347)

(Yes, I see checkpatch complaining about no space after comma in
"__attribute__((nonnull(2,3,4,5)))".  Checkpatch is wrong.)

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Feedback-from: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Feedback-from: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 169 ++++++++++++++++++++++++--------------
 2 files changed, 109 insertions(+), 61 deletions(-)

diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956e848d..20f178c24e9d 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,6 +6,7 @@
 
 struct list_head;
 
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 	       int (*cmp)(void *priv, struct list_head *a,
 			  struct list_head *b));
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 85759928215b..fc807dd60a51 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,33 +7,47 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-#define MAX_LIST_LENGTH_BITS 20
+/*
+ * By declaring the compare function with the __pure attribute, we give
+ * the compiler more opportunity to optimize.  Ideally, we'd use this in
+ * the prototype of list_sort(), but that would involve a lot of churn
+ * at all call sites, so just cast the function pointer passed in.
+ */
+typedef int __pure __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+		struct list_head const *, struct list_head const *);
 
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
-static struct list_head *merge(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
 				struct list_head *a, struct list_head *b)
 {
-	struct list_head head, *tail = &head;
+	struct list_head *head, **tail = &head;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
-			tail->next = a;
+		if (cmp(priv, a, b) <= 0) {
+			*tail = a;
+			tail = &a->next;
 			a = a->next;
+			if (!a) {
+				*tail = b;
+				break;
+			}
 		} else {
-			tail->next = b;
+			*tail = b;
+			tail = &b->next;
 			b = b->next;
+			if (!b) {
+				*tail = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a?:b;
-	return head.next;
+	return head;
 }
 
 /*
@@ -43,44 +57,52 @@ static struct list_head *merge(void *priv,
  * prev-link restoration pass, or maintaining the prev links
  * throughout.
  */
-static void merge_and_restore_back_links(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
-				struct list_head *head,
-				struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
 	u8 count = 0;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
+		if (cmp(priv, a, b) <= 0) {
 			tail->next = a;
 			a->prev = tail;
+			tail = a;
 			a = a->next;
+			if (!a)
+				break;
 		} else {
 			tail->next = b;
 			b->prev = tail;
+			tail = b;
 			b = b->next;
+			if (!b) {
+				b = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a ? : b;
 
+	/* Finish linking remainder of list b on to tail */
+	tail->next = b;
 	do {
 		/*
-		 * In worst cases this loop may run many iterations.
+		 * If the merge is highly unbalanced (e.g. the input is
+		 * already sorted), this loop may run many iterations.
 		 * Continue callbacks to the client even though no
 		 * element comparison is needed, so the client's cmp()
 		 * routine can invoke cond_resched() periodically.
 		 */
-		if (unlikely(!(++count)))
-			(*cmp)(priv, tail->next, tail->next);
-
-		tail->next->prev = tail;
-		tail = tail->next;
-	} while (tail->next);
+		if (unlikely(!++count))
+			cmp(priv, b, b);
+		b->prev = tail;
+		tail = b;
+		b = b->next;
+	} while (b);
 
+	/* And the final links to make a circular doubly-linked list */
 	tail->next = head;
 	head->prev = tail;
 }
@@ -91,55 +113,80 @@ static void merge_and_restore_back_links(void *priv,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * This function implements a bottom-up merge sort, which has O(nlog(n))
+ * complexity.  We use depth-first order to take advantage of cacheing.
+ * (E.g. when we get to the fourth element, we immediately merge the
+ * first two 2-element lists.)
  *
- * The comparison function @cmp must return a negative value if @a
- * should sort before @b, and a positive value if @a should sort after
- * @b. If @a and @b are equivalent, and their original relative
- * ordering is to be preserved, @cmp must return 0.
+ * The comparison funtion @cmp must return > 0 if @a should sort after
+ * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
+ * sort before @b *or* their original order should be preserved.  It is
+ * always called with the element that came first in the input in @a,
+ * and list_sort is a stable sort, so it is not necessary to distinguish
+ * the @a < @b and @a == @b cases.
+ *
+ * This is compatible with two styles of @cmp function:
+ * - The traditional style which returns <0 / =0 / >0, or
+ * - Returning a boolean 0/1.
+ * The latter offers a chance to save a few cycles in the comparison
+ * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
+ *
+ * A good way to write a multi-word comparison is
+ *	if (a->high != b->high)
+ *		return a->high > b->high;
+ *	if (a->middle != b->middle)
+ *		return a->middle > b->middle;
+ *	return a->low > b->low;
  */
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 		int (*cmp)(void *priv, struct list_head *a,
 			struct list_head *b))
 {
-	struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
-						-- last slot is a sentinel */
-	int lev;  /* index into part[] */
-	int max_lev = 0;
-	struct list_head *list;
+	struct list_head *list = head->next, *pending = NULL;
+	size_t count = 0;	/* Count of pending */
 
-	if (list_empty(head))
+	if (list == head->prev)	/* Zero or one elements */
 		return;
 
-	memset(part, 0, sizeof(part));
-
+	/* Convert to a null-terminated singly-linked list. */
 	head->prev->next = NULL;
-	list = head->next;
 
-	while (list) {
+	/*
+	 * Data structure invariants:
+	 * - All lists are singly linked and null-terminated; prev
+	 *   pointers are not maintained.
+	 * - pending is a prev-linked "list of lists" of sorted
+	 *   sublists awaiting further merging.
+	 * - Each of the sorted sublists is power-of-two in size,
+	 *   corresponding to bits set in "count".
+	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 */
+	do {
+		size_t bits;
 		struct list_head *cur = list;
+
+		/* Extract the head of "list" as a single-element list "cur" */
 		list = list->next;
 		cur->next = NULL;
 
-		for (lev = 0; part[lev]; lev++) {
-			cur = merge(priv, cmp, part[lev], cur);
-			part[lev] = NULL;
+		/* Do merges corresponding to set lsbits in count */
+		for (bits = count; bits & 1; bits >>= 1) {
+			cur = merge(priv, (cmp_func)cmp, pending, cur);
+			pending = pending->prev;  /* Untouched by merge() */
 		}
-		if (lev > max_lev) {
-			if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
-				printk_once(KERN_DEBUG "list too long for efficiency\n");
-				lev--;
-			}
-			max_lev = lev;
-		}
-		part[lev] = cur;
+		/* And place the result at the head of "pending" */
+		cur->prev = pending;
+		pending = cur;
+		count++;
+	} while (list->next);
+
+	/* Now merge together last element with all pending lists */
+	while (pending->prev) {
+		list = merge(priv, (cmp_func)cmp, pending, list);
+		pending = pending->prev;
 	}
-
-	for (lev = 0; lev < max_lev; lev++)
-		if (part[lev])
-			list = merge(priv, cmp, part[lev], list);
-
-	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+	/* The final merge, rebuilding prev links */
+	merge_final(priv, (cmp_func)cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [RESEND PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                     ` (3 preceding siblings ...)
  2019-03-19  8:16   ` [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
@ 2019-03-19  8:16   ` George Spelvin
  2019-03-19 10:56   ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller Rasmus Villemoes
  2019-03-19 14:01   ` Andy Shevchenko
  6 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-19  8:16 UTC (permalink / raw)
  To: linux-kernel, kernel-janitors, Andrew Morton
  Cc: George Spelvin, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner,
	Andy Shevchenko

CONFIG_RETPOLINE has severely degraded indirect function call
performance, so it's worth putting some effort into reducing
the number of times cmp() is called.

This patch avoids badly unbalanced merges on unlucky input sizes.
It slightly increases the code size, but saves an average of 0.2*n
calls to cmp().

x86-64 code size 739 -> 803 bytes (+64)

Unfortunately, there's not a lot of low-hanging fruit in a merge
sort; it already performs only n*log2(n) - K*n + O(1) compares.
The leading coefficient is already at the theoretical limit (log2(n!)
corresponds to K=1.4427), so we're fighting over the linear term, and
the best mergesort can do is K=1.2645, achieved when n is a power of 2.

The differences between mergesort variants appear when n is *not*
a power of 2; K is a function of the fractional part of log2(n).
Top-down mergesort does best of all, achieving a minimum K=1.2408, and
an average (over all sizes) K=1.248.  However, that requires knowing
the number of entries to be sorted ahead of time, and making a full
pass over the input to count it conflicts with a second performance
goal, which is cache blocking.

Obviously, we have to read the entire list into L1 cache at some point,
and performance is best if it fits.  But if it doesn't fit, each full
pass over the input causes a cache miss per element, which is undesirable.

While textbooks explain bottom-up mergesort as a succession of merging
passes, practical implementations do merging in depth-first order:
as soon as two lists of the same size are available, they are merged.
This allows as many merge passes as possible to fit into L1; only the
final few merges force cache misses.

This cache-friendly depth-first merge order depends on us merging the
beginning of the input as much as possible before we've even seen the
end of the input (and thus know its size).

The simple eager merge pattern causes bad performance when n is just
over a power of 2.  If n=1028, the final merge is between 1024- and
4-element lists, which is wasteful of comparisons.  (This is actually
worse on average than n=1025, because a 1204:1 merge will, on average,
end after 512 compares, while 1024:4 will walk 4/5 of the list.)

Because of this, bottom-up mergesort achieves K < 0.5 for such sizes,
and has an average (over all sizes) K of around 1.  (My experiments
show K=1.01, while theory predicts K=0.965.)

There are "worst-case optimal" variants of bottom-up mergesort which
avoid this bad performance, but the algorithms given in the literature,
such as queue-mergesort and boustrodephonic mergesort, depend on the
breadth-first multi-pass structure that we are trying to avoid.

This implementation is as eager as possible while ensuring that all merge
passes are at worst 1:2 unbalanced.  This achieves the same average
K=1.207 as queue-mergesort, which is 0.2*n better then bottom-up, and
only 0.04*n behind top-down mergesort.

Specifically, defers merging two lists of size 2^k until it is known
that there are 2^k additional inputs following.  This ensures that the
final uneven merges triggered by reaching the end of the input will be
at worst 2:1.  This will avoid cache misses as long as 3*2^k elements
fit into the cache.

(I confess to being more than a little bit proud of how clean this
code turned out.  It took a lot of thinking, but the resultant inner
loop is very simple and efficient.)

Refs:
  Bottom-up Mergesort: A Detailed Analysis
  Wolfgang Panny, Helmut Prodinger
  Algorithmica 14(4):340--354, October 1995
  https://doi.org/10.1007/BF01294131
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260

  The cost distribution of queue-mergesort, optimal mergesorts, and
  power-of-two rules
  Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen
  Journal of Algorithms 30(2); Pages 423--448, February 1999
  https://doi.org/10.1006/jagm.1998.0986
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380

  Queue-Mergesort
  Mordecai J. Golin, Robert Sedgewick
  Information Processing Letters, 48(5):253--259, 10 December 1993
  https://doi.org/10.1016/0020-0190(93)90088-q
  https://sci-hub.tw/10.1016/0020-0190(93)90088-Q

Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Feedback-from: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
 lib/list_sort.c | 115 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 23 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index fc807dd60a51..623a9158ac8a 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -113,11 +113,6 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements a bottom-up merge sort, which has O(nlog(n))
- * complexity.  We use depth-first order to take advantage of cacheing.
- * (E.g. when we get to the fourth element, we immediately merge the
- * first two 2-element lists.)
- *
  * The comparison funtion @cmp must return > 0 if @a should sort after
  * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
  * sort before @b *or* their original order should be preserved.  It is
@@ -137,6 +132,60 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  *	if (a->middle != b->middle)
  *		return a->middle > b->middle;
  *	return a->low > b->low;
+ *
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache.  Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists.  This is beautiully simple code, but rather subtle.
+ *
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0.  Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ *   is count >= 2^(k+1)).
+ * There are six states we distinguish.  "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
+ * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
+ * bit k-1 is set while the more significant bits are non-zero) and
+ * merge them away in the 5->2 transition.  Note in particular that just
+ * before the 5->2 transition, all lower-order bits are 11 (state 3),
+ * so there is one list of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest.  If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
 __attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
@@ -158,33 +207,53 @@ void list_sort(void *priv, struct list_head *head,
 	 *   pointers are not maintained.
 	 * - pending is a prev-linked "list of lists" of sorted
 	 *   sublists awaiting further merging.
-	 * - Each of the sorted sublists is power-of-two in size,
-	 *   corresponding to bits set in "count".
+	 * - Each of the sorted sublists is power-of-two in size.
 	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 * - There are zero to two sublists of each size.
+	 * - A pair of pending sublists are merged as soon as the number
+	 *   of following pending elements equals their size (i.e.
+	 *   each time count reaches an odd multiple of that size).
+	 *   That ensures each later final merge will be at worst 2:1.
+	 * - Each round consists of:
+	 *   - Merging the two sublists selected by the highest bit
+	 *     which flips when count is incremented, and
+	 *   - Adding an element from the input as a size-1 sublist.
 	 */
 	do {
 		size_t bits;
-		struct list_head *cur = list;
+		struct list_head **tail = &pending;
 
-		/* Extract the head of "list" as a single-element list "cur" */
-		list = list->next;
-		cur->next = NULL;
+		/* Find the least-significant clear bit in count */
+		for (bits = count; bits & 1; bits >>= 1)
+			tail = &(*tail)->prev;
+		/* Do the indicated merge */
+		if (likely(bits)) {
+			struct list_head *a = *tail, *b = a->prev;
 
-		/* Do merges corresponding to set lsbits in count */
-		for (bits = count; bits & 1; bits >>= 1) {
-			cur = merge(priv, (cmp_func)cmp, pending, cur);
-			pending = pending->prev;  /* Untouched by merge() */
+			a = merge(priv, (cmp_func)cmp, b, a);
+			/* Install the merged result in place of the inputs */
+			a->prev = b->prev;
+			*tail = a;
 		}
-		/* And place the result at the head of "pending" */
-		cur->prev = pending;
-		pending = cur;
-		count++;
-	} while (list->next);
 
-	/* Now merge together last element with all pending lists */
-	while (pending->prev) {
+		/* Move one element from input list to pending */
+		list->prev = pending;
+		pending = list;
+		list = list->next;
+		pending->next = NULL;
+		count++;
+	} while (list);
+
+	/* End of input; merge together all the pending lists. */
+	list = pending;
+	pending = pending->prev;
+	for (;;) {
+		struct list_head *next = pending->prev;
+
+		if (!next)
+			break;
 		list = merge(priv, (cmp_func)cmp, pending, list);
-		pending = pending->prev;
+		pending = next;
 	}
 	/* The final merge, rebuilding prev links */
 	merge_final(priv, (cmp_func)cmp, head, pending, list);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                     ` (4 preceding siblings ...)
  2019-03-19  8:16   ` [RESEND PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
@ 2019-03-19 10:56   ` Rasmus Villemoes
  2019-03-19 14:01   ` Andy Shevchenko
  6 siblings, 0 replies; 17+ messages in thread
From: Rasmus Villemoes @ 2019-03-19 10:56 UTC (permalink / raw)
  To: George Spelvin, linux-kernel, kernel-janitors, Andrew Morton
  Cc: Andrey Abramov, Geert Uytterhoeven, Daniel Wagner, Don Mullis,
	Dave Chinner, Andy Shevchenko

On 19/03/2019 09.15, George Spelvin wrote:
> 
> Because CONFIG_RETPOLINE has made indirect calls much more expensive,
> I thought I'd try to reduce the number made by the library sort
> functions.

For the series,

Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>

and let's have more commits with such detailed changelogs and references
to academic work combined with "but in the real world we have to do this
and that tradeoff..." :)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller
  2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                     ` (5 preceding siblings ...)
  2019-03-19 10:56   ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller Rasmus Villemoes
@ 2019-03-19 14:01   ` Andy Shevchenko
  6 siblings, 0 replies; 17+ messages in thread
From: Andy Shevchenko @ 2019-03-19 14:01 UTC (permalink / raw)
  To: George Spelvin
  Cc: linux-kernel, kernel-janitors, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner

On Tue, Mar 19, 2019 at 08:15:56AM +0000, George Spelvin wrote:
> (Resend because earlier send had GIT_AUTHOR_DATE in the e-mail
> headers and got filed with last month's archives.  And probably
> tripped a few spam filters.)

I got both.

> 
> v1->v2:	Various spelling, naming and code style cleanups.
> 	Generally positive and no negative responses to the
> 	goals and algorithms used.
> 
> 	I'm running these patches, with CONFIG_TEST_SORT and
> 	CONFIG_TEST_LIST_SORT, on the machine I'm sending this from.
> 	I have tweaked the comments further, but I have verified
> 	the compiled object code is identical to a snapshot I took
> 	when I rebooted.
> 
> 	As far as I'm concerned, this is ready to be merged.
> 	As there is no owner in MAINTAINERS, I was thinking of
> 	sending it via AKPM, like the recent lib/lzo changes.
> 	Andrew, is that okay with you?
> 

Thanks for this improvement. I like when academic work is used in real life!

FWIW,
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>

> Because CONFIG_RETPOLINE has made indirect calls much more expensive,
> I thought I'd try to reduce the number made by the library sort
> functions.
> 
> The first three patches apply to lib/sort.c.
> 
> Patch #1 is a simple optimization.  The built-in swap has special cases
> for aligned 4- and 8-byte objects.  But those are almost never used;
> most calls to sort() work on larger structures, which fall back to the
> byte-at-a-time loop.  This generalizes them to aligned *multiples* of 4
> and 8 bytes.  (If nothing else, it saves an awful lot of energy by not
> thrashing the store buffers as much.)
> 
> Patch #2 grabs a juicy piece of low-hanging fruit.  I agree that
> nice simple solid heapsort is preferable to more complex algorithms
> (sorry, Andrey), but it's possible to implement heapsort with far fewer
> comparisons (50% asymptotically, 25-40% reduction for realistic sizes)
> than the way it's been done up to now.  And with some care, the code
> ends up smaller, as well.  This is the "big win" patch.
> 
> Patch #3 adds the same sort of indirect call bypass that has been added
> to the net code of late.  The great majority of the callers use the
> builtin swap functions, so replace the indirect call to sort_func with a
> (highly preditable) series of if() statements.  Rather surprisingly,
> this decreased code size, as the swap functions were inlined and their
> prologue & epilogue code eliminated.
> 
> lib/list_sort.c is a bit trickier, as merge sort is already close to
> optimal, and we don't want to introduce triumphs of theory over
> practicality like the Ford-Johnson merge-insertion sort.
> 
> Patch #4, without changing the algorithm, chops 32% off the code size and
> removes the part[MAX_LIST_LENGTH+1] pointer array (and the corresponding
> upper limit on efficiently sortable input size).
> 
> Patch #5 improves the algorithm.  The previous code is already optimal
> for power-of-two (or slightly smaller) size inputs, but when the input
> size is just over a power of 2, there's a very unbalanced final merge.
> 
> There are, in the literature, several algorithms which solve this, but
> they all depend on the "breadth-first" merge order which was replaced
> by commit 835cc0c8477f with a more cache-friendly "depth-first" order.
> Some hard thinking came up with a depth-first algorithm which defers
> merges as little as possible while avoiding bad merges.  This saves
> 0.2*n compares, averaged over all sizes.
> 
> The code size increase is minimal (64 bytes on x86-64, reducing the net
> savings to 26%), but the comments expanded significantly to document
> the clever algorithm.
> 
> 
> TESTING NOTES: I have some ugly user-space benchmarking code
> which I used for testing before moving this code into the kernel.
> Shout if you want a copy.
> 
> I'm running this code right now, with CONFIG_TEST_SORT and
> CONFIG_TEST_LIST_SORT, but I confess I haven't rebooted since
> the last round of minor edits to quell checkpatch.  I figure there
> will be at least one round of comments and final testing.
> 
> George Spelvin (5):
>   lib/sort: Make swap functions more generic
>   lib/sort: Use more efficient bottom-up heapsort variant
>   lib/sort: Avoid indirect calls to built-in swap
>   lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
>   lib/list_sort: Optimize number of calls to comparison function
> 
>  include/linux/list_sort.h |   1 +
>  lib/list_sort.c           | 244 +++++++++++++++++++++++++---------
>  lib/sort.c                | 266 +++++++++++++++++++++++++++++---------
>  3 files changed, 387 insertions(+), 124 deletions(-)
> 
> -- 
> 2.20.1
> 

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-19  8:16   ` [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
@ 2019-03-28 22:08     ` Andrew Morton
  2019-03-29  4:10       ` George Spelvin
  2019-03-29  4:31     ` [PATCH 6/5] lib/list_sort: Fix GCC warning George Spelvin
  1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2019-03-28 22:08 UTC (permalink / raw)
  To: George Spelvin
  Cc: linux-kernel, kernel-janitors, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

On Tue, 19 Mar 2019 08:16:00 +0000 George Spelvin <lkml@sdf.org> wrote:

> Rather than a fixed-size array of pending sorted runs, use the ->prev
> links to keep track of things.  This reduces stack usage, eliminates
> some ugly overflow handling, and reduces the code size.
> 
> Also:
> * merge() no longer needs to handle NULL inputs, so simplify.
> * The same applies to merge_and_restore_back_links(), which is renamed
>   to the less ponderous merge_final().  (It's a static helper function,
>   so we don't need a super-descriptive name; comments will do.)
> * Document the actual return value requirements on the (*cmp)()
>   function; some callers are already using this feature.
> 
> x86-64 code size 1086 -> 739 bytes (-347)
> 
> (Yes, I see checkpatch complaining about no space after comma in
> "__attribute__((nonnull(2,3,4,5)))".  Checkpatch is wrong.)

x86_64 allnoconfig with gcc-7.3.0:

lib/list_sort.c:17:36: warning: __pure__ attribute ignored [-Wattributes]
   struct list_head const *, struct list_head const *);


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-28 22:08     ` Andrew Morton
@ 2019-03-29  4:10       ` George Spelvin
  0 siblings, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-29  4:10 UTC (permalink / raw)
  To: akpm, lkml, lkp, sfr
  Cc: andriy.shevchenko, daniel.wagner, dchinner, don.mullis, geert,
	kernel-janitors, linux-kernel, linux, st5pub

Than you all for the build warning report.

The warning produced by gcc versions 4.9, 7.3, 8.1, whatever version
Stephen Rothwell is running, is:
lib/list_sort.c:17:36: warning: __pure__ attribute ignored [-Wattributes]

The relevant code is:
10: /*
11:  * By declaring the compare function with the __pure attribute, we give
12:  * the compiler more opportunity to optimize.  Ideally, we'd use this in
13:  * the prototype of list_sort(), but that would involve a lot of churn
14:  * at all call sites, so just cast the function pointer passed in.
15:  */
16: typedef int __pure __attribute__((nonnull(2,3))) (*cmp_func)(void *,
17: 		struct list_head const *, struct list_head const *);

As the comment says, the purpose of the __pure attribute is to tell
the compiler that, after a call via a function pointer of this
type, memory is not clobbered and it is not necessary to reload
any cached list pointers.

This is, of course, purely optional and may be deleted harmlessly.
I just checked, and that makes no difference at all to gcc-8 code
generation, so there's no point messing with #ifdef.

There are only two questions: how to update the comment, and how
to submit the fix. I'm thinking of
/*
 * A more accurate type for comparison functions.  Ideally, we'd use
 * this in the prototype of list_sort(), but that would involve a lot of
 * churn at all call sites, so just cast the function pointer passed in.
 *
 * This could also include __pure to give the compiler more opportunity
 * to optimize, but that elicits an "attribute ignored" warning on
 * GCC <= 8.1, and doesn't change GCC 8.3's code generation at all,
 * so it's omitted.
 */

How to submit the fix: Andrew, do you prefer a replacement patch
or a small fix patch?  I'll assume the latter and send it in a few
minutes.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 6/5] lib/list_sort: Fix GCC warning
  2019-03-19  8:16   ` [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
  2019-03-28 22:08     ` Andrew Morton
@ 2019-03-29  4:31     ` George Spelvin
  1 sibling, 0 replies; 17+ messages in thread
From: George Spelvin @ 2019-03-29  4:31 UTC (permalink / raw)
  To: Andrew Morton, Stephen Rothwell, kbuild test robot
  Cc: linux-kernel, kernel-janitors, George Spelvin, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

It turns out that GCC 4.9, 7.3, and 8.1 ignore the __pure
attribute on function pointers and (with the standard kernel
compile flags) emit a warning about it.

Even though it accurately describes a comparison function
(the compiler need not reload cached pointers across the call),
it doesn't actually help GCC 8.3's code generation, so just
omit it.

Signed-off-by: George Spelvin <lkml@sdf.org>
Fixes: 820c81be5237 ("lib/list_sort: simplify and remove MAX_LIST_LENGTH_BITS")
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
---
 lib/list_sort.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index 623a9158ac8a..b1b492e20f1d 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -8,12 +8,16 @@
 #include <linux/list.h>
 
 /*
- * By declaring the compare function with the __pure attribute, we give
- * the compiler more opportunity to optimize.  Ideally, we'd use this in
- * the prototype of list_sort(), but that would involve a lot of churn
- * at all call sites, so just cast the function pointer passed in.
+ * A more accurate type for comparison functions.  Ideally, we'd use
+ * this in the prototype of list_sort(), but that would involve a lot of
+ * churn at all call sites, so just cast the function pointer passed in.
+ *
+ * This could also include __pure to give the compiler more opportunity
+ * to optimize, but that elicits an "attribute ignored" warning on
+ * GCC <= 8.1, and doesn't change GCC 8.3's code generation at all,
+ * so it's omitted.
  */
-typedef int __pure __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
 		struct list_head const *, struct list_head const *);
 
 /*
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2019-03-29  4:34 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-16  2:43 [PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
2019-02-21  6:30 ` [PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
2019-02-21  8:21 ` [PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
2019-02-21  8:21 ` [PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
2019-03-05  3:06 ` [PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
2019-03-05  5:58 ` [PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
2019-03-19  8:15 ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
2019-03-19  8:15   ` [RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic George Spelvin
2019-03-19  8:15   ` [RESEND PATCH v2 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
2019-03-19  8:15   ` [RESEND PATCH v2 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
2019-03-19  8:16   ` [RESEND PATCH v2 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
2019-03-28 22:08     ` Andrew Morton
2019-03-29  4:10       ` George Spelvin
2019-03-29  4:31     ` [PATCH 6/5] lib/list_sort: Fix GCC warning George Spelvin
2019-03-19  8:16   ` [RESEND PATCH v2 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
2019-03-19 10:56   ` [RESEND PATCH v2 0/5] lib/sort & lib/list_sort: faster and smaller Rasmus Villemoes
2019-03-19 14:01   ` Andy Shevchenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).