* [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
@ 2019-02-21  6:30 ` George Spelvin
       [not found]   ` <20190309140653.GO9224@smile.fi.intel.com>
  2019-03-13 21:23   ` Rasmus Villemoes
  2019-02-21  8:21 ` [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 44+ messages in thread
From: George Spelvin @ 2019-02-21  6:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

Rather than u32_swap and u64_swap working on 4- and 8-byte objects
directly, let them handle any multiple of 4 or 8 bytes.  This speeds
up most users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)
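
For example (an illustrative caller, not taken from the kernel tree),
sorting an array of two-word structures now gets the 8-byte swap loop
instead of falling back to the byte-at-a-time generic_swap:

	/* "struct range", "ranges" and "nr" are made-up names */
	struct range {
		u64 start;
		u64 len;
	};				/* 16 bytes, 8-byte aligned */

	static int cmp_range(const void *a, const void *b)
	{
		const struct range *x = a, *y = b;

		return x->start < y->start ? -1 : x->start > y->start;
	}

	/* swap_func == NULL: sort() now picks u64_swap for this array */
	sort(ranges, nr, sizeof(*ranges), cmp_range, NULL);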

x86-64 code size 872 -> 885 bytes (+13)

Signed-off-by: George Spelvin <lkml@sdf.org>
---
 lib/sort.c | 117 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 96 insertions(+), 21 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..dff2ab2e196e 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,110 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * alignment_ok - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+static bool __attribute_const__
+alignment_ok(const void *base, size_t size, unsigned int align)
 {
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
+	unsigned int lsbits = (unsigned int)size;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	(void)base;
+#else
+	lsbits |= (unsigned int)(size_t)base;
+#endif
+	lsbits &= align - 1;
+	return lsbits == 0;
 }
 
+/**
+ * u32_swap - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
 static void u32_swap(void *a, void *b, int size)
 {
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
+	size_t n = size;
+
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
 }
 
+/**
+ * u64_swap - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+
 static void u64_swap(void *a, void *b, int size)
 {
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-	char t;
+	size_t n = size;
 
 	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * generic_swap - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void generic_swap(void *a, void *b, int size)
+{
+	size_t n = size;
+
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
 }
 
 /**
@@ -67,10 +142,10 @@ void sort(void *base, size_t num, size_t size,
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
 	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
+		if (alignment_ok(base, size, 8))
 			swap_func = u64_swap;
+		else if (alignment_ok(base, size, 4))
+			swap_func = u32_swap;
 		else
 			swap_func = generic_swap;
 	}
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 3/5] lib/sort: Avoid indirect calls to built-in swap
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-02-21  6:30 ` [PATCH 1/5] lib/sort: Make swap functions more generic George Spelvin
  2019-02-21  8:21 ` [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
@ 2019-02-21  8:21 ` George Spelvin
  2019-03-05  3:06 ` [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-02-21  8:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

Similar to what's being done in the net code, this takes advantage of
the fact that most invocations use only a few common swap functions, and
replaces indirect calls to them with (highly predictable) conditional
branches.  (The downside, of course, is that if you *do* use a custom
swap function, there are a few additional (highly predictable) conditional
branches on the code path.)

This actually *shrinks* the x86-64 code, because it inlines the various
swap functions inside do_swap, eliding function prologues & epilogues.

x86-64 code size 770 -> 709 bytes (-61)

Signed-off-by: George Spelvin <lkml@sdf.org>
---
 lib/sort.c | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index 2aef4631e7d3..226a8c7e4b9a 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -117,6 +117,33 @@ static void generic_swap(void *a, void *b, int size)
 	} while (n);
 }
 
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define U64_SWAP (swap_func_t)0
+#define U32_SWAP (swap_func_t)1
+#define GENERIC_SWAP (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, int size, swap_func_t swap_func)
+{
+	if (swap_func == U64_SWAP)
+		u64_swap(a, b, size);
+	else if (swap_func == U32_SWAP)
+		u32_swap(a, b, size);
+	else if (swap_func == GENERIC_SWAP)
+		generic_swap(a, b, size);
+	else
+		swap_func(a, b, size);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought.  Non-zero.
@@ -151,10 +178,10 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
  *
- * This function does a heapsort on the given array.  You may provide a
- * swap_func function if you need to do something more than a memory copy
- * (e.g. fix up pointers or auxiliary data), but the built-in swap isn't
- * usually a bottleneck.
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
@@ -174,11 +201,11 @@ void sort(void *base, size_t num, size_t size,
 
 	if (!swap_func) {
 		if (alignment_ok(base, size, 8))
-			swap_func = u64_swap;
+			swap_func = U64_SWAP;
 		else if (alignment_ok(base, size, 4))
-			swap_func = u32_swap;
+			swap_func = U32_SWAP;
 		else
-			swap_func = generic_swap;
+			swap_func = GENERIC_SWAP;
 	}
 
 	/*
@@ -194,7 +221,7 @@ void sort(void *base, size_t num, size_t size,
 		if (a)			/* Building heap: sift down --a */
 			a -= size;
 		else if (n -= size)	/* Sorting: Extract root to --n */
-			swap_func(base, base + n, size);
+			do_swap(base, base + n, size, swap_func);
 		else			/* Sort complete */
 			break;
 
@@ -221,7 +248,7 @@ void sort(void *base, size_t num, size_t size,
 		c = b;			/* Where "a" belongs */
 		while (b != a) {	/* Shift it into place */
 			b = parent(b, lsbit, size);
-			swap_func(base + b, base + c, size);
+			do_swap(base + b, base + c, size, swap_func);
 		}
 	}
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
  2019-02-21  6:30 ` [PATCH 1/5] lib/sort: Make swap functions more generic George Spelvin
@ 2019-02-21  8:21 ` George Spelvin
  2019-03-13 22:29   ` Rasmus Villemoes
  2019-02-21  8:21 ` [PATCH 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-02-21  8:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

This uses fewer comparisons than the previous code (61% as
many for large random inputs), but produces identical results;
it actually performs the exact same series of swap operations.

Standard heapsort, when sifting down, performs two comparisons
per level: One to find the greater child, and a second to see
if the current node should be exchanged with that child.

Bottom-up heapsort observes that it's better to postpone the second
comparison and search for the leaf where -infinity would be sent to,
then search back *up* for the current node's destination.

Since sifting down usually proceeds to the leaf level (that's where
half the nodes are), this does many fewer second comparisons.  That
saves a lot of (expensive since the Spectre mitigations) indirect
function calls.
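
In plain 0-based array indices, the sift-down looks roughly like the
sketch below (the patch itself works in premultiplied byte offsets;
"elem", "a", "i", "n" and "cmp" are illustrative names, not taken from
the patch):

	typedef int elem;	/* illustrative element type */

	static void sift_down(elem *a, size_t i, size_t n,
			      int (*cmp)(const elem *, const elem *))
	{
		size_t j = i, k;

		/* One compare per level: follow the greater child to a leaf */
		while (2*j + 2 < n)
			j = cmp(&a[2*j+1], &a[2*j+2]) >= 0 ? 2*j + 1 : 2*j + 2;
		if (2*j + 2 == n)	/* last leaf has no sibling */
			j = 2*j + 1;

		/* Back up to where a[i] actually belongs */
		while (j > i && cmp(&a[i], &a[j]) >= 0)
			j = (j - 1) / 2;

		/* Rotate a[i] into place via swaps along the path, as the patch does */
		for (k = j; j > i; ) {
			elem t;

			j = (j - 1) / 2;
			t = a[j]; a[j] = a[k]; a[k] = t;
		}
	}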

The one time it's worse than the previous code is if there are large
numbers of duplicate keys, when the top-down algorithm is O(n) and
bottom-up is O(n log n).  For distinct keys, it's provably always better.

(The code is not significantly more complex.  This patch also merges
the heap-building and -extracting sift-down loops, resulting in a
net code size savings.)

x86-64 code size 885 -> 770 bytes (-115)

(I see the checkpatch complaint about "else if (n -= size)".
The alternative is significantly uglier.)

Signed-off-by: George Spelvin <lkml@sdf.org>
---
 lib/sort.c | 102 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 27 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index dff2ab2e196e..2aef4631e7d3 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -117,6 +117,32 @@ static void generic_swap(void *a, void *b, int size)
 	} while (n);
 }
 
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = i/size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * size has a least significant bit.  That bit will be clear if i is
+ * an even multiple of size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
 /**
  * sort - sort an array of elements
  * @base: pointer to data to sort
@@ -125,21 +151,26 @@ static void generic_swap(void *a, void *b, int size)
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.  You may provide a
+ * swap_func function if you need to do something more than a memory copy
+ * (e.g. fix up pointers or auxiliary data), but the built-in swap isn't
+ * usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
+	size_t n = num * size, a = (num/2) * size;
+	unsigned const lsbit = size & -size;	/* Used to find parent */
+
+	if (!n)
+		return;
 
 	if (!swap_func) {
 		if (alignment_ok(base, size, 8))
@@ -150,30 +181,47 @@ void sort(void *base, size_t num, size_t size,
 			swap_func = generic_swap;
 	}
 
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
 
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			swap_func(base, base + n, size);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = cmp_func(base + c, base + d) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && cmp_func(base + a, base + b) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			swap_func(base + b, base + c, size);
 		}
 	}
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (2 preceding siblings ...)
  2019-02-21  8:21 ` [PATCH 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
@ 2019-03-05  3:06 ` George Spelvin
  2019-03-10 21:54   ` Rasmus Villemoes
  2019-03-14  9:10   ` Andy Shevchenko
  2019-03-05  5:58 ` [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
  2019-03-15 19:54 ` [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller Andrey Abramov
  5 siblings, 2 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-05  3:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

Rather than a fixed-size array of pending sorted runs, use the ->prev
links to keep track of things.  This reduces stack usage, eliminates
some ugly overflow handling, and reduces the code size.

Also:
* merge() no longer needs to handle NULL inputs, so simplify.
* The same applies to merge_and_restore_back_links(), which is renamed
  to the less ponderous merge_final().  (It's a static helper function,
  so we don't need a super-descriptive name; comments will do.)
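
For illustration (not part of the patch): with at least seven input
elements, after the first six have been moved to pending we have
count == 6 (binary 110), and pending holds one sorted sublist per set
bit of count, newest first:

	pending -> [2-element run] --prev--> [4-element run] --prev--> NULL

Each run is a null-terminated singly linked list chained through ->next;
only the head node of each run carries a meaningful ->prev.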

x86-64 code size 1086 -> 740 bytes (-346)

(Yes, I see checkpatch complaining about no space after comma in
"__attribute__((nonnull(2,3,4,5)))".  Checkpatch is wrong.)

Signed-off-by: George Spelvin <lkml@sdf.org>
---
 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 152 ++++++++++++++++++++++++--------------
 2 files changed, 96 insertions(+), 57 deletions(-)

diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956e848d..20f178c24e9d 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,6 +6,7 @@
 
 struct list_head;
 
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 	       int (*cmp)(void *priv, struct list_head *a,
 			  struct list_head *b));
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 85759928215b..e4819ef0426b 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,33 +7,47 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-#define MAX_LIST_LENGTH_BITS 20
+/*
+ * By declaring the compare function with the __pure attribute, we give
+ * the compiler more opportunity to optimize.  Ideally, we'd use this in
+ * the prototype of list_sort(), but that would involve a lot of churn
+ * at all call sites, so just cast the function pointer passed in.
+ */
+typedef int __pure __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+		struct list_head const *, struct list_head const *);
 
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
-static struct list_head *merge(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
 				struct list_head *a, struct list_head *b)
 {
-	struct list_head head, *tail = &head;
+	struct list_head *head, **tail = &head;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
-			tail->next = a;
+		if (cmp(priv, a, b) <= 0) {
+			*tail = a;
+			tail = &a->next;
 			a = a->next;
+			if (!a) {
+				*tail = b;
+				break;
+			}
 		} else {
-			tail->next = b;
+			*tail = b;
+			tail = &b->next;
 			b = b->next;
+			if (!b) {
+				*tail = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a?:b;
-	return head.next;
+	return head;
 }
 
 /*
@@ -43,44 +57,52 @@ static struct list_head *merge(void *priv,
  * prev-link restoration pass, or maintaining the prev links
  * throughout.
  */
-static void merge_and_restore_back_links(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
-				struct list_head *head,
-				struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
 	u8 count = 0;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
+		if (cmp(priv, a, b) <= 0) {
 			tail->next = a;
 			a->prev = tail;
+			tail = a;
 			a = a->next;
+			if (!a)
+				break;
 		} else {
 			tail->next = b;
 			b->prev = tail;
+			tail = b;
 			b = b->next;
+			if (!b) {
+				b = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a ? : b;
 
+	/* Finish linking remainder of list b on to tail */
+	tail->next = b;
 	do {
 		/*
-		 * In worst cases this loop may run many iterations.
+		 * If the merge is highly unbalanced (e.g. the input is
+		 * already sorted), this loop may run many iterations.
 		 * Continue callbacks to the client even though no
 		 * element comparison is needed, so the client's cmp()
 		 * routine can invoke cond_resched() periodically.
 		 */
-		if (unlikely(!(++count)))
-			(*cmp)(priv, tail->next, tail->next);
-
-		tail->next->prev = tail;
-		tail = tail->next;
-	} while (tail->next);
+		if (unlikely(!++count))
+			cmp(priv, b, b);
+		b->prev = tail;
+		tail = b;
+		b = b->next;
+	} while (b);
 
+	/* And the final links to make a circular doubly-linked list */
 	tail->next = head;
 	head->prev = tail;
 }
@@ -91,55 +113,71 @@ static void merge_and_restore_back_links(void *priv,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * This function implements a bottom-up merge sort, which has O(nlog(n))
+ * complexity.  We use depth-first order to take advantage of cacheing.
+ * (I.e. when we get to the fourth element, we immediately merge the
+ * first two 2-element lists.)
  *
  * The comparison function @cmp must return a negative value if @a
  * should sort before @b, and a positive value if @a should sort after
  * @b. If @a and @b are equivalent, and their original relative
  * ordering is to be preserved, @cmp must return 0.
+ *
+ * (Actually, it is always called with @a being the element which was
+ * originally first, so it is not necessary to distinguish the @a < @b
+ * and @a == @b cases; the return value may be a simple boolean.  But if
+ * you ever *use* this freedom, be sure to update this comment to document
+ * that code now depends on preserving this property!)
  */
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 		int (*cmp)(void *priv, struct list_head *a,
 			struct list_head *b))
 {
-	struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
-						-- last slot is a sentinel */
-	int lev;  /* index into part[] */
-	int max_lev = 0;
-	struct list_head *list;
+	struct list_head *list = head->next, *pending = NULL;
+	size_t count = 0;	/* Count of pending */
 
-	if (list_empty(head))
+	if (list == head->prev)	/* Zero or one elements */
 		return;
 
-	memset(part, 0, sizeof(part));
-
+	/* Convert to a null-terminated singly-linked list. */
 	head->prev->next = NULL;
-	list = head->next;
 
-	while (list) {
+	/*
+	 * Data structure invariants:
+	 * - All lists are singly linked and null-terminated; prev
+	 *   pointers are not maintained.
+	 * - pending is a prev-linked "list of lists" of sorted
+	 *   sublists awaiting further merging.
+	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 * - All of the sorted sublists are power-of-two in size,
+	 *   corresponding to bits set in "count".
+	 */
+	do {
+		size_t bit;
 		struct list_head *cur = list;
+
+		/* Extract the head of "list" as a single-element list "cur" */
 		list = list->next;
 		cur->next = NULL;
 
-		for (lev = 0; part[lev]; lev++) {
-			cur = merge(priv, cmp, part[lev], cur);
-			part[lev] = NULL;
+		/* Do merges corresponding to set lsbits in count */
+		for (bit = 1; count & bit; bit <<= 1) {
+			cur = merge(priv, (cmp_func)cmp, pending, cur);
+			pending = pending->prev;  /* Untouched by merge() */
 		}
-		if (lev > max_lev) {
-			if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
-				printk_once(KERN_DEBUG "list too long for efficiency\n");
-				lev--;
-			}
-			max_lev = lev;
-		}
-		part[lev] = cur;
+		/* And place the result at the head of "pending" */
+		cur->prev = pending;
+		pending = cur;
+		count++;
+	} while (list->next);
+
+	/* Now merge together last element with all pending lists */
+	while (pending->prev) {
+		list = merge(priv, (cmp_func)cmp, pending, list);
+		pending = pending->prev;
 	}
-
-	for (lev = 0; lev < max_lev; lev++)
-		if (part[lev])
-			list = merge(priv, cmp, part[lev], list);
-
-	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+	/* The final merge, rebuilding prev links */
+	merge_final(priv, (cmp_func)cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (3 preceding siblings ...)
  2019-03-05  3:06 ` [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
@ 2019-03-05  5:58 ` George Spelvin
  2019-03-13 23:28   ` Rasmus Villemoes
  2019-03-15 19:54 ` [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller Andrey Abramov
  5 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-05  5:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

CONFIG_RETPOLINE has severely degraded indirect function call
performance, so it's worth putting some effort into reducing
the number of times cmp() is called.

This patch avoids badly unbalanced merges on unlucky input sizes.
It slightly increases the code size, but saves an average of 0.2*n
calls to cmp().

x86-64 code size 740 -> 820 bytes (+80)

Unfortunately, there's not a lot of low-hanging fruit in a merge sort;
it already performs only n*log2(n) - K*n + O(1) compares.  The leading
coefficient is already the lowest theoretically possible (log2(n!)
corresponds to K=1.4427), so we're fighting over the linear term, and
the best mergesort can do is K=1.2645, achieved when n is a power of 2.

The differences between mergesort variants appear when n is *not*
a power of 2; K is a function of the fractional part of log2(n).
Top-down mergesort does best of all, achieving a minimum K=1.2408, and
an average (over all sizes) K=1.248.  However, that requires knowing the
number of entries to be sorted ahead of time, and making a full pass
over the input to count it conflicts with a second performance goal,
which is cache blocking.

Obviously, we have to read the entire list into L1 cache at some point,
and performance is best if it fits.  But if it doesn't fit, each full
pass over the input causes a cache miss per element, which is undesirable.

While textbooks explain bottom-up mergesort as a succession of merging
passes, practical implementations do merging in depth-first order:
as soon as two lists of the same size are available, they are merged.
This allows as many merge passes as possible to fit into L1; only the
final few merges force cache misses.

This cache-friendly depth-first merge order depends on us merging the
beginning of the input as much as possible before we've even seen the
end of the input (and thus know its size).

The simple eager merge pattern causes bad performance when n is just
over a power of 2.  If n=1028, the final merge is between 1024- and
4-element lists, which is wasteful of comparisons.  (This is actually
worse on average than n=1025, because a 1024:1 merge will, on average,
end after 512 compares, while 1024:4 will walk 4/5 of the list.)

Because of this, bottom-up mergesort achieves K < 0.5 for such sizes,
and has an average (over all sizes) K of around 1.  (My experiments
show K=1.01, while theory predicts K=0.965.)

There are "worst-case optimal" variants of bottom-up mergesort which
avoid this bad performance, but the algorithms given in the literature,
such as queue-mergesort and boustrophedonic mergesort, depend on the
breadth-first multi-pass structure that we are trying to avoid.

This implementation is as eager as possible while ensuring that all merge
passes are at worst 1:2 unbalanced.  This achieves the same average
K=1.207 as queue-mergesort, which is 0.2*n better than bottom-up, and
only 0.04*n behind top-down mergesort.

Specifically, it merges two lists of size 2^k as soon as it is known
that there are 2^k additional inputs following.  This ensures that the
final uneven merges triggered by reaching the end of the input will be
at worst 2:1.  This will avoid cache misses as long as 3*2^k elements
fit into the cache.
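
A short worked trace (not from the patch itself): as count runs from 1
to 12, the pending sublist sizes (newest first) after each element is
added are

	1;  1,1;  1,2;  1,1,2;  1,2,2;  1,1,4;  1,2,4;  1,1,2,4;
	1,2,2,4;  1,1,4,4;  1,2,4,4;  1,1,2,8;  ...

so two lists of size 2^k are merged only once 2^k further elements
(counting the one added in the same step) have accumulated behind them.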

(I confess to being more than a little bit proud of how clean this
code turned out.  It took a lot of thinking, but the resultant inner
loop is very simple and efficient.)

Refs:
  Bottom-up Mergesort: A Detailed Analysis
  Wolfgang Panny, Helmut Prodinger
  Algorithmica 14(4):340--354, October 1995
  https://doi.org/10.1007/BF01294131
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260

  The cost distribution of queue-mergesort, optimal mergesorts, and
  power-of-two rules
  Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen
  Journal of Algorithms 30(2); Pages 423--448, February 1999
  https://doi.org/10.1006/jagm.1998.0986
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380

  Queue-Mergesort
  Mordecai J. Golin, Robert Sedgewick
  Information Processing Letters, 48(5):253--259, 10 December 1993
  https://doi.org/10.1016/0020-0190(93)90088-q
  https://sci-hub.tw/10.1016/0020-0190(93)90088-Q

Signed-off-by: George Spelvin <lkml@sdf.org>
---
 lib/list_sort.c | 111 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 23 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index e4819ef0426b..06df9b283c40 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -113,11 +113,6 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements a bottom-up merge sort, which has O(nlog(n))
- * complexity.  We use depth-first order to take advantage of cacheing.
- * (I.e. when we get to the fourth element, we immediately merge the
- * first two 2-element lists.)
- *
  * The comparison function @cmp must return a negative value if @a
  * should sort before @b, and a positive value if @a should sort after
  * @b. If @a and @b are equivalent, and their original relative
@@ -128,6 +123,57 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * and @a == @b cases; the return value may be a simple boolean.  But if
  * you ever *use* this freedom, be sure to update this comment to document
  * that code now depends on preserving this property!)
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache.  Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists.  This is beautifully simple code, but rather subtle.
+ *
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0.  Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ *   is count >= 2^(k+1)).
+ * There are six states we distinguish.  "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
+ * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * Note in particular that just before incrementing from state 5 to
+ * state 2, all lower-order bits are 11 (state 3) so there is one list
+ * of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest.  If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
 __attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
@@ -150,32 +196,51 @@ void list_sort(void *priv, struct list_head *head,
 	 * - pending is a prev-linked "list of lists" of sorted
 	 *   sublists awaiting further merging.
 	 * - Sublists are sorted by size and age, smallest & newest at front.
-	 * - All of the sorted sublists are power-of-two in size,
-	 *   corresponding to bits set in "count".
+	 * - All of the sorted sublists are power-of-two in size.
+	 * - There are zero to two sublists of each size.
+	 * - A pair of pending sublists are merged as soon as the number
+	 *   of following pending elements equals their size.  That ensures
+	 *   each later final merge will be at worst 2:1.
+	 * - Each round consists of:
+	 *   - Merging the two sublists selected by the highest bit
+	 *     which flips when count is incremented, and
+	 *   - Adding an element from the input as a size-1 sublist.
 	 */
 	do {
 		size_t bit;
-		struct list_head *cur = list;
+		struct list_head **tail = &pending;
 
-		/* Extract the head of "list" as a single-element list "cur" */
-		list = list->next;
-		cur->next = NULL;
+		/* Find the least-significant clear bit in count */
+		for (bit = 1; count & bit; bit <<= 1)
+			tail = &(*tail)->prev;
+		/* Do the indicated merge */
+		if (likely(count > bit)) {
+			struct list_head *a = *tail, *b = a->prev;
 
-		/* Do merges corresponding to set lsbits in count */
-		for (bit = 1; count & bit; bit <<= 1) {
-			cur = merge(priv, (cmp_func)cmp, pending, cur);
-			pending = pending->prev;  /* Untouched by merge() */
+			a = merge(priv, (cmp_func)cmp, a, b);
+			/* Install the result in place of the lists */
+			a->prev = b->prev;
+			*tail = a;
 		}
-		/* And place the result at the head of "pending" */
-		cur->prev = pending;
-		pending = cur;
-		count++;
-	} while (list->next);
 
-	/* Now merge together last element with all pending lists */
-	while (pending->prev) {
+		/* Move one element from input list to pending */
+		list->prev = pending;
+		pending = list;
+		list = list->next;
+		pending->next = NULL;
+		count++;
+	} while (list);
+
+	/* End of input; merge together all the pending lists. */
+	list = pending;
+	pending = pending->prev;
+	for (;;) {
+		struct list_head *next = pending->prev;
+
+		if (!next)
+			break;
 		list = merge(priv, (cmp_func)cmp, pending, list);
-		pending = pending->prev;
+		pending = next;
 	}
 	/* The final merge, rebuilding prev links */
 	merge_final(priv, (cmp_func)cmp, head, pending, list);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller
@ 2019-03-09  2:17 George Spelvin
  2019-02-21  6:30 ` [PATCH 1/5] lib/sort: Make swap functions more generic George Spelvin
                   ` (5 more replies)
  0 siblings, 6 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-09  2:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: George Spelvin, Andrew Morton, Andrey Abramov,
	Geert Uytterhoeven, Daniel Wagner, Rasmus Villemoes, Don Mullis,
	Dave Chinner, Andy Shevchenko

Because CONFIG_RETPOLINE has made indirect calls much more expensive,
I thought I'd try to reduce the number made by the library sort
functions.

The first three patches apply to lib/sort.c.

Patch #1 is a simple optimization.  The built-in swap has rarely-used
special cases for aligned 4- and 8-byte objects.  But that case almost
never happens; most calls to sort() work on larger structures, which
fall back to the byte-at-a-time loop.  This generalizes them to aligned
*multiples* of 4 and 8 bytes.  (If nothing else, it saves an awful lot
of energy by not thrashing the store buffers as much.)

(Issue for discussion: should the special-case swap loops be reduced to
two, an aligned-word and a generic byte version?)

Patch #2 grabs a juicy piece of low-hanging fruit.  I agree that
nice simple solid heapsort is preferable to more complex algorithms
(sorry, Andrey), but it's possible to implement heapsort with 40% fewer
comparisons than the way it's been done up to now.  And with some care,
the code ends up smaller, as well.  This is the "big win" patch.

Patch #3 adds the same sort of indirect call bypass that has been added
to the net code of late.  The great majority of the callers use the
builtin swap functions, so replace the indirect call to swap_func with a
(highly predictable) series of if() statements.  Rather surprisingly,
this decreased code size, as the swap functions were inlined and their
prologue & epilogue code eliminated.

lib/list_sort.c is a bit trickier, as merge sort is already close to
optimal, and we don't want to introduce triumphs of theory over
practicality like the Ford-Johnson merge-insertion sort.

Patch #4, without changing the algorithm, chops 32% off the code size and
removes the part[MAX_LIST_LENGTH_BITS+1] pointer array (and the corresponding
upper limit on efficiently sortable input size).

Patch #5 improves the algorithm.  The previous code is already optimal
for power-of-two (or slightly smaller) size inputs, but when the input
size is just over a power of 2, there's a very unbalanced final merge.

There are, in the literature, several algorithms which solve this, but
they all depend on the "breadth-first" merge order which was replaced
by commit 835cc0c8477f with a more cache-friendly "depth-first" order.
Some hard thinking came up with a depth-first algorithm which defers
merges as little as possible while avoiding bad merges.  This saves
0.2*n compares, averaged over all sizes.

The code size increase is minimal (80 bytes on x86-64, reducing the net
savings to 24%), but the comments expanded significantly to document
the clever algorithm.


TESTING NOTES: I have some ugly user-space benchmarking code
which I used for testing before moving this code into the kernel.
Shout if you want a copy.

I'm running this code right now, with CONFIG_TEST_SORT and
CONFIG_TEST_LIST_SORT, but I confess I haven't rebooted since
the last round of minor edits to quell checkpatch.  I figure there
will be at least one round of comments and final testing.

George Spelvin (5):
  lib/sort: Make swap functions more generic
  lib/sort: Use more efficient bottom-up heapsort variant
  lib/sort: Avoid indirect calls to built-in swap
  lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  lib/list_sort: Optimize number of calls to comparison function

 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 225 ++++++++++++++++++++++++----------
 lib/sort.c                | 250 ++++++++++++++++++++++++++++++--------
 3 files changed, 365 insertions(+), 111 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
       [not found]   ` <20190309140653.GO9224@smile.fi.intel.com>
@ 2019-03-09 15:53     ` lkml
  2019-03-09 20:19       ` Andrey Abramov
  2019-03-14  9:29       ` Andy Shevchenko
  2019-03-09 21:02     ` George Spelvin
  1 sibling, 2 replies; 44+ messages in thread
From: lkml @ 2019-03-09 15:53 UTC (permalink / raw)
  To: andriy.shevchenko, lkml
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, st5pub

Thank you for the feedback!

Andy Shevchenko wrote:
> On Thu, Feb 21, 2019 at 06:30:28AM +0000, George Spelvin wrote:
>> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> 
> Why #ifdef is better than if (IS_ENABLED()) ?

Because CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is bool and not
tristate.  IS_ENABLED tests for 'y' or 'm' but we don't need it
for something that's only on or off.

Looking through the kernel, I see both, but #ifdef or #if defined()
are definitely in the majority:

lib/lzo/lzo1x_compress.c:#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(LZO_USE_CTZ64)
lib/siphash.c:#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
lib/string.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
lib/strncpy_from_user.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
lib/zlib_inflate/inffast.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

I see a few IS_ENABLED uses in include/crypto/ and kernel/bpf/.

It makes no real difference; #ifdef is simpler to me.
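
(For reference, the IS_ENABLED() form under discussion would be roughly

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
		lsbits |= (unsigned int)(size_t)base;

and the compiler folds the constant condition away either way, so it
really is only a question of style.)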

>> +	(void)base;
> 
> I understand the possible weirdness of the case, but 0 pointer is also good, no?

I don't quite understand the question.  A NULL pointer is aligned
as far as alignment_ok is concerned.  In other words, word accesses to
*NULL will (not) work just as well as byte accesses.

None of this matters because the callers never pass in NULL pointers.

>> +#else
>> +	lsbits |= (unsigned int)(size_t)base;
> 
> Perhaps you meant simple (uintptr_t) here and maybe above as well.

No, I meant to truncate it to the same type as "align".  We only
care about the low few bits; I could have used (unsigned char) just
as well.

My reason for choosing unsigned int rather than unsigned char was
that both x86 and ARM are a tiny bit more efficient at 32-bit
operations (x86 avoids a REX prefix; ARM gates off the high half
of the ALU to save power), but only x86 does byte operations
natively.

Still, compilers know how to promote to word operations, so
maybe using unsigned char would make it clearer to the
reader that "yes, I meant to do that!".

I've changed it to:

static bool __attribute_const__
is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	(void)base;
#else
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}

Although I'm thinking of:

static bool __attribute_const__
is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}

Any preference?

>> +#endif
>> +	lsbits &= align - 1;
>> +	return lsbits == 0;
> 
> and this can be one return operation.

It was just to avoid some annoying parentheses because C's precedence
rules and GCC's warnings are fighting me here.  If others prefer
"return (lsbits & (align - 1)) == 0;" I'm happy to change.

>>  static void u32_swap(void *a, void *b, int size)
> 
> For such primitives that operates on top of an arrays we usually append 's' to
> the name. Currently the name is misleading.
> 
> Perhaps u32s_swap().

I don't worry much about the naming of static helper functions.
If they were exported, it would be a whole lot more important!

I find "u32s" confusing; I keep reading the "s" as "signed" rather
than a plural.

How about one of:
swap_bytes / swap_ints / swap_longs
swap_1 / swap_4 / swap_8

> Shouldn't simple memcpy cover these case for both 32- and 64-bit architectures?

This isn't a memcpy, it's a memory *swap*.  To do it with memcpy
requires:
	memcpy(temp_buffer, a, size);
	memcpy(a, b, size);
	memcpy(b, temp_buffer, size);

This is 1.5x as much memory access, and you have to find a large
enough temp_buffer.  (I didn't think a variable-length array on
the stack would make people happy.)

Also, although it is a predictable branch, memcpy() has to check the
alignment of its inputs each call.  The reason for these helpers is
to factor that out.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-09 15:53     ` lkml
@ 2019-03-09 20:19       ` Andrey Abramov
  2019-03-14  9:29       ` Andy Shevchenko
  1 sibling, 0 replies; 44+ messages in thread
From: Andrey Abramov @ 2019-03-09 20:19 UTC (permalink / raw)
  To: lkml, andriy.shevchenko
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel, linux

> Although I'm thinking of:
>
> static bool __attribute_const__
> is_aligned(const void *base, size_t size, unsigned char align)
> {
> 	unsigned char lsbits = (unsigned char)size;
>
> 	(void)base;
> #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> 	lsbits |= (unsigned char)(uintptr_t)base;
> #endif
> 	return (lsbits & (align - 1)) == 0;
> }
>
> Any preference?
I think it would be better.

> I find "u32s" confusing; I keep reading the "s" as "signed" rather
> than a plural.
>
> How about one of:
> swap_bytes / swap_ints / swap_longs
> swap_1 / swap_4 / swap_8
In my opinion "swap_bytes / swap_ints / swap_longs" are the most readable.

(Good job)

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
       [not found]   ` <20190309140653.GO9224@smile.fi.intel.com>
  2019-03-09 15:53     ` lkml
@ 2019-03-09 21:02     ` George Spelvin
  1 sibling, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-09 21:02 UTC (permalink / raw)
  To: andriy.shevchenko, lkml
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, st5pub

Andy Shevchenko wrote:
> Shouldn't simple memcpy cover these case for both 32- and 64-bit architectures?

Speaking of replacing swap with copying via temporary buffers, one
idea that did come to mind was avoiding swap for sufficiently small
objects.

Every sift-down is actually a circular shift.  Once the target
position has been found, we rotate the path from the root to the
target up one, with the target filled in by the previous root.

(When breaking down the heap, there's one additional link in the
cycle, where the previous root goes to the end of the heap and the
end of the heap goes to the target.)

If we had a temporary buffer (128 bytes would handle most things),
we could copy the previous root there and *copy*, rather than swap,
the elements on the path to the target up, then finally copy the
previous root to the target.

However, to rotate up, this must be done in top-down order.
The way the code currently works with premultiplied offsets, it's
easy to traverse bottom-up, but very awkward to retrace the
top-down path.

The only solution I can think of is to build a bitmap of the
left/right turnings from the root to the leaf, and then back it up
to the target.  There are a few ways to do this:

1) The obvious big-endian order.  The bitmap is simply the 1-based
   position of the leaf. To add a level, shift left and add the new
   bit at the bottom.  To back up a step, shift right.

   To retrace, create a 1-bit mask equal to the msbit of the index
   ((smear_right(x) >> 1) + 1) and repeatedly shift the mask right.

2) The reverse order.  We use a 1-bit mask while building the
   bitmap, and while retracing, just examine the lsbit while shifting
   the bitmap right.

3) As option 1, but don't build the bitmap as we're walking down;
   rather reconstruct it from the premultiplied offset using
   reciprocal_divide().
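
For concreteness, a rough sketch of option 1 in the heap-building case,
using 1-based array indices rather than the premultiplied byte offsets
the real code uses ("elem_t", "heap" and "target" are illustrative, and
in the kernel __fls() would stand in for the builtin):

	elem_t tmp = heap[1];		/* the displaced root */

	if (target > 1) {
		/* start at the bit just below target's leading 1 */
		size_t mask = 1ULL << (63 - __builtin_clzll(target) - 1);
		size_t k = 1;

		while (k != target) {
			/* bit set: go right; bit clear: go left */
			size_t child = 2*k + ((target & mask) != 0);

			heap[k] = heap[child];	/* copy the child up one level */
			k = child;
			mask >>= 1;
		}
	}
	heap[target] = tmp;		/* old root lands at the target */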

Nothing really jumps out to me as The Right Way to do it.

I don't want to bloat the code to the point that it would be
easier to implement a different algorithm entirely.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-05  3:06 ` [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
@ 2019-03-10 21:54   ` Rasmus Villemoes
  2019-03-10 22:29     ` George Spelvin
  2019-03-14  9:10   ` Andy Shevchenko
  1 sibling, 1 reply; 44+ messages in thread
From: Rasmus Villemoes @ 2019-03-10 21:54 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: Andrew Morton, Andrey Abramov, Geert Uytterhoeven, Daniel Wagner,
	Rasmus Villemoes, Don Mullis, Dave Chinner, Andy Shevchenko

On 05/03/2019 04.06, George Spelvin wrote:

>   * The comparison function @cmp must return a negative value if @a
>   * should sort before @b, and a positive value if @a should sort after
>   * @b. If @a and @b are equivalent, and their original relative
>   * ordering is to be preserved, @cmp must return 0.
> + *
> + * (Actually, it is always called with @a being the element which was
> + * originally first, so it is not necessary to distinguish the @a < @b
> + * and @a == @b cases; the return value may be a simple boolean.  But if
> + * you ever *use* this freedom, be sure to update this comment to document
> + * that code now depends on preserving this property!)
>   */

This was and still is used at least by the block layer, and likely
others as well. While 3110fc79606fb introduced a bunch of if() return -1
else if () ... stuff, it still ends with a 0/1 result. Before
3110fc79606fb, it was even more obvious that this property was used. So
I agree that it is worth documenting this feature, both for users of
list_sort and, even more so, for future refactorers of it - but you
probably want to change the wording somewhat.

Grepping around shows that this could probably be used in more places,
gaining a cycle or two per cmp callback, e.g. xfs_buf_cmp. But that's of
course outside the scope of this series.

Rasmus

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-10 21:54   ` Rasmus Villemoes
@ 2019-03-10 22:29     ` George Spelvin
  0 siblings, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-10 22:29 UTC (permalink / raw)
  To: linux-kernel, linux, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

Rasmus Villemoes wrote:
> On 05/03/2019 04.06, George Spelvin wrote:
>> + * (Actually, it is always called with @a being the element which was
>> + * originally first, so it is not necessary to distinguish the @a < @b
>> + * and @a == @b cases; the return value may be a simple boolean.  But if
>> + * you ever *use* this freedom, be sure to update this comment to document
>> + * that code now depends on preserving this property!)
>
> This was and still is used at least by the block layer, and likely
> others as well. While 3110fc79606fb introduced a bunch of if() return -1
> else if () ... stuff, it still ends with a 0/1 result. Before
> 3110fc79606fb, it was even more obvious that this property was used.

Ah, thank you!  I actually read through every list_sort caller in
the kernel to see if I could find anywhere that used it and couldn't,
but I didn't study this code carefully enough to see that it does
in the last step.

Since someone *does* use this, I'll change the comment significantly.

> Grepping around shows that this could probably be used in more places,
> gaining a cycle or two per cmp callback, e.g. xfs_buf_cmp. But that's of
> course outside the scope of this series.

The one that misled me at first was _xfs_buf_obj_cmp, which returns 0/1,
but that's not used by list_sort().  xfs_buf_cmp returns -1/0/+1.

As you might see from the comment around the cmp_func typedef,
there are other things that could be cleaned up if we did a pass
over all the call sites.

(I'm almost tempted to tell the compiler that cmp_func is const,
since it's supposed to be independent of the pointer frobbing that
list_sort does, but then I remember Henry Spencer's maxim about
lying to the compiler.)

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-02-21  6:30 ` [PATCH 1/5] lib/sort: Make swap functions more generic George Spelvin
       [not found]   ` <20190309140653.GO9224@smile.fi.intel.com>
@ 2019-03-13 21:23   ` Rasmus Villemoes
  2019-03-13 22:02     ` Geert Uytterhoeven
  2019-03-13 23:15     ` George Spelvin
  1 sibling, 2 replies; 44+ messages in thread
From: Rasmus Villemoes @ 2019-03-13 21:23 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: Andrew Morton, Andrey Abramov, Geert Uytterhoeven, Daniel Wagner,
	Don Mullis, Dave Chinner, Andy Shevchenko

On 21/02/2019 07.30, George Spelvin wrote:
> Rather than u32_swap and u64_swap working on 4- and 8-byte objects
> directly, let them handle any multiple of 4 or 8 bytes.  This speeds
> up most users of sort() by avoiding fallback to the byte copy loop.
> 
> Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
> claims, very few users of sort() sort pointers (or pointer-sized
> objects); most sort structures containing at least two words.
> (E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
> struct acpi_fan_fps.)
> 
> x86-64 code size 872 -> 885 bytes (+8)
> 
> Signed-off-by: George Spelvin <lkml@sdf.org>
> ---
>  lib/sort.c | 117 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 96 insertions(+), 21 deletions(-)
> 
> diff --git a/lib/sort.c b/lib/sort.c
> index d6b7a202b0b6..dff2ab2e196e 100644
> --- a/lib/sort.c
> +++ b/lib/sort.c
> @@ -11,35 +11,110 @@
>  #include <linux/export.h>
>  #include <linux/sort.h>
>  
> -static int alignment_ok(const void *base, int align)
> +/**
> + * alignment_ok - is this pointer & size okay for word-wide copying?
> + * @base: pointer to data
> + * @size: size of each element
> + * @align: required aignment (typically 4 or 8)
> + *

typo aLignment

> + * Returns true if elements can be copied using word loads and stores.
> + * The size must be a multiple of the alignment, and the base address must
> + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
> + *

I wonder if we shouldn't unconditionally require the base to be aligned.
It of course depends on what exactly 'efficient' means, but if the base
is not aligned, we know that every single load and store will be
unaligned, and we're doing lots of them. IIRC even on x86 it's usually
two cycles instead of one, and even more when we cross a cache line
boundary (which we do rather often - e.g. in your 40 byte example,
almost every swap will hit at least one). One could also have some data
that is naturally 4-byte aligned with an element size that happens to be
a multiple of 8 bytes; it would be a bit sad to use 8-byte accesses when
4-byte would all have been aligned.

> + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
> + * to "if ((a | b) & mask)", so we do that by hand.
> + */
> +static bool __attribute_const__
> +alignment_ok(const void *base, size_t size, unsigned int align)
>  {
> -	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
> -		((unsigned long)base & (align - 1)) == 0;
> +	unsigned int lsbits = (unsigned int)size;

Drop that cast.

> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	(void)base;
> +#else
> +	lsbits |= (unsigned int)(size_t)base;

The kernel usually casts pointers to long or unsigned long. If some
oddball arch has size_t something other than unsigned long, this would
give a "warning: cast from pointer to integer of different size". So
just cast to unsigned long, and drop the cast to unsigned int.

If you do drop CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, this all becomes
much simpler, of course

lsbits = size | (unsigned long)base;

> +#endif
> +	lsbits &= align - 1;
> +	return lsbits == 0;
>  }
>  
> +/**
> + * u32_swap - swap two elements in 32-bit chunks
> + * @a, @b: pointers to the elements
> + * @size: element size (must be a multiple of 4)
> + *
> + * Exchange the two objects in memory.  This exploits base+index addressing,
> + * which basically all CPUs have, to minimize loop overhead computations.
> + *
> + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
> + * bottom of the loop, even though the zero flag is stil valid from the
> + * subtract (since the intervening mov instructions don't alter the flags).
> + * Gcc 8.1.0 doesn't have that problem.
> + */
>  static void u32_swap(void *a, void *b, int size)
>  {
> -	u32 t = *(u32 *)a;
> -	*(u32 *)a = *(u32 *)b;
> -	*(u32 *)b = t;
> +	size_t n = size;
> +

Since the callback has int in its prototype (we should fix that globally
at some point...), might as well use a four-byte local variable, to
avoid rex prefix on x86-64. So just make that "unsigned" or "u32".

> +	do {
> +		u32 t = *(u32 *)(a + (n -= 4));
> +		*(u32 *)(a + n) = *(u32 *)(b + n);
> +		*(u32 *)(b + n) = t;
> +	} while (n);
>  }

Hm, are we absolutely sure there's no weird .config where some struct
ends up being empty, yet we still sort an array of them? It's far
fetched, of course, but perhaps an "if (unlikely(!size)) return;" at the
start of sort() would be in order. Dunno, I'll leave it to you.

Rasmus

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-13 21:23   ` Rasmus Villemoes
@ 2019-03-13 22:02     ` Geert Uytterhoeven
  2019-03-13 23:15     ` George Spelvin
  1 sibling, 0 replies; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-13 22:02 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: George Spelvin, Linux Kernel Mailing List, Andrew Morton,
	Andrey Abramov, Daniel Wagner, Don Mullis, Dave Chinner,
	Andy Shevchenko

On Wed, Mar 13, 2019 at 10:23 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
> On 21/02/2019 07.30, George Spelvin wrote:
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +     (void)base;
> > +#else
> > +     lsbits |= (unsigned int)(size_t)base;
>
> The kernel usually casts pointers to long or unsigned long. If some
> oddball arch has size_t something other than unsigned long, this would
> give a "warning: cast from pointer to integer of different size". So
> just cast to unsigned long, and drop the cast to unsigned int.

The proper integer equivalent of a pointer is uintptr_t, so please use
that instead of size_t.
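
I.e., something like

	lsbits |= (unsigned int)(uintptr_t)base;

rather than casting through size_t.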

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-02-21  8:21 ` [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
@ 2019-03-13 22:29   ` Rasmus Villemoes
  2019-03-14  0:03     ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Rasmus Villemoes @ 2019-03-13 22:29 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: Andrew Morton, Andrey Abramov, Geert Uytterhoeven, Daniel Wagner,
	Don Mullis, Dave Chinner, Andy Shevchenko

On 21/02/2019 09.21, George Spelvin wrote:
>  
> +/**
> + * parent - given the offset of the child, find the offset of the parent.
> + * @i: the offset of the heap element whose parent is sought.  Non-zero.
> + * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
> + * @size: size of each element
> + *
> + * In terms of array indexes, the parent of element j = i/size is simply
> + * (j-1)/2.  But when working in byte offsets, we can't use implicit
> + * truncation of integer divides.
> + *
> + * Fortunately, we only need one bit of the quotient, not the full divide.
> + * size has a least significant bit.  That bit will be clear if i is
> + * an even multiple of size, and set if it's an odd multiple.
> + *
> + * Logically, we're doing "if (i & lsbit) i -= size;", but since the
> + * branch is unpredictable, it's done with a bit of clever branch-free
> + * code instead.
> + */
> +__attribute_const__ __always_inline
> +static size_t parent(size_t i, unsigned int lsbit, size_t size)
> +{
> +	i -= size;
> +	i -= size & -(i & lsbit);
> +	return i / 2;
> +}
> +

Really nice :) I had to work through this by hand, but it's solid.
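
As a quick illustration of the arithmetic (example numbers only): take
size = 12, so lsbit = 4.  The children of the element at offset 12
(index 1) sit at offsets 36 and 48 (indexes 3 and 4), and both map
back to it:

	parent(36, 4, 12): 36 - 12 = 24; 24 & 4 == 0, no extra subtract; 24/2 = 12
	parent(48, 4, 12): 48 - 12 = 36; 36 & 4 != 0, subtract 12 -> 24; 24/2 = 12

matching (j-1)/2 = 1 for j = 3 and j = 4.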

>  /**
>   * sort - sort an array of elements
>   * @base: pointer to data to sort
> @@ -125,21 +151,26 @@ static void generic_swap(void *a, void *b, int size)
>   * @cmp_func: pointer to comparison function
>   * @swap_func: pointer to swap function or NULL
>   *
> - * This function does a heapsort on the given array. You may provide a
> - * swap_func function optimized to your element type.
> + * This function does a heapsort on the given array.  You may provide a
> + * swap_func function if you need to do something more than a memory copy
> + * (e.g. fix up pointers or auxiliary data), but the built-in swap isn't
> + * usually a bottleneck.
>   *
>   * Sorting time is O(n log n) both on average and worst-case. While
>   * qsort is about 20% faster on average, it suffers from exploitable
>   * O(n*n) worst-case behavior and extra memory requirements that make
>   * it less suitable for kernel use.
>   */
> -
>  void sort(void *base, size_t num, size_t size,
>  	  int (*cmp_func)(const void *, const void *),
>  	  void (*swap_func)(void *, void *, int size))
>  {
>  	/* pre-scale counters for performance */
> -	int i = (num/2 - 1) * size, n = num * size, c, r;
> +	size_t n = num * size, a = (num/2) * size;
> +	unsigned const lsbit = size & -size;	/* Used to find parent */
> +

Nit: qualifier before type, "const unsigned". And this sets ZF, so a
paranoid check for zero size (cf. the other mail) by doing "if (!lsbit)
return;" is practically free. Though it's probably a bit obscure doing
it that way...

> +	if (!n)
> +		return;

I'd make that n <= 1. Shouldn't be much more costly.

> -		}
> -	}
> +	/*
> +	 * Loop invariants:
> +	 * 1. elements [a,n) satisfy the heap property (compare greater than
> +	 *    all of their children),
> +	 * 2. elements [n,num*size) are sorted, and
> +	 * 3. a <= b <= c <= d <= n (whenever they are valid).
> +	 */
> +	for (;;) {
> +		size_t b, c, d;
>  

> +		if (a)			/* Building heap: sift down --a */
> +			a -= size;
> +		else if (n -= size)	/* Sorting: Extract root to --n */
> +			swap_func(base, base + n, size);
> +		else			/* Sort complete */
> +			break;
> +
> +		/*
> +		 * Sift element at "a" down into heap.  This is the
> +		 * "bottom-up" variant, which significantly reduces
> +		 * calls to cmp_func(): we find the sift-down path all
> +		 * the way to the leaves (one compare per level), then
> +		 * backtrack to find where to insert the target element.
> +		 *
> +		 * Because elements tend to sift down close to the leaves,
> +		 * this uses fewer compares than doing two per level
> +		 * on the way down.  (A bit more than half as many on
> +		 * average, 3/4 worst-case.)
> +		 */
> +		for (b = a; c = 2*b + size, (d = c + size) < n;)
> +			b = cmp_func(base + c, base + d) >= 0 ? c : d;
> +		if (d == n)	/* Special case last leaf with no sibling */
> +			b = c;
> +
> +		/* Now backtrack from "b" to the correct location for "a" */
> +		while (b != a && cmp_func(base + a, base + b) >= 0)
> +			b = parent(b, lsbit, size);
> +		c = b;			/* Where "a" belongs */
> +		while (b != a) {	/* Shift it into place */
> +			b = parent(b, lsbit, size);
> +			swap_func(base + b, base + c, size);
>  		}
>  	}
>  }
> 

Nice!

Rasmus

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-13 21:23   ` Rasmus Villemoes
  2019-03-13 22:02     ` Geert Uytterhoeven
@ 2019-03-13 23:15     ` George Spelvin
  1 sibling, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-13 23:15 UTC (permalink / raw)
  To: linux-kernel, linux, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

Thank you for your thoughtful comments!

On Wed, 13 Mar 2019 at 23:23:44 +0100, Rasmus Villemoes wrote:
> On 21/02/2019 07.30, George Spelvin wrote:
> + * @align: required aignment (typically 4 or 8)
>
> typo aLignment

Thanks; fixed!

>> + * Returns true if elements can be copied using word loads and stores.
>> + * The size must be a multiple of the alignment, and the base address must
>> + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
>
> I wonder if we shouldn't unconditionally require the base to be aligned.
> It of course depends on what exactly 'efficient' means, but if the base
> is not aligned, we know that every single load and store will be
> unaligned, and we're doing lots of them. IIRC even on x86 it's usually
> two cycles instead of one, and even more when we cross a cache line
> boundary (which we do rather often - e.g. in your 40 byte example,
> almost every swap will hit at least one). One could also have some data
> that is naturally 4-byte aligned with an element size that happens to be
> a multiple of 8 bytes; it would be a bit sad to use 8-byte accesses when
> 4-byte would all have been aligned.

Well, a 2-cycle unaligned access is still much faster than four
separate byte accesses.
I think that's a decent interpretation of "EFFICIENT_UNALIGNED_ACCESS":
faster than doing it a byte at a time.  So the change is a net win;
we're just wondering if it could be optimized more.

The 4/8 issue is similar, but not as extreme.  Is one unaligned 64-bit
access faster than two aligned 32-bit accesses?  As you note, it's usually
only twice the cost, so it's no slower, and there's less loop overhead.
So I don't think it's a bad thing here, either.

Re your comments on cache lines, ISTR that x86 can do an unaligned
load with minimal penalty as long as it doesn't cross a cache line:
https://software.intel.com/en-us/articles/reducing-the-impact-of-misaligned-memory-accesses/
So you win on most of the accesses, hopefully enough to pay for the
one unaligned access.  Apparently in processors more recent than
the P4 example Intel used above, it's even less:
https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/

On 32-bit machines, it's actually a 4-byte swap, unrolled twice;
there are no 64-bit memory accesses.  So the concern is only about
8-byte alignment on 64-bit machines.

The great majority of call sites sort structures with pointer or
long members, so are aligned and the question is moot.  I don't
think it's worth overthinking the issue on behalf of the performance
of some rare corner cases.

I have considered doing away with the two word-copy loops and just
having one machine-word-sized loop plus a byte fallback.
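
Something like this untested sketch (name and details invented here,
not part of the posted series), used whenever both pointers and the
size are suitably aligned for unsigned long:

	static void swap_words(void *a, void *b, int size)
	{
		size_t n = size;

		do {
			unsigned long t = *(unsigned long *)(a + (n -= sizeof(long)));
			*(unsigned long *)(a + n) = *(unsigned long *)(b + n);
			*(unsigned long *)(b + n) = t;
		} while (n);
	}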

>> +	unsigned int lsbits = (unsigned int)size;
>
> Drop that cast.

Actually, in response to an earlier comment, I changed it (and the
type of lsbits) to (unsigned char), to emphasize that I *really* only
care about a handful of low bits.

I know gcc doesn't warn about implicit narrowing casts, but
I prefer to make them explicit for documentation reasons.
"Yes, I really mean to throw away the high bits."

Compilers on machines without native byte operations are very good
at using larger registers and optimizing away mask operations.
(Remember that this is inlined, and "align" is a constant 4 or 8.)
>
>> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> +	(void)base;
>> +#else
>> +	lsbits |= (unsigned int)(size_t)base;
>
> The kernel usually casts pointers to long or unsigned long. If some
> oddball arch has size_t something other than unsigned long, this would
> give a "warning: cast from pointer to integer of different size". So
> just cast to unsigned long, and drop the cast to unsigned int.

I changed this to uintptr_t and called it good.

>>  static void u32_swap(void *a, void *b, int size)
>>  {
>> -	u32 t = *(u32 *)a;
>> -	*(u32 *)a = *(u32 *)b;
>> -	*(u32 *)b = t;
>> +	size_t n = size;
>> +
>
> Since the callback has int in its prototype (we should fix that globally
> at some point...), might as well use a four-byte local variable, to
> avoid rex prefix on x86-64. So just make that "unsigned" or "u32".

Agreed about the eventual global fix.  If you care, the only places
a non-NULL swap function is passed in are:
- arch/arc/kernel/unwind.c
- arch/powerpc/kernel/module_32.c
- arch/powerpc/kernel/module_64.c
! arch/x86/kernel/unwind_orc.c (2x)
- fs/ocfs2/dir.c
- fs/ocfs2/refcounttree.c (3x)
- fs/ocfs2/xattr.c (3x)
- fs/ubifs/find.c
! kernel/jump_label.c
! lib/extable.c

The ones marked with "-" are simple memory swaps that could (should!)
be replaced with NULL.  The ones marked with "!" actually do
something non-trivial.


Actually, I deliberately used a pointer-sized index, since the loop
uses (a + n) addressing.  Hardware indexed addressing on 64-bit
machines generally (I vaguely recall AArch64 is an exception)
requires a 64-bit index; using a smaller index requires some masking
or sign-extending.

I thought it was safer to bet that the compiler would figure out
that a non-REX arithmetic operation could be used (cost if wrong:
a 1-byte REX prefix) than to bet that it would feel safe promoting
to 64 bits (cost if wrong: additional instructions).

The problem in both cases is that I changed to a test-for-zero
in the loop and the compiler doesn't know that size is a multiple
of 4, so it may have trouble knowing that this doesn't wrap to
2^64-1.

I could tell it via
	if (n % 4)
		unreachable();

One thing I should do is an intermediate cast through (unsigned int)
so the compiler can avoid a sign extension.  That makes the REX-elision
optimization easier.
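
I.e., something like

	size_t n = (unsigned int)size;

so the widening is a zero extension (free on x86-64) rather than a
sign extension.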

> Hm, are we absolutely sure there's no weird .config where some struct
> ends up being empty, yet we still sort an array of them? It's far
> fetched, of course, but perhaps an "if (unlikely(!size)) return;" at the
> start of sort() would be in order. Dunno, I'll leave it to you.

Well, the old generic_swap code also had a test at the bottom and
so would misbehave somewhat, but yeah, this would be a good sanity
check.  Sorting is expensive enough that the cost is trivial.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-05  5:58 ` [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
@ 2019-03-13 23:28   ` Rasmus Villemoes
  2019-03-14  1:58     ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Rasmus Villemoes @ 2019-03-13 23:28 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: Andrew Morton, Andrey Abramov, Geert Uytterhoeven, Daniel Wagner,
	Don Mullis, Dave Chinner, Andy Shevchenko

On 05/03/2019 06.58, George Spelvin wrote:
> CONFIG_RETPOLINE has severely degraded indirect function call
> performance, so it's worth putting some effort into reducing
> the number of times cmp() is called.
> 
> This patch avoids badly unbalanced merges on unlucky input sizes.
> It slightly increases the code size, but saves an average of 0.2*n
> calls to cmp().
> 
[snip]
> 
> (I confess to being more than a little bit proud of how clean this
> code turned out.  It took a lot of thinking, but the resultant inner
> loop is very simple and efficient.)
> 
> Refs:
>   Bottom-up Mergesort: A Detailed Analysis
>   Wolfgang Panny, Helmut Prodinger
>   Algorithmica 14(4):340--354, October 1995
>   https://doi.org/10.1007/BF01294131
>   https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260
> 
>   The cost distribution of queue-mergesort, optimal mergesorts, and
>   power-of-two rules
>   Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen
>   Journal of Algorithms 30(2); Pages 423--448, February 1999
>   https://doi.org/10.1006/jagm.1998.0986
>   https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380
> 
>   Queue-Mergesort
>   Mordecai J. Golin, Robert Sedgewick
>   Information Processing Letters, 48(5):253--259, 10 December 1993
>   https://doi.org/10.1016/0020-0190(93)90088-q
>   https://sci-hub.tw/10.1016/0020-0190(93)90088-Q

This is beautiful. So no comments on the patch itself. One thing that
might be nice would be to see the reduction in number of cmp callbacks
explicitly; it should be trivial to use the priv element for that in the
list_sort_test module. But to really see it one would of course have to
extend that test to do a lot more different sizes of lists.

And looking at the actual cmp functions used made me think we might do
something similar to what you did for the swap functions for sort(),
though it will require updating call sites. Suppose we have some

struct foo {
   ...
   struct list_head list;
   ...
   some_integer_type key;
   ...
};

and a trivial cmp function that just compares the ->key members. What if
we do something like

#define LIST_SORT_SIMPLE_CMP(type, list, key) ({ \
  /* encode the size and signedness of type.key along with
   offsetof(type, key) - offsetof(type, list) into the low, say,
   24 bits - a signed 20 bit number should be sufficient for the
   offsetof diff, and we can bail at compile time if too big */
})

then do

int do_cmp(void *priv, list_head *a, list_head *b, cmp_func cmp)
{
   if ((long)cmp & high_bits) /* it's a kernel pointer */
      return cmp(priv, a, b);
   int diff = extract_diff(cmp);
   int type = extract_type(cmp);
   void *keya = (void*)a + diff;
   void *keyb = (void*)b + diff;
   if (type == s32)
      return *(s32*)keya > *(s32*)keyb;
   if (type == u32)
       return *(u32*)keya > *(u32*)keyb;
   ...

in practice, we'd probably only have to support 4- and 8-byte signed and
unsigned versions (and we can check at compile time whether key has a
supported type).

Similarly one could do a SORT_SIMPLE_CMP() for when sorting an array of
structs according to a single numeric member. That sort is not stable,
so the comparison functions would have to do a full -1,0,1 return, of
course, but we'd still avoid indirect calls.

Rasmus

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-03-13 22:29   ` Rasmus Villemoes
@ 2019-03-14  0:03     ` George Spelvin
  2019-03-14  0:15       ` Rasmus Villemoes
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-14  0:03 UTC (permalink / raw)
  To: linux-kernel, linux, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

On Wed, 13 Mar 2019 at 23:29:40 +0100, Rasmus Villemoes wrote:
> On 21/02/2019 09.21, George Spelvin wrote:
>> +/**
>> + * parent - given the offset of the child, find the offset of the parent.
>> + * @i: the offset of the heap element whose parent is sought.  Non-zero.
>> + * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
>> + * @size: size of each element
>> + *
>> + * In terms of array indexes, the parent of element j = i/size is simply
>> + * (j-1)/2.  But when working in byte offsets, we can't use implicit
>> + * truncation of integer divides.
>> + *
>> + * Fortunately, we only need one bit of the quotient, not the full divide.
>> + * size has a least significant bit.  That bit will be clear if i is
>> + * an even multiple of size, and set if it's an odd multiple.
>> + *
>> + * Logically, we're doing "if (i & lsbit) i -= size;", but since the
>> + * branch is unpredictable, it's done with a bit of clever branch-free
>> + * code instead.
>> + */
>> +__attribute_const__ __always_inline
>> +static size_t parent(size_t i, unsigned int lsbit, size_t size)
>> +{
>> +	i -= size;
>> +	i -= size & -(i & lsbit);
>> +	return i / 2;
>> +}
>> +
>
> Really nice :) I had to work through this by hand, but it's solid.

Thank you!  Yes, the way the mask doesn't include the low-order bits
that don't matter anyway is a bit subtle.

When the code is subtle, use lots of comments.  The entire reason
for making this a separate helper function is to leave room for
the large comment.

>> +	unsigned const lsbit = size & -size;	/* Used to find parent */
>
> Nit: qualifier before type, "const unsigned". And this sets ZF, so a
> paranoid check for zero size (cf. the other mail) by doing "if (!lsbit)
> return;" is practically free. Though it's probably a bit obscure doing
> it that way...

Actually, this is a personal style thing which I can ignore for the sake
of the kernel, but I believe that it's better to put the qualifier
*after* the type.  This is due to C's pointer declaration syntax.

The standard example of the issue is:

	typedef char *pointer;
	const char *a;
	char const *b;
	char * const c;
	const pointer d;
	pointer const e;

Now, which variables are the same types?

The answer is that a & b are the same (mutable pointer to const
char), and c, d & e are the same (const pointer to mutable char).

If you make a habit of putting the qualifier *after* the type, then
a simple "textual substitution" mental model for the typedef works,
and it's clear that c and e are the same.

It's also clear that b cannot be represented by the typedef because
the const is between "char" and "*", and you obviously can't do that
with the typedef.

But if you put the qualifier first, it's annoying to remember why
a and d are not the same type.

So I've deliberately cultivated the style of putting the qualifier
after the type.

But if the kernel prefers it before...

>> +	if (!n)
>> +		return;
>
> I'd make that n <= 1. Shouldn't be much more costly.

(Actually, it's "num <= 1"; n is the pre-multiplied form so
n <= 1 can only happen when sorting one one-byte value.)

I actually thought about this and decided not to bother.  I did it
this way during development to stress the general-case code.  But
should I change it?

=== NEVER MIND ===

I had written a long reply justifying leaving it alone to save one
instruction when the light dawned: I can do *both* tests in one
step with
	size_t n = num * size, a = (num/2) * size;
	unsigned const lsbit = size & -size;	/* Used to find parent */

	if (!a)		/* num < 2 || size == 0 */
		return;

So now everyone's happy.

> Nice!

Thank you.  May I translate that into Acked-by?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant
  2019-03-14  0:03     ` George Spelvin
@ 2019-03-14  0:15       ` Rasmus Villemoes
  0 siblings, 0 replies; 44+ messages in thread
From: Rasmus Villemoes @ 2019-03-14  0:15 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

On 14/03/2019 01.03, George Spelvin wrote:
> On Wed, 13 Mar 2019 at 23:29:40 +0100, Rasmus Villemoes wrote:
> 
>> Nice!
> 
> Thank you.  May I translate that into Acked-by?
> 

Sort-of. I prefer first seeing the full rerolled series for context
etc., even if (the important parts of) this patch would be unchanged.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-13 23:28   ` Rasmus Villemoes
@ 2019-03-14  1:58     ` George Spelvin
  2019-06-21 23:12       ` Rasmus Villemoes
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-14  1:58 UTC (permalink / raw)
  To: linux-kernel, linux, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

On Thu, 14 Mar 2019 at 00:28:16 +0100, Rasmus Villemoes wrote:
> On 05/03/2019 06.58, George Spelvin wrote:
>> This patch avoids badly unbalanced merges on unlucky input sizes.
>> It slightly increases the code size, but saves an average of 0.2*n
>> calls to cmp().
>> 
> [snip]
>> 
>> (I confess to being more than a little bit proud of how clean this
>> code turned out.  It took a lot of thinking, but the resultant inner
>> loop is very simple and efficient.)
>
> This is beautiful. So no comments on the patch itself. One thing that
> might be nice would be to see the reduction in number of cmp callbacks
> explicitly; it should be trivial to use the priv element for that in the
> list_sort_test module. But to really see it one would of course have to
> extend that test to do a lot more different sizes of lists.

And you'd have to compile and run two different kernels, because you
can't run the algorithms side-by-side.  Not a good way to do it.

I used a user-space test harness for testing.  The output is ugly,
but here are lots of numbers for various list sizes.

The first group is single sort invocations.  The list is sorted by
(random) key, then again by address (the inverse), and the number
of comparisons printed.  The numbers in parens are the linear
coefficient K in
	comparisons = n*log2(n) - K*n,
i.e. it's
	log2(n) - (comparisons / n).
Higher is better.
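
(As a worked example, the "16384: 208648 (1.265137)" line further down
is log2(16384) - 208648/16384 = 14 - 12.734863 = 1.265137.)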

The three lines for each size are the original list_sort, a top-down
version I wrote as a "perfect" value for reference, and the posted
code.  (1034 and 1040 are particularly bad cases for the old code.)

1: 0 (0.000000) 0 (0.000000)
1: 0 (0.000000) 0 (0.000000)
1: 0 (0.000000) 0 (0.000000)
2: 1 (0.500000) 1 (0.500000)
2: 1 (0.500000) 1 (0.500000)
2: 1 (0.500000) 1 (0.500000)
3: 3 (0.584963) 3 (0.584963)
3: 2 (0.918296) 2 (0.918296)
3: 3 (0.584963) 3 (0.584963)
4: 5 (0.750000) 5 (0.750000)
4: 5 (0.750000) 5 (0.750000)
4: 5 (0.750000) 5 (0.750000)
5: 8 (0.721928) 8 (0.721928)
5: 8 (0.721928) 8 (0.721928)
5: 8 (0.721928) 8 (0.721928)
6: 10 (0.918296) 10 (0.918296)
6: 10 (0.918296) 10 (0.918296)
6: 10 (0.918296) 10 (0.918296)
7: 11 (1.235926) 12 (1.093069)
7: 13 (0.950212) 12 (1.093069)
7: 11 (1.235926) 12 (1.093069)
8: 17 (0.875000) 17 (0.875000)
8: 17 (0.875000) 17 (0.875000)
8: 17 (0.875000) 17 (0.875000)
9: 18 (1.169925) 22 (0.725481)
9: 21 (0.836592) 20 (0.947703)
9: 20 (0.947703) 20 (0.947703)
10: 20 (1.321928) 25 (0.821928)
10: 25 (0.821928) 24 (0.921928)
10: 22 (1.121928) 24 (0.921928)
13: 35 (1.008132) 35 (1.008132)
13: 33 (1.161978) 33 (1.161978)
13: 36 (0.931209) 35 (1.008132)
21: 71 (1.011365) 74 (0.868508)
21: 68 (1.154222) 71 (1.011365)
21: 69 (1.106603) 72 (0.963746)
31: 117 (1.180003) 111 (1.373551)
31: 114 (1.276777) 114 (1.276777)
31: 117 (1.180003) 111 (1.373551)
32: 123 (1.156250) 121 (1.218750)
32: 123 (1.156250) 121 (1.218750)
32: 123 (1.156250) 121 (1.218750)
33: 158 (0.256515) 149 (0.529243)
33: 130 (1.105000) 131 (1.074697)
33: 131 (1.074697) 131 (1.074697)
34: 142 (0.910992) 135 (1.116875)
34: 133 (1.175698) 135 (1.116875)
34: 131 (1.234522) 134 (1.146286)
55: 244 (1.344996) 249 (1.254087)
55: 246 (1.308632) 241 (1.399542)
55: 249 (1.254087) 250 (1.235905)
89: 484 (1.037531) 490 (0.970115)
89: 464 (1.262250) 477 (1.116183)
89: 479 (1.093711) 482 (1.060003)
127: 729 (1.248527) 727 (1.264275)
127: 734 (1.209157) 724 (1.287897)
127: 729 (1.248527) 727 (1.264275)
128: 744 (1.187500) 733 (1.273438)
128: 744 (1.187500) 733 (1.273438)
128: 744 (1.187500) 733 (1.273438)
129: 752 (1.181770) 853 (0.398824)
129: 746 (1.228282) 740 (1.274793)
129: 747 (1.220530) 741 (1.267041)
144: 926 (0.739369) 928 (0.725481)
144: 851 (1.260203) 866 (1.156036)
144: 865 (1.162981) 872 (1.114369)
233: 1556 (1.186075) 1541 (1.250452)
233: 1545 (1.233285) 1527 (1.310538)
233: 1550 (1.211826) 1534 (1.280495)
377: 2787 (1.165848) 2790 (1.157890)
377: 2752 (1.258686) 2771 (1.208288)
377: 2778 (1.189720) 2782 (1.179110)
610: 5115 (0.867420) 5115 (0.867420)
610: 4891 (1.234633) 4883 (1.247747)
610: 4909 (1.205124) 4930 (1.170698)
642: 5385 (0.938579) 5428 (0.871601)
642: 5166 (1.279701) 5185 (1.250105)
642: 5205 (1.218953) 5201 (1.225183)
987: 8574 (1.259976) 8620 (1.213370)
987: 8564 (1.270108) 8599 (1.234647)
987: 8565 (1.269095) 8614 (1.219449)
1022: 8937 (1.252561) 8913 (1.276044)
1022: 8916 (1.273109) 8928 (1.261367)
1022: 8937 (1.252561) 8913 (1.276044)
1023: 8959 (1.241015) 8909 (1.289891)
1023: 8927 (1.272295) 8918 (1.281093)
1023: 8959 (1.241015) 8909 (1.289891)
1024: 8970 (1.240234) 8916 (1.292969)
1024: 8966 (1.244141) 8912 (1.296875)
1024: 8966 (1.244141) 8912 (1.296875)
1025: 9548 (0.686286) 9724 (0.514579)
1025: 8971 (1.249213) 8943 (1.276530)
1025: 8970 (1.250189) 8944 (1.275555)
1026: 9771 (0.479423) 9745 (0.504764)
1026: 8936 (1.293263) 8978 (1.252328)
1026: 8942 (1.287415) 8993 (1.237708)
1028: 9643 (0.625274) 9985 (0.292590)
1028: 8980 (1.270216) 9006 (1.244924)
1028: 8980 (1.270216) 9005 (1.245897)
1032: 9894 (0.424018) 10004 (0.317429)
1032: 9024 (1.267041) 9028 (1.263165)
1032: 9011 (1.279638) 9056 (1.236033)
1033: 9885 (0.443409) 9875 (0.453089)
1033: 9016 (1.284648) 9022 (1.278839)
1033: 9049 (1.252702) 9034 (1.267223)
1034: 9974 (0.367986) 9936 (0.404736)
1034: 9052 (1.259668) 9071 (1.241293)
1034: 9054 (1.257734) 9065 (1.247096)
1040: 10019 (0.388714) 9845 (0.556022)
1040: 9108 (1.264676) 9069 (1.302176)
1040: 9124 (1.249291) 9046 (1.324291)
1056: 10130 (0.451591) 10142 (0.440227)
1056: 9276 (1.260303) 9269 (1.266932)
1056: 9301 (1.236629) 9317 (1.221477)
1088: 10275 (0.643529) 10321 (0.601250)
1088: 9585 (1.277720) 9608 (1.256580)
1088: 9616 (1.249228) 9634 (1.232683)
1152: 10855 (0.747182) 10844 (0.756731)
1152: 10295 (1.233293) 10282 (1.244578)
1152: 10347 (1.188154) 10331 (1.202043)
1280: 11940 (0.993803) 11931 (1.000834)
1280: 11571 (1.282084) 11581 (1.274272)
1280: 11683 (1.194584) 11673 (1.202397)
1365: 12807 (1.032268) 12844 (1.005161)
1365: 12523 (1.240326) 12523 (1.240326)
1365: 12589 (1.191975) 12624 (1.166334)
1597: 15314 (1.051919) 15350 (1.029377)
1597: 15044 (1.220986) 15011 (1.241650)
1597: 15056 (1.213472) 15083 (1.196565)
2584: 27102 (0.847000) 27008 (0.883378)
2584: 26106 (1.232449) 26022 (1.264957)
2584: 26246 (1.178270) 26155 (1.213486)
4181: 48583 (0.409685) 48614 (0.402270)
4181: 45056 (1.253263) 45060 (1.252306)
4181: 45034 (1.258525) 45071 (1.249675)
6765: 78516 (1.117666) 78482 (1.122692)
6765: 77657 (1.244643) 77675 (1.241982)
6765: 77954 (1.200740) 77910 (1.207245)
16383: 208649 (1.264210) 208758 (1.257557)
16383: 208715 (1.260182) 208783 (1.256031)
16383: 208649 (1.264210) 208758 (1.257557)
16384: 208712 (1.261230) 208665 (1.264099)
16384: 208648 (1.265137) 208601 (1.268005)
16384: 208648 (1.265137) 208601 (1.268005)
16385: 217320 (0.736737) 215597 (0.841895)
16385: 208608 (1.268443) 208694 (1.263195)
16385: 208608 (1.268443) 208693 (1.263256)

This next group shows averages over a large number of sorts in
power-of-2 size ranges.  (Note that the number of sorts is
2*repeat*range, because again I'm sorting 2x per example.)

(Sort #2 is the version from patch #4, which performs identically
to #1, and #3 was an in-development version.)

Sum for range 3..4:  (repeat = 4096)
Total1: 59965 (1.349885)
Total4: 60022 (1.348725)
Total5: 59965 (1.349885)
Sum for range 5..8:  (repeat = 2048)
Total1: 188025 (1.260878)
Total4: 186220 (1.281908)
Total5: 186778 (1.276100)
Sum for range 9..16:  (repeat = 1024)
Total1: 535651 (1.202479)
Total4: 526451 (1.257404)
Total5: 528574 (1.246141)
Sum for range 17..32:  (repeat = 512)
Total1: 1425426 (1.154977)
Total4: 1393173 (1.250729)
Total5: 1401257 (1.229850)
Sum for range 33..64:  (repeat = 256)
Total1: 3600379 (1.114953)
Total4: 3511299 (1.248298)
Total5: 3530978 (1.222512)
Sum for range 65..128:  (repeat = 128)
Total1: 8747092 (1.080758)
Total4: 8522794 (1.248781)
Total5: 8572107 (1.216567)
Sum for range 129..256:  (repeat = 64)
Total1: 20628339 (1.055123)
Total4: 20112497 (1.248378)
Total5: 20220855 (1.212965)
Sum for range 257..512:  (repeat = 32)
Total1: 47548848 (1.038531)
Total4: 46428338 (1.248514)
Total5: 46656950 (1.211016)
Sum for range 513..1024:  (repeat = 16)
Total1: 107716456 (1.026297)
Total4: 105346353 (1.248269)
Total5: 105818923 (1.209578)
Sum for range 1025..2048:  (repeat = 8)
Total1: 240657666 (1.018584)
Total4: 235753284 (1.248303)
Total5: 236715946 (1.208852)
Sum for range 2049..4096:  (repeat = 4)
Total1: 531747462 (1.013639)
Total4: 521718711 (1.248403)
Total5: 523663717 (1.208580)
Sum for range 4097..8192:  (repeat = 2)
Total1: 1164325820 (1.010319)
Total4: 1143994299 (1.248290)
Total5: 1147896676 (1.208329)
Sum for range 8193..16384:  (repeat = 1)
Total1: 2530151423 (1.008624)
Total4: 2489173424 (1.248348)
Total5: 2497021163 (1.208170)


> And looking at the actual cmp functions used made me think we might do
> something similar to what you did for the swap functions for sort(),
> though it will require updating call sites.

Nifty idea, but A Separate Patch.

Here's a survey of which call sites would work:

! block/blk-mq.c:plug_rq_cmp(): ->mq_ctx, then ->mq_hctx, then blk_rq_pos()
- drivers/acpi/nfit/core.c:nfit_mem_cmp(): ->device_handle (u32)
! drivers/gpu/drm/drm_modes.c:drm_mode_compare(): ->type & DRM_MODE_TYPE_PREFERRED, then ->hdisplay * ->vdisplay, then ->vrefresh, then ->clock
- drivers/gpu/drm/i915/gvt/debugfs.c:mmio_offset_compare(): ->offset
- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c:sort_holes(): ->start
- drivers/gpu/drm/radeon/radeon_cs.c:cmp_size_smaller_first(): ->tbo.num_pages
- drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c: interval_cmp(): ->start
- drivers/irqchip/irq-gic-v3-its.c:lpi_range_cmp(): ->base_id
- drivers/misc/sram.c:sram_reserve_cmp(): ->start
- drivers/nvme/host/core.c:ns_cmp(): ->ns_id
- drivers/spi/spi-loopback-test.c:rx_ranges_cmp(): ->start
- fs/btrfs/raid56.c:plug_cmp(): ->sector
- fs/btrfs/tree-log.c:extent_cmp(): ->start
- fs/btrfs/volumes.c:devid_cmp(): ->devid
- fs/ext4/fsmap.c:ext4_getfsmap_compare(): ->fmr_physical
- fs/gfs2/glock.c:glock_cmp(): ->gl_name.ln_number
- fs/gfs2/log.c:ip_cmp(): ->i_no_addr
- fs/gfs2/lops.c:blocknr_cmp(): ->b_blocknr
! fs/ubifs/gc.c:data_nodes_cmp(): key_inum(c, a), then key_block(c, a)
! fs/ubifs/gc.c:nondata_nodes_cmp(): ->type == UBIFS_INO_NODE, then lots of hair
- fs/ubifs/replay.c:replay_entries_cmp(): ->sqnum
! fs/xfs/xfs_trans_bmap.c:xfs_bmap_update_diff_items(): ->bi_owner->i_ino
! fs/xfs/xfs_trans_extfree.c:xfs_extent_free_diff_items(): XFS_FSB_TO_AGNO(mp, ra->xefi_startblock)
! fs/xfs/xfs_trans_refcount.c:xfs_refcount_update_diff_items(): XFS_FSB_TO_AGNO(mp, ra->ri_startblock)
! fs/xfs/xfs_trans_rmap.c:xfs_rmap_update_diff_items(): XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock)
	(XFS_FSB_TO_AGNO is a right shift by mp->m_sb.sb_agblklog.
	Er... but why bother?  Is there any harm including the low-order
	bits in the comparison?  Is the shift just to ensure we don't
	overflow int?)
- fs/xfs/scrub/bitmap.c:xfs_bitmap_range_cmp(): ->start
- fs/xfs/xfs_buf.c:xfs_buf_cmp(): ->b_maps[0].bm_bn
- fs/xfs/xfs_extent_busy.c:xfs_extent_busy_ag_cmp(): ->agno
- lib/test_list_sort.c:cmp(): ->value
! virt/kvm/arm/vgic/vgic.c:vgic_irq_cmp(): raw_spin_lock(&irqa->irq_lock);

- means your idea would work
! means the compare is too complex

(It might be possible to adapt the xfs ones, though.)

> int do_cmp(void *priv, list_head *a, list_head *b, cmp_func cmp)
> {
>    if ((long)cmp & high_bits) /* it's a kernel pointer */
>       return cmp(priv, a, b);
>    int diff = extract_diff(cmp);
>    int type = extract_type(cmp);
[snip]
> in practice, we'd probably only have to support 4- and 8-byte signed and
> unsigned versions (and we can check at compile time whether key has a
> supported type).

There's no need to get so fancy with the encoding inside "cmp"; we
could just use cmp == NULL and encode it in "priv".  That avoids
depending on the kernel memory layout on various architectures.
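
A rough sketch of what I mean (descriptor layout and names purely
illustrative, unsigned keys only):

	struct simple_key {	/* passed via priv when cmp == NULL */
		int offset;	/* key offset relative to the list_head */
		bool is_64bit;
	};

	static int do_cmp(void *priv, struct list_head *a, struct list_head *b,
			  int (*cmp)(void *, struct list_head *, struct list_head *))
	{
		const struct simple_key *k = priv;

		if (cmp)
			return cmp(priv, a, b);
		if (k->is_64bit)
			return *(u64 *)((void *)a + k->offset) >
			       *(u64 *)((void *)b + k->offset);
		return *(u32 *)((void *)a + k->offset) >
		       *(u32 *)((void *)b + k->offset);
	}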

> Similarly one could do a SORT_SIMPLE_CMP() for when sorting an array of
> structs according to a single numeric member. That sort is not stable,
> so the comparison functions would have to do a full -1,0,1 return, of
> course, but we'd still avoid indirect calls.

Actually, while some sorts (notably fat-pivot quicksort) require
a 3-way return to avoid O(n^2) performance, heapsort is just fine
with a boolean return value.  Since this would be an internal
implementation detail of sort(), we could use the optimization.

This doesn't have a priv pointer, however, so we'd have to go back
to the messy encoding.  But nobody sorts large structures by
copying, so the offsets are guaranteed small.  I'd say 12 bits
of offset would be fine, and the entire thing would be aliased to the
zero page.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-05  3:06 ` [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
  2019-03-10 21:54   ` Rasmus Villemoes
@ 2019-03-14  9:10   ` Andy Shevchenko
  2019-03-14  9:41     ` George Spelvin
  2019-03-15  4:33     ` George Spelvin
  1 sibling, 2 replies; 44+ messages in thread
From: Andy Shevchenko @ 2019-03-14  9:10 UTC (permalink / raw)
  To: George Spelvin
  Cc: linux-kernel, Andrew Morton, Andrey Abramov, Geert Uytterhoeven,
	Daniel Wagner, Rasmus Villemoes, Don Mullis, Dave Chinner

On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
> Rather than a fixed-size array of pending sorted runs, use the ->prev
> links to keep track of things.  This reduces stack usage, eliminates
> some ugly overflow handling, and reduces the code size.
> 
> Also:
> * merge() no longer needs to handle NULL inputs, so simplify.
> * The same applies to merge_and_restore_back_links(), which is renamed
>   to the less ponderous merge_final().  (It's a static helper function,
>   so we don't need a super-descriptive name; comments will do.)
> 
> x86-64 code size 1086 -> 740 bytes (-346)

> +	do {
> +		size_t bit;
>  		struct list_head *cur = list;
> +
> +		/* Extract the head of "list" as a single-element list "cur" */
>  		list = list->next;
>  		cur->next = NULL;
>  
> +		/* Do merges corresponding to set lsbits in count */

> +		for (bit = 1; count & bit; bit <<= 1) {
> +			cur = merge(priv, (cmp_func)cmp, pending, cur);
> +			pending = pending->prev;  /* Untouched by merge() */
>  		}

Wouldn't it be the same to do

	bit = ffz(count);
	while (bit--) {
		...
	}
?

Though I dunno which one is generating better code.

> +		/* And place the result at the head of "pending" */
> +		cur->prev = pending;
> +		pending = cur;
> +		count++;
> +	} while (list->next);
> +
> +	/* Now merge together last element with all pending lists */
> +	while (pending->prev) {
> +		list = merge(priv, (cmp_func)cmp, pending, list);
> +		pending = pending->prev;
>  	}

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-09 15:53     ` lkml
  2019-03-09 20:19       ` Andrey Abramov
@ 2019-03-14  9:29       ` Andy Shevchenko
  2019-03-14 10:09         ` George Spelvin
  2019-03-14 10:11         ` George Spelvin
  1 sibling, 2 replies; 44+ messages in thread
From: Andy Shevchenko @ 2019-03-14  9:29 UTC (permalink / raw)
  To: lkml
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, st5pub

On Sat, Mar 09, 2019 at 03:53:41PM +0000, lkml@sdf.org wrote:
> Andy Shevchenko wrote:
> > On Thu, Feb 21, 2019 at 06:30:28AM +0000, George Spelvin wrote:
> >> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > 
> > Why #ifdef is better than if (IS_ENABLED()) ?
> 
> Because CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is bool and not
> tristate.  IS_ENABLED tests for 'y' or 'm' but we don't need it
> for something that's only on or off.

There is IS_BUILTIN(), though it's a common practice to use IS_ENABLED() even
for boolean options (I think because of the naming of the macro).
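
For comparison, the IS_ENABLED() form of the same check would be
something like

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
		lsbits |= (unsigned char)(uintptr_t)base;

with no #ifdef and no (void)base.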

> Looking through the kernel, I see both, but #ifdef or #if defined()
> are definitely in the majority:
> 
> lib/lzo/lzo1x_compress.c:#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(LZO_USE_CTZ64)
> lib/siphash.c:#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> lib/string.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> lib/strncpy_from_user.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> lib/zlib_inflate/inffast.c:#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> 
> I see a few IS_ENABLED uses in include/crypto/ and kernel/bpf/.
> 
> It makes no real difference; #ifdef is simpler to me.


> static bool __attribute_const__
> is_aligned(const void *base, size_t size, unsigned char align)
> {
> 	unsigned char lsbits = (unsigned char)size;
> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> 	(void)base;
> #else
> 	lsbits |= (unsigned char)(uintptr_t)base;
> #endif
> 	return (lsbits & (align - 1)) == 0;
> }

> Any preference?

This one looks better in the sense that we don't suppress the warnings
when it's not needed.

> > For such primitives that operates on top of an arrays we usually append 's' to
> > the name. Currently the name is misleading.
> > 
> > Perhaps u32s_swap().
> 
> I don't worry much about the naming of static helper functions.
> If they were exported, it would be a whole lot more important!
> 
> I find "u32s" confusing; I keep reading the "s" as "signed" rather
> than a plural.

For signedness we use prefixes, for plural — suffixes. I don't see the point of
confusion. And this is in use in kernel a lot.

> How about one of:
> swap_bytes / swap_ints / swap_longs
> swap_1 / swap_4 / swap_8

longs are ambiguous, so I would prefer bit-sized types.

> > Shouldn't simple memcpy cover these case for both 32- and 64-bit architectures?
> 
> This isn't a memcpy, it's a memory *swap*.  To do it with memcpy
> requires:
> 	memcpy(temp_buffer, a, size);
> 	memcpy(a, b, size);
> 	memcpy(b, temp_buffer, size);
> 
> This is 1.5x as much memory access, and you have to find a large
> enough temp_buffer.  (I didn't think a variable-length array on
> the stack would make people happy.)
> 
> Also, although it is a predictable branch, memcpy() has to check the
> alignment of its inputs each call.  The reason for these helpers is
> to factor that out.

Makes sense.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-14  9:10   ` Andy Shevchenko
@ 2019-03-14  9:41     ` George Spelvin
  2019-03-15  4:33     ` George Spelvin
  1 sibling, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-14  9:41 UTC (permalink / raw)
  To: andriy.shevchenko, lkml
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, st5pub

On Thu, 14 Mar 2019 at 11:10:41 +0200, Andy Shevchenko wrote:
> On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
>> +		/* Do merges corresponding to set lsbits in count */
>
>> +		for (bit = 1; count & bit; bit <<= 1) {
>> +			cur = merge(priv, (cmp_func)cmp, pending, cur);
>> +			pending = pending->prev;  /* Untouched by merge() */
>>  		}
>
> Wouldn't be it the same to
>
> 	bit = ffz(count);
> 	while (bit--) {
> 		...
> 	}
> ?
>
> Though I dunno which one is generating better code.

Yes, it's the same.  But count is an incrementing counter, so the
pattern of return values from ffz() is 01020103010201040102010301020105...,
which has mean value 1/2 + 1/4 + 1/8 + 1/16 +... = 1.

So spending one instruction on ffz() to save one instruction per loop
iteration is an even trade, and if the processor doesn't have an ffz()
instruction, it's a loss.

There's a third possible implementation:

>> +		for (bit = count; bit & 1; bit >>= 1) {

...which works fine, too.  (It even saves a few bytes of code, so I
might switch to it.)  I used the form I did because my test code
verified that the length of the lists being merged equalled "bit".

The other forms don't have that property.


Thank you for looking at the code!

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14  9:29       ` Andy Shevchenko
@ 2019-03-14 10:09         ` George Spelvin
  2019-03-14 10:41           ` Geert Uytterhoeven
  2019-03-14 10:11         ` George Spelvin
  1 sibling, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-14 10:09 UTC (permalink / raw)
  To: 13, andriy.shevchenko, lkml, st5pub
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel, linux

On Sat, 09 Mar 2019 at 23:19:49 +0300, Andrey Abramov wrote:
>> Although I'm thinking of:
>>
>> static bool __attribute_const__
>> is_aligned(const void *base, size_t size, unsigned char align)
>> {
>> 	unsigned char lsbits = (unsigned char)size;
>>
>> 	(void)base;
>> #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> 	lsbits |= (unsigned char)(uintptr_t)base;
>> #endif
>> 	return (lsbits & (align - 1)) == 0;
>> }
>>
>> Any preference?
> I think it would be better.

>> I find "u32s" confusing; I keep reading the "s" as "signed" rather
>> than a plural.
>>
>> How about one of:
>> swap_bytes / swap_ints / swap_longs
>> swap_1 / swap_4 / swap_8
>
> In my opinion "swap_bytes / swap_ints / swap_longs" are the most readable.


On Thu, 14 Mar 2019 at 11:29:58 +0200, Andy Shevchenko wrote:
> On Sat, Mar 09, 2019 at 03:53:41PM +0000, lkml@sdf.org wrote:
>> static bool __attribute_const__
>> is_aligned(const void *base, size_t size, unsigned char align)
>> {
>> 	unsigned char lsbits = (unsigned char)size;
>> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> 	(void)base;
>> #else
>> 	lsbits |= (unsigned char)(uintptr_t)base;
>> #endif
>> 	return (lsbits & (align - 1)) == 0;
>> }
>
>> Any preference?
>
> This one looks better in the sense that we don't suppress the warnings
> when it's not needed.

>>> For such primitives that operates on top of an arrays we usually
>>> append 's' to the name. Currently the name is misleading.
>>> 
>>> Perhaps u32s_swap().
>> 
>> I don't worry much about the naming of static helper functions.
>> If they were exported, it would be a whole lot more important!
>> 
>> I find "u32s" confusing; I keep reading the "s" as "signed" rather
>> than a plural.
>
> For signedness we use prefixes; for plural, suffixes. I don't see the point of
> confusion. And this is in use in kernel a lot.
>
>> How about one of:
>> swap_bytes / swap_ints / swap_longs
>> swap_1 / swap_4 / swap_8
>
> longs are ambiguous, so I would prefer bit-sized types.

I already implemented Andrey's suggestions, which were the exact
opposite of yours.

Pistols at dawn?


>>>> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>>> 
>>> Why #ifdef is better than if (IS_ENABLED()) ?
>> 
>> Because CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is bool and not
>> tristate.  IS_ENABLED tests for 'y' or 'm' but we don't need it
>> for something that's only on or off.
>
> There is IS_BUILTIN(), though it's a common practice to use IS_ENABLED()
> even for boolean options (I think because of naming of the macro).

Well, as I said earlier, #ifdef is the most common form in the kernel.
It's also the shortest to write, and I like the fact that it slightly
simpler.  (Admittedly, "IS_ENABLED" does not take a lot of brain power
to interpret, but it *is* one more macro that might be hiding magic.)

So I'm not inclined to change it without a substantial reason.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14 10:09         ` George Spelvin
@ 2019-03-14 10:41           ` Geert Uytterhoeven
  2019-03-14 11:53             ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-14 10:41 UTC (permalink / raw)
  To: George Spelvin
  Cc: 13, Andy Shevchenko, Andrey Abramov, Andrew Morton,
	Daniel Wagner, Dave Chinner, Don Mullis,
	Linux Kernel Mailing List, Rasmus Villemoes

Hi George,

On Thu, Mar 14, 2019 at 11:10 AM George Spelvin <lkml@sdf.org> wrote:
> On Sat, 09 Mar 2019 at 23:19:49 +0300, Andrey Abramov wrote:
> >> How about one of:
> >> swap_bytes / swap_ints / swap_longs
> >> swap_1 / swap_4 / swap_8
> >
> > longs are ambiguous, so I would prefer bit-sized types.
>
> I already implemented Andrey's suggestions, which were the exact
> opposite of yours.
>
> Pistols at dawn?

Prepared to fix all future long vs. int bugs?

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14 10:41           ` Geert Uytterhoeven
@ 2019-03-14 11:53             ` George Spelvin
  2019-03-14 12:18               ` Andy Shevchenko
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-14 11:53 UTC (permalink / raw)
  To: geert, lkml
  Cc: 13, akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	linux-kernel, linux, st5pub

On Thu, 14 Mar 2019 at 11:41:26 +0100, Geert Uytterhoeven wrote:
> On Thu, Mar 14, 2019 at 11:10 AM George Spelvin <lkml@sdf.org> wrote:
>> On Sat, 09 Mar 2019 at 23:19:49 +0300, Andrey Abramov wrote:
>>>> How about one of:
>>>> swap_bytes / swap_ints / swap_longs
>>>> swap_1 / swap_4 / swap_8
>>>
>>> longs are ambiguous, so I would prefer bit-sized types.
>>
>> I already implemented Andrey's suggestions, which were the exact
>> opposite of yours.
>>
>> Pistols at dawn?

I didn't explain the joke because jokes aren't funny if explained,
but just in case, by suggesting a clearly ridiculous method, I was
saying "I have no idea how to resolve this conflict."

> Prepared to fix all future long vs. int bugs?

In the entire kernel?  Or just the one small source file
where these statically scoped helper functions are visible?

In case of uncertainty, the comments and the code are right there
just a few lines away from the one and only call site, so I
don't expect much confusion.

I care a lot about function names when they are exported, but in
this case we're talking about what colour to paint the *inside* of
the bike shed.

I just want to pick some names and move on.  Since nothing so far seems
to satisfy everyone, I'll go with the ponderous but utterly unambiguous:

swap_bytes
swap_4byte_words
swap_8byte_words

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14 11:53             ` George Spelvin
@ 2019-03-14 12:18               ` Andy Shevchenko
  2019-03-14 19:59                 ` Andrey Abramov
  0 siblings, 1 reply; 44+ messages in thread
From: Andy Shevchenko @ 2019-03-14 12:18 UTC (permalink / raw)
  To: George Spelvin
  Cc: geert, 13, akpm, daniel.wagner, dchinner, don.mullis,
	linux-kernel, linux, st5pub

On Thu, Mar 14, 2019 at 11:53:55AM +0000, George Spelvin wrote:
> On Thu, 14 Mar 2019 at 11:41:26 +0100, Geert Uytterhoeven wrote:
> > On Thu, Mar 14, 2019 at 11:10 AM George Spelvin <lkml@sdf.org> wrote:
> >> On Sat, 09 Mar 2019 at 23:19:49 +0300, Andrey Abramov wrote:
> >>>> How about one of:
> >>>> swap_bytes / swap_ints / swap_longs
> >>>> swap_1 / swap_4 / swap_8
> >>>
> >>> longs are ambiguous, so I would prefer bit-sized types.
> >>
> >> I already implemented Andrey's suggestions, which were the exact
> >> opposite of yours.
> >>
> >> Pistols at dawn?

> I just want to pick some names and move on.  Since nothing so far seems
> satsify everyone, I'll go with the ponderous but utterly unambiguous:

> swap_bytes
> swap_4byte_words
> swap_8byte_words

Works for me; or, if someone insists, the sizes could be given in bits instead.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14 12:18               ` Andy Shevchenko
@ 2019-03-14 19:59                 ` Andrey Abramov
  2019-03-15  3:35                   ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Andrey Abramov @ 2019-03-14 19:59 UTC (permalink / raw)
  To: Andy Shevchenko, George Spelvin
  Cc: geert, 13, akpm, daniel.wagner, dchinner, don.mullis,
	linux-kernel, linux

> Pistols at dawn?

> swap_bytes
> swap_4byte_words
> swap_8byte_words
or
>  swap_bytes / swap_ints / swap_longs
>  swap_1 / swap_4 / swap_8

Yes, in my opinion, swap_bytes / swap_ints / swap_longs are the most readable because we have both swap_ints and swap_longs functions (in one file near each other), so I don't think that there will be any confusion about size. 
But actually, it doesn't matter which name you take, because the meaning of each, in my opinion, is obvious enough, so I don't mind any of these options.

-- 
With Best Regards,
Andrey Abramov

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-14 19:59                 ` Andrey Abramov
@ 2019-03-15  3:35                   ` George Spelvin
  2019-03-15  8:27                     ` Geert Uytterhoeven
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-15  3:35 UTC (permalink / raw)
  To: andriy.shevchenko, lkml, st5pub
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel, linux

>> swap_bytes / swap_4byte_words / swap_8byte_words
>> swap_bytes / swap_ints / swap_longs
>> swap_1 / swap_4 / swap_8
>> Pistols at dawn?

On Thu, 14 Mar 2019 at 22:59:55 +0300, Andrey Abramov wrote:
> Yes, in my opinion, swap_bytes / swap_ints / swap_longs are the
> most readable because we have both swap_ints and swap_longs functions
> (in one file near each other), so I don't think that there will be
> any confusion about size.

Yes, that's what I thought.  They're three related but different
functions, suffixed _bytes, _ints, and _longs.  What could the
difference possibly be?  And if anyone has any lingering doubts,
the functions are right there, with exquisitely clear comments.

Not to mention where they're used.  Is "is_aligned(base, size, 8)"
remotely obscure?  Especially in context:

		if (is_aligned(base, size, 8))
			swap_func = swap_longs;
		else if (is_aligned(base, size, 4))
			swap_func = swap_ints;
		else
			swap_func = swap_bytes;

What subtle and mysterious code.

> But actually, it doesn't matter which name you take, because
> the meaning of each, in my opinion, is obvious enough, so I don't
> mind any of these options.

I'm just amazed that this piece of bikeshedding is the most
contentious thing about the patch series.

I mean, if I'd named them:
	llanfairpwllgwyngyll()
	shravanabelagola()
	zheleznodorozhny()
or
	peckish()
	esuriant()
	hungry()
then yes, those would be bad names.

I prefer the shorter _ints and _longs names, but this is just
not a hill I want to die on.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-14  9:10   ` Andy Shevchenko
  2019-03-14  9:41     ` George Spelvin
@ 2019-03-15  4:33     ` George Spelvin
  2019-03-15  8:20       ` Geert Uytterhoeven
  1 sibling, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-15  4:33 UTC (permalink / raw)
  To: andriy.shevchenko, lkml
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, st5pub

On Thu, 14 Mar 2019 at 11:10:41 +0200, Andy Shevchenko wrote:
> On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
>> +		for (bit = 1; count & bit; bit <<= 1) {
>> +			cur = merge(priv, (cmp_func)cmp, pending, cur);
>> +			pending = pending->prev;  /* Untouched by merge() */
>>  		}
>
> Wouldn't it be the same to
> 
> 	bit = ffz(count);
> 	while (bit--) {
> 		...
> 	}
> ?
>
> Though I dunno which one is generating better code.

One question I should ask everyone: should "count" be 32 or 64 bits
on 64-bit machines?  That would let x86 save a few REX bytes.  (815
vs. 813 byte code, if anyone cares.)

Allegedly ARM can save a few pJ by gating the high 32
bits of the ALU.

Most other 64-bit processors would prefer 64-bit operations as
it saves masking operations.

If we never sort a list with more than 2^32 entries, it
makes no difference.

If we use a 32-bit count and we *do* sort a list with more than
2^32 entries, then it still sorts, but the performance degrades to
O((n/2^32)^2).

Just how often do we expect the kernel to face lists that long?
(Note that the old code was O((n/2^20)^2).)

In the code, I could do something like

#ifdef CONFIG_X86_64
/* Comment explaining why */
typedef uint32_t count_t;
#else
typedef size_t count_t;
#endif

...
	count_t count = 0;

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15  4:33     ` George Spelvin
@ 2019-03-15  8:20       ` Geert Uytterhoeven
  2019-03-15 10:23         ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-15  8:20 UTC (permalink / raw)
  To: George Spelvin
  Cc: Andy Shevchenko, Andrew Morton, Daniel Wagner, Dave Chinner,
	Don Mullis, Linux Kernel Mailing List, Rasmus Villemoes,
	Andrey Abramov

Hi George,

On Fri, Mar 15, 2019 at 5:33 AM George Spelvin <lkml@sdf.org> wrote:
> On Thu, 14 Mar 2019 at 11:10:41 +0200, Andy Shevchenko wrote:
> > On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
> >> +            for (bit = 1; count & bit; bit <<= 1) {
> >> +                    cur = merge(priv, (cmp_func)cmp, pending, cur);
> >> +                    pending = pending->prev;  /* Untouched by merge() */
> >>              }
> >
> > Wouldn't it be the same to
> >
> >       bit = ffz(count);
> >       while (bit--) {
> >               ...
> >       }
> > ?
> >
> > Though I dunno which one is generating better code.
>
> One question I should ask everyone: should "count" be 32 or 64 bits
> on 64-bit machines?  That would let x86 save a few REX bytes.  (815
> vs. 813 byte code, if anyone cares.)
>
>> Allegedly ARM can save a few pJ by gating the high 32
> bits of the ALU.
>
> Most other 64-bit processors would prefer 64-bit operations as
> it saves masking operations.
>
> If we never sort a list with more than 2^32 entries, it
> makes no difference.
>
> If we use a 32-bit count and we *do* sort a list with more than
> 2^32 entries, then it still sorts, but the performance degrades to
> O((n/2^32)^2).
>
> Just how often do we expect the kernel to face lists that long?
> (Note that the old code was O((n/2^20)^2).)

Using size_t sounds most logical to me (argument of least surprise).

> In the code, I could do something like
>
> #ifdef CONFIG_X86_64
> /* Comment explaining why */
> typedef uint32_t count_t;
> #else
> typedef size_t count_t;
> #endif
>
> ...
>         count_t count = 0;

Using different types makes it more complex, e.g. to print the value
in debug code.
And adding more typedefs is frowned upon.

Just my 0.02€.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 1/5] lib/sort: Make swap functions more generic
  2019-03-15  3:35                   ` George Spelvin
@ 2019-03-15  8:27                     ` Geert Uytterhoeven
  0 siblings, 0 replies; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-15  8:27 UTC (permalink / raw)
  To: George Spelvin
  Cc: Andy Shevchenko, Andrey Abramov, Andrew Morton, Daniel Wagner,
	Dave Chinner, Don Mullis, Linux Kernel Mailing List,
	Rasmus Villemoes

Hi George,

On Fri, Mar 15, 2019 at 4:36 AM George Spelvin <lkml@sdf.org> wrote:
> >> swap_bytes / swap_4byte_words / swap_8byte_words
> >> swap_bytes / swap_ints / swap_longs
> >> swap_1 / swap_4 / swap_8
> >> Pistols at dawn?
>
> On Thu, 14 Mar 2019 at 22:59:55 +0300, Andrey Abramov wrote:
> > Yes, in my opinion, swap_bytes / swap_ints / swap_longs are the
> > most readable because we have both swap_ints and swap_longs functions
> > (in one file near each other), so I don't think that there will be
> > any confusion about size.
>
> Yes, that's what I thought.  They're three related but different
> functions, suffixed _bytes, _ints, and _longs.  What could the
> difference possibly be?  And if anyone has any lingering doubts,
> the functions are right there, with exquisitely clear comments.
>
> Not to mention where they're used.  Is "is_aligned(base, size, 8)"
> remotely obscure?  Especially in context:
>
>                 if (is_aligned(base, size, 8))
>                         swap_func = swap_longs;
>                 else if (is_aligned(base, size, 4))
>                         swap_func = swap_ints;
>                 else
>                         swap_func = swap_bytes;
>
> What subtle and mysterious code.
>
> > But actually, it doesn't matter which name you take, because
> > the meaning of each, in my opinion, is obvious enough, so I don't
> > mind any of these options.
>
> I'm just amazed that this piece of bikeshedding is the most
> contentious thing about the patch series.
>
> I mean, if I'd named them:
>         llanfairpwllgwyngyll()
>         shravanabelagola()
>         zheleznodorozhny()
> or
>         peckish()
>         esuriant()
>         hungry()
> then yes, those would be bad names.
>
> I prefer the shorter _ints and _longs names, but this is just
> not a hill I want to die on.

Argument of least surprise: don't call something a duck if it's not
guaranteed to behave like a duck.

If I read "long", this triggers a warning flag in my head: be careful, this is
32-bit on 32-bit platforms, and 64-bit on 64-bit platforms.

There's a reason the newer I/O ioread{8,16,32} accessors use explicit
sizes, unlike the ancient x86-centric read[bwl]().

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15  8:20       ` Geert Uytterhoeven
@ 2019-03-15 10:23         ` George Spelvin
  2019-03-15 12:57           ` Geert Uytterhoeven
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-15 10:23 UTC (permalink / raw)
  To: geert, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	linux-kernel, linux, st5pub

On Fri, 15 Mar 2019 at 09:20:58 +0100, Geert Uytterhoeven wrote:
> On Fri, Mar 15, 2019 at 5:33 AM George Spelvin <lkml@sdf.org> wrote:
>> On Thu, 14 Mar 2019 at 11:10:41 +0200, Andy Shevchenko wrote:
>>> On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
>>>> +            for (bit = 1; count & bit; bit <<= 1) {
>>>> +                    cur = merge(priv, (cmp_func)cmp, pending, cur);
>>>> +                    pending = pending->prev;  /* Untouched by merge() */
>>>>              }
>>>
>>> Wouldn't it be the same to
>>>
>>>       bit = ffz(count);
>>>       while (bit--) {
>>>               ...
>>>       }
>>> ?
>>>
>>> Though I dunno which one is generating better code.
>>
>> One question I should ask everyone: should "count" be 32 or 64 bits
>> on 64-bit machines?  That would let x86 save a few REX bytes.  (815
>> vs. 813 byte code, if anyone cares.)
>>
>> Allegedly ARM can save a few pJ by gating the high 32
>> bits of the ALU.
>>
>> Most other 64-bit processors would prefer 64-bit operations as
>> it saves masking operations.
>>
>> If we never sort a list with more than 2^32 entries, it
>> makes no difference.
>>
>> If we use a 32-bit count and we *do* sort a list with more than
>> 2^32 entries, then it still sorts, but the performance degrades to
>> O((n/2^32)^2).
>>
>> Just how often do we expect the kernel to face lists that long?
>> (Note that the old code was O((n/2^20)^2).)
>
> Using size_t sounds most logical to me (argument of least surprise).

Yes, it is the obvious solution, which is why that's my default choice.

But a bit of thought shows that a list long enough to break a
32-bit implementation is beyond ridiculous.

The list must be at least 3 * 2^32 elements long to make the sort
merge non-optimally.  That's 1.5 * 2^37 bytes (192 GiB) of list_head
structures alone; at least double that for any practical application.
And 32 * 3 * 2^32 + (2 + 3) * 2^32 = 101 * 2^32 = 1.57 * 2^38
compares.

That seems like a lot, but that's not the bottleneck.  Each compare
reads from a new list element, and pretty soon, they'll miss all
caches and go to main memory.

Since the memory locations are random, for any small subset of the
list, you'll get only one element per cache line.  A 32 MiB L3
cache is 2^19 cache lines (assuming 64B lines).  So merge levels
20 through 33 will go to main memory.

That's (12 * 3 + 5) * 2^32 = 1.28 * 2^37 cache misses.  At 60 ns each (typical
local DRAM access time on i7 & Xeon according to Intel), that's a
hard minimum of 10565 seconds = 2h 56m 05s in one list_sort call.

This is definitely the scale of problem where a multithreaded sort is
called for.

It's *so* impossible that maybe it's worth trading that capability
for a couple of bytes in the inner loop.
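
(To sanity-check the arithmetic, here is a throwaway userspace sketch --
not kernel code, just an illustration -- that reproduces the figures
above.  The 16-byte list_head, the 3 * 2^32 element threshold, and the
60 ns miss penalty are the assumptions stated in the text, not
measurements.)

	#include <stdio.h>

	int main(void)
	{
		const double two32  = 4294967296.0;		/* 2^32 */
		const double nelem  = 3.0 * two32;		/* smallest non-optimal list */
		const double lhsize = 16.0;			/* sizeof(struct list_head) on 64-bit */
		const double misses = (12 * 3 + 5) * two32;	/* merge levels that miss L3 */
		const double tmiss  = 60e-9;			/* local DRAM access time, seconds */

		printf("list_head storage: %.0f GiB\n", nelem * lhsize / (1 << 30));
		printf("cache misses:      %.3g\n", misses);
		printf("time lower bound:  %.1f seconds\n", misses * tmiss);
		return 0;
	}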

>> In the code, I could do something like
>>
>> #ifdef CONFIG_X86_64
>> /* Comment explaining why */
>> typedef uint32_t count_t;
>> #else
>> typedef size_t count_t;
>> #endif
>>
>> ...
>>         count_t count = 0;
>
> Using different types makes it more complex, e.g. to print the value
> in debug code.
> And adding more typedefs is frowned upon.

It's a *local* typedef, purely for the purpose of moving #ifdef clutter
out of the function declaration.  I agree that *global* typedefs are
discouraged.

As for debugging, that's a red herring; it's easy to cast to (size_t).

> Just my 0.02€.

Thank you for considering the issue!

>> I prefer the shorter _ints and _longs names, but this is just
>> not a hill I want to die on.
>
> Argument of least surprise: don't call something a duck if it's not
> guaranteed to behave like a duck.
>
> If I read "long", this triggers a warning flag in my head: be careful,
> this is 32-bit on 32-bit platforms, and 64-bit on 64-bit platforms.

Good point.  And "_longlong" or "_llong" is a bit ugly, too.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 10:23         ` George Spelvin
@ 2019-03-15 12:57           ` Geert Uytterhoeven
  2019-03-15 16:59             ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-15 12:57 UTC (permalink / raw)
  To: George Spelvin
  Cc: Andrew Morton, Andy Shevchenko, Daniel Wagner, Dave Chinner,
	Don Mullis, Linux Kernel Mailing List, Rasmus Villemoes,
	Andrey Abramov

Hi George,

On Fri, Mar 15, 2019 at 11:23 AM George Spelvin <lkml@sdf.org> wrote:
> On Fri, 15 Mar 2019 at 09:20:58 +0100, Geert Uytterhoeven wrote:
> > On Fri, Mar 15, 2019 at 5:33 AM George Spelvin <lkml@sdf.org> wrote:
> >> On Thu, 14 Mar 2019 at 11:10:41 +0200, Andy Shevchenko wrote:
> >>> On Tue, Mar 05, 2019 at 03:06:44AM +0000, George Spelvin wrote:
> >>>> +            for (bit = 1; count & bit; bit <<= 1) {
> >>>> +                    cur = merge(priv, (cmp_func)cmp, pending, cur);
> >>>> +                    pending = pending->prev;  /* Untouched by merge() */
> >>>>              }
> >>>
> >>> Wouldn't it be the same to
> >>>
> >>>       bit = ffz(count);
> >>>       while (bit--) {
> >>>               ...
> >>>       }
> >>> ?
> >>>
> >>> Though I dunno which one is generating better code.
> >>
> >> One question I should ask everyone: should "count" be 32 or 64 bits
> >> on 64-bit machines?  That would let x86 save a few REX bytes.  (815
> >> vs. 813 byte code, if anyone cares.)
> >>
> >> Allegedly ARM can save a few pJ by gating the high 32
> >> bits of the ALU.
> >>
> >> Most other 64-bit processors would prefer 64-bit operations as
> >> it saves masking operations.
> >>
> >> If we never sort a list with more than 2^32 entries, it
> >> makes no difference.
> >>
> >> If we use a 32-bit count and we *do* sort a list with more than
> >> 2^32 entries, then it still sorts, but the performance degrades to
> >> O((n/2^32)^2).
> >>
> >> Just how often do we expect the kernel to face lists that long?
> >> (Note that the old code was O((n/2^20)^2).)
> >
> > Using size_t sounds most logical to me (argument of least surprise).
>
> Yes, it is the obvious solution, which is why that's my default choice.
>
> But a bit of thought shows that a list long enough to break a
> 32-bit implementation is beyond ridiculous.
>
> The list must be at least 3 * 2^32 elements long to make the sort
> merge non-optimally.  That's 1.5 * 2^37 bytes (192 GiB) of list_head
> structures alone; at least double that for any practical application.
> And 32 * 3 * 2^32 + (2 + 3) * 2^32 = 101 * 2^32 = 1.57 * 2^38
> compares.
>
> That seems like a lot, but that's not the bottleneck.  Each compare
> reads from a new list element, and pretty soon, they'll miss all
> caches and go to main memory.
>
> Since the memory locations are random, for any small subset of the
> list, you'll get only one element per cache line.  A 32 MiB L3
> cache is 2^19 cache lines (assuming 64B lines).  So merge levels
> 20 through 33 will go to main memory.
>
> That's (12 * 3 + 5) * 2^32 = 1.28 * 2^37 cache misses.  At 60 ns each (typical
> local DRAM access time on i7 & Xeon according to Intel), that's a
> hard minimum of 10565 seconds = 2h 56m 05s in one list_sort call.
>
> This is definitely the scale of problem where a multithreaded sort is
> called for.
>
> It's *so* impossible that maybe it's worth trading that capability
> for a couple of bytes in the inner loop.
>
> >> In the code, I could do something like
> >>
> >> #ifdef CONFIG_X86_64
> >> /* Comment explaining why */
> >> typedef uint32_t count_t;
> >> #else
> >> typedef size_t count_t;
> >> #endif

So just make it unsigned int, unconditionally.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 12:57           ` Geert Uytterhoeven
@ 2019-03-15 16:59             ` George Spelvin
  2019-03-15 17:47               ` Geert Uytterhoeven
  0 siblings, 1 reply; 44+ messages in thread
From: George Spelvin @ 2019-03-15 16:59 UTC (permalink / raw)
  To: geert, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	linux-kernel, linux, st5pub

On Fri, 15 Mar 2019 at 13:57:05 +0100, Geert Uytterhoeven wrote:
> On Fri, Mar 15, 2019 at 11:23 AM George Spelvin <lkml@sdf.org> wrote:
>> On Fri, 15 Mar 2019 at 09:20:58 +0100, Geert Uytterhoeven wrote:
>>> On Fri, Mar 15, 2019 at 5:33 AM George Spelvin <lkml@sdf.org> wrote:
>>>> One question I should ask everyone: should "count" be 32 or 64 bits
>>>> on 64-bit machines?  That would let x86 save a few REX bytes.  (815
>>>> vs. 813 byte code, if anyone cares.)
>>>>
> >>>> Allegedly ARM can save a few pJ by gating the high 32
>>>> bits of the ALU.
>>>>
>>>> Most other 64-bit processors would prefer 64-bit operations as
>>>> it saves masking operations.
> 
> So just make it unsigned int, unconditionally.

As I wrote originally (and quoted above), other 64-bit machines don't
have 32-bit operations and prefer 64-bit operations because they don't
require masking.  x86 (for historical compatibility) and ARM (for power
saving) are the exceptions that come to mind.

I'm trying to present the case to spur discussion, but it really is
a *question* I'm asking about whether to do that, not a suggestion
phrased as a question.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 16:59             ` George Spelvin
@ 2019-03-15 17:47               ` Geert Uytterhoeven
  2019-03-15 18:53                 ` Andrey Abramov
  0 siblings, 1 reply; 44+ messages in thread
From: Geert Uytterhoeven @ 2019-03-15 17:47 UTC (permalink / raw)
  To: George Spelvin
  Cc: Andrew Morton, Andy Shevchenko, Daniel Wagner, Dave Chinner,
	Don Mullis, Linux Kernel Mailing List, Rasmus Villemoes,
	Andrey Abramov

Hi George,

On Fri, Mar 15, 2019 at 5:59 PM George Spelvin <lkml@sdf.org> wrote:
> On Fri, 15 Mar 2019 at 13:57:05 +0100, Geert Uytterhoeven wrote:
> > On Fri, Mar 15, 2019 at 11:23 AM George Spelvin <lkml@sdf.org> wrote:
> >> On Fri, 15 Mar 2019 at 09:20:58 +0100, Geert Uytterhoeven wrote:
> >>> On Fri, Mar 15, 2019 at 5:33 AM George Spelvin <lkml@sdf.org> wrote:
> >>>> One question I should ask everyone: should "count" be 32 or 64 bits
> >>>> on 64-bit machines?  That would let x86 save a few REX bytes.  (815
> >>>> vs. 813 byte code, if anyone cares.)
> >>>>
> >>>> Allegedly ARM can save a few pJ by gating the high 32
> >>>> bits of the ALU.
> >>>>
> >>>> Most other 64-bit processors would prefer 64-bit operations as
> >>>> it saves masking operations.
> >
> > So just make it unsigned int, unconditionally.
>
> As I wrote originally (and quoted above), other 64-bit machines don't
> have 32-bit operations and prefer 64-bit operations because they don't
> require masking.  x86 (for historical compatibility) and ARM (for power
> saving) are the exceptions that come to mind.
>
> I'm trying to present the case to spur discussion, but it really is
> a *question* I'm asking about whether to do that, not a suggestion
> phrased as a question.

If it's just x86_64, use size_t everywhere, and let them suffer, for not
being real 64-bit ;-)

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 17:47               ` Geert Uytterhoeven
@ 2019-03-15 18:53                 ` Andrey Abramov
  2019-03-15 19:06                   ` Andy Shevchenko
  0 siblings, 1 reply; 44+ messages in thread
From: Andrey Abramov @ 2019-03-15 18:53 UTC (permalink / raw)
  To: Geert Uytterhoeven, George Spelvin
  Cc: Andrew Morton, Andy Shevchenko, Daniel Wagner, Dave Chinner,
	Don Mullis, Linux Kernel Mailing List, Rasmus Villemoes

> I'm trying to present the case to spur discussion, but it really is
> a *question* I'm asking about whether to do that, not a suggestion
> phrased as a question.

> If it's just x86_64, use size_t everywhere, and let them suffer, for not
> being real 64-bit ;-)

But what is the problem with a local typedef with a good and clear comment?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 18:53                 ` Andrey Abramov
@ 2019-03-15 19:06                   ` Andy Shevchenko
  2019-03-15 19:23                     ` Andrey Abramov
  0 siblings, 1 reply; 44+ messages in thread
From: Andy Shevchenko @ 2019-03-15 19:06 UTC (permalink / raw)
  To: Andrey Abramov
  Cc: Geert Uytterhoeven, George Spelvin, Andrew Morton, Daniel Wagner,
	Dave Chinner, Don Mullis, Linux Kernel Mailing List,
	Rasmus Villemoes

On Fri, Mar 15, 2019 at 09:53:07PM +0300, Andrey Abramov wrote:
> > I'm trying to present the case to spur discussion, but it really is
> > a *question* I'm asking about whether to do that, not a suggestion
> > phrased as a question.
> 
> > If it's just x86_64, use size_t everywhere, and let them suffer, for not
> > being real 64-bit ;-)
> 
> But what is the problem with a local typedef with a good and clear comment?

It makes the code harder to read and understand.  One needs to spend
deciseconds or seconds to grasp it instead of centiseconds.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 19:06                   ` Andy Shevchenko
@ 2019-03-15 19:23                     ` Andrey Abramov
  2019-03-15 19:56                       ` Andy Shevchenko
  0 siblings, 1 reply; 44+ messages in thread
From: Andrey Abramov @ 2019-03-15 19:23 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Geert Uytterhoeven, George Spelvin, Andrew Morton, Daniel Wagner,
	Dave Chinner, Don Mullis, Linux Kernel Mailing List,
	Rasmus Villemoes



15.03.2019, 22:06, "Andy Shevchenko" <andriy.shevchenko@linux.intel.com>:
> It makes the code harder to read and understand.  One needs to spend
> deciseconds or seconds to grasp it instead of centiseconds.

If it is true for someone (for example, for me it is almost as readable
as a simple size_t), then I agree with Geert who said: "use size_t everywhere".

With Best Regards, Andrey Abramov.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller
  2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
                   ` (4 preceding siblings ...)
  2019-03-05  5:58 ` [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
@ 2019-03-15 19:54 ` Andrey Abramov
  5 siblings, 0 replies; 44+ messages in thread
From: Andrey Abramov @ 2019-03-15 19:54 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: Andrew Morton, Geert Uytterhoeven, Daniel Wagner,
	Rasmus Villemoes, Don Mullis, Dave Chinner, Andy Shevchenko

I am pleased with this patch. Yes, there are still a couple of controversial
issues, but in my opinion they are mostly a matter of taste, and whatever
solutions are chosen, the patch is great.

So you may add me to the "Acked-by" section.

(And I mean the whole series, patches 1 through 5.)
With Best Regards, Andrey Abramov.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 19:23                     ` Andrey Abramov
@ 2019-03-15 19:56                       ` Andy Shevchenko
  2019-03-16  3:49                         ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Andy Shevchenko @ 2019-03-15 19:56 UTC (permalink / raw)
  To: Andrey Abramov
  Cc: Geert Uytterhoeven, George Spelvin, Andrew Morton, Daniel Wagner,
	Dave Chinner, Don Mullis, Linux Kernel Mailing List,
	Rasmus Villemoes

On Fri, Mar 15, 2019 at 10:23:53PM +0300, Andrey Abramov wrote:
> 15.03.2019, 22:06, "Andy Shevchenko" <andriy.shevchenko@linux.intel.com>:
> > It makes the code harder to read and understand.  One needs to spend
> > deciseconds or seconds to grasp it instead of centiseconds.
> 
> If it is true for someone (for example, for me it is almost as readable
> as a simple size_t), then I agree with Geert who said: "use size_t everywhere".

We have already spent too much time on bikeshedding here.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS
  2019-03-15 19:56                       ` Andy Shevchenko
@ 2019-03-16  3:49                         ` George Spelvin
  0 siblings, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-03-16  3:49 UTC (permalink / raw)
  To: andriy.shevchenko, st5pub
  Cc: akpm, daniel.wagner, dchinner, don.mullis, geert, linux-kernel,
	linux, lkml

Indeed, thanks to everyone who commented.  The extra conceptual
complexity and reduced readability are Just Not Worth It.

v2 (and final, as far as I'm concerned) follows.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-03-14  1:58     ` George Spelvin
@ 2019-06-21 23:12       ` Rasmus Villemoes
  2019-12-08  8:01         ` George Spelvin
  0 siblings, 1 reply; 44+ messages in thread
From: Rasmus Villemoes @ 2019-06-21 23:12 UTC (permalink / raw)
  To: George Spelvin, linux-kernel
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

On 14/03/2019 02.58, George Spelvin wrote:
> On Thu, 14 Mar 2019 at 00:28:16 +0100, Rasmus Villemoes wrote:

>> Similarly one could do a SORT_SIMPLE_CMP() for when sorting an array of
>> structs according to a single numeric member. That sort is not stable,
>> so the comparison functions would have to do a full -1,0,1 return, of
>> course, but we'd still avoid indirect calls.
> 
> Actually, while some sorts (notably fat-pivot quicksort) require
> a 3-way return to avoid O(n^2) performance, heapsort is just fine
> with a boolean return value. 

Hi George

So I tried starting to implement this, and the timing results look
promising. However, currently I'm doing

  (*(u32*)ka > *(u32*)kb) - (*(u32*)ka < *(u32*)kb);

in my do_cmp. Both existing invocations of the comparison function in
sort.c compare its return value >= 0, which is always true if I just
return the boolean (*(u32*)ka > *(u32*)kb). So it seems the algorithm
would have to be changed to allow the cmp function to return a bool.
Perhaps it's as simple as changing the two >= to >, but can I get you to
check that that would be ok? (Quick testing seems to suggest so, but
it's possible there are some corner cases where it would break.)

Thanks,
Rasmus

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function
  2019-06-21 23:12       ` Rasmus Villemoes
@ 2019-12-08  8:01         ` George Spelvin
  0 siblings, 0 replies; 44+ messages in thread
From: George Spelvin @ 2019-12-08  8:01 UTC (permalink / raw)
  To: linux-kernel, linux, lkml
  Cc: akpm, andriy.shevchenko, daniel.wagner, dchinner, don.mullis,
	geert, st5pub

On Sat, 22 Jun 2019 at 01:12:01, Rasmus Villemoes wrote:
> On 14/03/2019 02.58, George Spelvin wrote:
>> On Thu, 14 Mar 2019 at 00:28:16 +0100, Rasmus Villemoes wrote:
> 
>>> Similarly one could do a SORT_SIMPLE_CMP() for when sorting an array of
>>> structs according to a single numeric member. That sort is not stable,
>>> so the comparison functions would have to do a full -1,0,1 return, of
>>> course, but we'd still avoid indirect calls.
>> 
>> Actually, while some sorts (notably fat-pivot quicksort) require
>> a 3-way return to avoid O(n^2) performance, heapsort is just fine
>> with a boolean return value. 
> 
> Hi George
> 
> So I tried starting to implement this, and the timing results look
> promising. However, currently I'm doing
> 
>   (*(u32*)ka > *(u32*)kb) - (*(u32*)ka < *(u32*)kb);
> 
> in my do_cmp. Both existing invocations of the comparison function in
> sort.c compare its return value >= 0, which is always true if I just
> return the boolean (*(u32*)ka > *(u32*)kb). So it seems the algorithm
> would have to be changed to allow the cmp function to return a bool.
> Perhaps it's as simple as changing the two >= to >, but can I get you to
> check that that would be ok? (Quick testing seems to suggest so, but
> it's possible there are some corner cases where it would break.)

The only reason for >= vs. > is to do less work in the == case.

(I prefer to stay in the first backtracking loop, which does no
data motion, and thereby reduce the number of swaps performed in
the second part of the backtracking.)

If you want to preserve the logic exactly, you can replace
"do_cmp(a, b, ...) >= 0" with "do_cmp(b, a, ...) <= 0" and then
the conversion is straightforward.
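
(A minimal userspace illustration of that last point -- hypothetical
names, not the actual lib/sort.c code.  With a comparison that returns
only 0 or 1 meaning "a > b", the test cmp(a, b) >= 0 degenerates to
"always true", while cmp(b, a) <= 0 still means "a >= b".)

	#include <stdint.h>
	#include <stdio.h>

	/* Boolean-style comparison: returns 1 if a > b, else 0. */
	static int bool_cmp(const void *a, const void *b)
	{
		return *(const uint32_t *)a > *(const uint32_t *)b;
	}

	int main(void)
	{
		uint32_t x = 1, y = 2;

		/* Never negative, so ">= 0" is always true. */
		printf("cmp(x, y) >= 0 -> %d\n", bool_cmp(&x, &y) >= 0);
		/* Swapped arguments: 1 exactly when x >= y. */
		printf("cmp(y, x) <= 0 -> %d\n", bool_cmp(&y, &x) <= 0);
		return 0;
	}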

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread

Thread overview: 44+ messages
2019-03-09  2:17 [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller George Spelvin
2019-02-21  6:30 ` [PATCH 1/5] lib/sort: Make swap functions more generic George Spelvin
     [not found]   ` <20190309140653.GO9224@smile.fi.intel.com>
2019-03-09 15:53     ` lkml
2019-03-09 20:19       ` Andrey Abramov
2019-03-14  9:29       ` Andy Shevchenko
2019-03-14 10:09         ` George Spelvin
2019-03-14 10:41           ` Geert Uytterhoeven
2019-03-14 11:53             ` George Spelvin
2019-03-14 12:18               ` Andy Shevchenko
2019-03-14 19:59                 ` Andrey Abramov
2019-03-15  3:35                   ` George Spelvin
2019-03-15  8:27                     ` Geert Uytterhoeven
2019-03-14 10:11         ` George Spelvin
2019-03-09 21:02     ` George Spelvin
2019-03-13 21:23   ` Rasmus Villemoes
2019-03-13 22:02     ` Geert Uytterhoeven
2019-03-13 23:15     ` George Spelvin
2019-02-21  8:21 ` [PATCH 2/5] lib/sort: Use more efficient bottom-up heapsort variant George Spelvin
2019-03-13 22:29   ` Rasmus Villemoes
2019-03-14  0:03     ` George Spelvin
2019-03-14  0:15       ` Rasmus Villemoes
2019-02-21  8:21 ` [PATCH 3/5] lib/sort: Avoid indirect calls to built-in swap George Spelvin
2019-03-05  3:06 ` [PATCH 4/5] lib/list_sort: Simplify and remove MAX_LIST_LENGTH_BITS George Spelvin
2019-03-10 21:54   ` Rasmus Villemoes
2019-03-10 22:29     ` George Spelvin
2019-03-14  9:10   ` Andy Shevchenko
2019-03-14  9:41     ` George Spelvin
2019-03-15  4:33     ` George Spelvin
2019-03-15  8:20       ` Geert Uytterhoeven
2019-03-15 10:23         ` George Spelvin
2019-03-15 12:57           ` Geert Uytterhoeven
2019-03-15 16:59             ` George Spelvin
2019-03-15 17:47               ` Geert Uytterhoeven
2019-03-15 18:53                 ` Andrey Abramov
2019-03-15 19:06                   ` Andy Shevchenko
2019-03-15 19:23                     ` Andrey Abramov
2019-03-15 19:56                       ` Andy Shevchenko
2019-03-16  3:49                         ` George Spelvin
2019-03-05  5:58 ` [PATCH 5/5] lib/list_sort: Optimize number of calls to comparison function George Spelvin
2019-03-13 23:28   ` Rasmus Villemoes
2019-03-14  1:58     ` George Spelvin
2019-06-21 23:12       ` Rasmus Villemoes
2019-12-08  8:01         ` George Spelvin
2019-03-15 19:54 ` [PATCH 0/5] lib/sort & lib/list_sort: faster and smaller Andrey Abramov
