* [PATCH 1/1] Lib: sort.c: replaced heap sort algorithm with introspective sort
@ 2019-02-05 18:24 Andrey Abramov
  2019-02-05 20:15 ` Andrew Morton
  0 siblings, 1 reply; 2+ messages in thread
From: Andrey Abramov @ 2019-02-05 18:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, davem, Andrey Abramov

Replaced heap sort algorithm with faster introspective sort algorithm.

Signed-off-by: Andrey Abramov <st5pub@yandex.ru>
---
v1: The introspective sort algorithm is faster than heap sort (for example, on my machine, on 100MB of random data it was consistently almost twice as fast), and, unlike qsort, it doesn't have an O(n*n) worst case.
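
A minimal usage sketch of the sort() interface this patch modifies; the caller, array, and comparison function below are illustrative, not taken from the kernel tree:

	#include <linux/kernel.h>
	#include <linux/sort.h>

	/* sort an int array ascending; cmp_func returns <0, 0 or >0 */
	static int cmp_int(const void *a, const void *b)
	{
		return *(const int *)a - *(const int *)b;
	}

	static void example(void)
	{
		int vals[] = { 3, 1, 4, 1, 5 };

		/* NULL swap_func: sort() picks a size-appropriate swap helper */
		sort(vals, ARRAY_SIZE(vals), sizeof(vals[0]), cmp_int, NULL);
	}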

 lib/sort.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 112 insertions(+), 15 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..edc09287e572 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, recursive O(nlog n) sort for the Linux kernel
  *
  * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ * Feb 5  2019  Andrey Abramov <st5pub@yandex.ru> (introspective sort)
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -48,33 +49,22 @@ static void generic_swap(void *a, void *b, int size)
  * @num: number of elements
  * @size: size of each element
  * @cmp_func: pointer to comparison function
- * @swap_func: pointer to swap function or NULL
+ * @swap_func: pointer to swap function
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
-void sort(void *base, size_t num, size_t size,
+void heapsort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
 	/* heapify */
 	for ( ; i >= 0; i -= size) {
 		for (r = i; r * 2 + size < n; r  = c) {
@@ -103,4 +93,111 @@ void sort(void *base, size_t num, size_t size,
 	}
 }
 
+void introsort(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *),
+	  void (*swap_func)(void *, void *, int size),
+	  unsigned int max_depth, unsigned int depth)
+{
+
+	void *last = base + (num - 1) * size;
+	void *pivot = base + ((num - 1) / 2) * size;
+	void *i = base;
+	void *j = last;
+
+	if (num <= 1)
+		return;
+
+	/* switching to heapsort */
+	if (depth >= max_depth) {
+		heapsort(base, num, size, cmp_func, swap_func);
+		return;
+	}
+
+	/* making pivot be the median of middle, first and last elements */
+	if ((cmp_func(pivot, base) >= 0 && cmp_func(base, last) >= 0)
+		|| (cmp_func(last, base) >= 0 && cmp_func(base, pivot) >= 0)) {
+		pivot = base;
+	} else if ((cmp_func(pivot, last) >= 0 && cmp_func(last, base) >= 0)
+		|| (cmp_func(base, last) >= 0 && cmp_func(last, pivot) >= 0)) {
+		pivot = last;
+	}
+
+	/* split array */
+	while (true) {
+		while (cmp_func(i, pivot) < 0 && i < last)
+			i += size;
+		while (cmp_func(j, pivot) > 0 && j > base)
+			j -= size;
+
+		if (i >= j)
+			break;
+
+		swap_func(i, j, size);
+
+		if (i == pivot)
+			pivot = j;
+		else if (j == pivot)
+			pivot = i;
+
+		j -= size;
+		i += size;
+	}
+
+	/* continue for smaller parts */
+	if (i < last)
+		introsort(i, ((size_t)last - (size_t)i) / size + 1,
+			size, cmp_func, swap_func, max_depth, depth + 1);
+	if (base < j)
+		introsort(base, ((size_t)j - (size_t)base) / size + 1,
+			size, cmp_func, swap_func, max_depth, depth + 1);
+}
+
+unsigned int log2_up(size_t val)
+{
+	unsigned int log = 0;
+	size_t current = 1;
+
+	unsigned int max_reachable_log = sizeof(val) * 8 - 1;
+
+	while (current < val) {
+		current <<= 1;
+		log++;
+		if (log == max_reachable_log && current < val)
+			return max_reachable_log + 1;
+	}
+
+	return log;
+}
+
+
+/**
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ *
+ * This function does a introspective sort on the given array. You may provide a
+ * swap_func function optimized to your element type.
+ *
+ * The introspective sort use both qsort and heapsort,
+ * so it is faster than heapsort on average,
+ * but it doesn't have the worst case unlike qsort
+ */
+void sort(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *),
+	  void (*swap_func)(void *, void *, int size))
+{
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	introsort(base, num, size, cmp_func, swap_func, log2_up(num) * 2, 1);
+}
 EXPORT_SYMBOL(sort);
-- 
2.20.1
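
For readers tracing the recursion cutoff above: sort() passes max_depth = 2 * log2_up(num), i.e. twice the ceiling of log2(num), so for num = 1000 the quicksort recursion is abandoned at depth 20 and heapsort finishes that partition. A standalone userspace sketch of the same bound (not part of the patch):

	#include <stdio.h>
	#include <stddef.h>

	/* ceil(log2(val)) by repeated doubling, mirroring the patch's log2_up() */
	static unsigned int log2_up(size_t val)
	{
		unsigned int log = 0;
		size_t current = 1;

		while (current < val) {
			current <<= 1;
			log++;
		}
		return log;
	}

	int main(void)
	{
		printf("max_depth for num = 1000: %u\n", 2 * log2_up(1000)); /* 20 */
		return 0;
	}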


* Re: [PATCH 1/1] Lib: sort.c: replaced heap sort algorithm with introspective sort
  2019-02-05 18:24 [PATCH 1/1] Lib: sort.c: replaced heap sort algorithm with introspective sort Andrey Abramov
@ 2019-02-05 20:15 ` Andrew Morton
  0 siblings, 0 replies; 2+ messages in thread
From: Andrew Morton @ 2019-02-05 20:15 UTC (permalink / raw)
  To: Andrey Abramov; +Cc: linux-kernel, davem

On Tue,  5 Feb 2019 21:24:13 +0300 Andrey Abramov <st5pub@yandex.ru> wrote:

> Replaced heap sort algorithm with faster introspective sort algorithm.
> 
> Signed-off-by: Andrey Abramov <st5pub@yandex.ru>
> ---
> v1: The introspective sort algorithm is faster than heap sort (for example, on my machine, on 100MB of random data it was consistently almost twice as fast), and, unlike qsort, it doesn't have an O(n*n) worst case.

I guess faster is always better, but are there any known parts of the
kernel which actually care about the performance of sort()?

>
> ...
>
> --- a/lib/sort.c
> +++ b/lib/sort.c
> @@ -1,8 +1,9 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /*
> - * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
> + * A fast, recursive O(nlog n) sort for the Linux kernel

oops.  A good analysis of the maximum stack utilization is needed,
please.

>   * Jan 23 2005  Matt Mackall <mpm@selenic.com>
> + * Feb 5  2019  Andrey Abramov <st5pub@yandex.ru> (introspective sort)
>   */
>  
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> @@ -48,33 +49,22 @@ static void generic_swap(void *a, void *b, int size)
>   * @num: number of elements
>   * @size: size of each element
>   * @cmp_func: pointer to comparison function
> - * @swap_func: pointer to swap function or NULL
> + * @swap_func: pointer to swap function
>   *
> - * This function does a heapsort on the given array. You may provide a
> - * swap_func function optimized to your element type.
> + * This function does a heapsort on the given array.
>   *
>   * Sorting time is O(n log n) both on average and worst-case. While
>   * qsort is about 20% faster on average, it suffers from exploitable
>   * O(n*n) worst-case behavior and extra memory requirements that make
>   * it less suitable for kernel use.
>   */
> -
> -void sort(void *base, size_t num, size_t size,
> +void heapsort(void *base, size_t num, size_t size,

static?

>  	  int (*cmp_func)(const void *, const void *),
>  	  void (*swap_func)(void *, void *, int size))
>  {
>  	/* pre-scale counters for performance */
>  	int i = (num/2 - 1) * size, n = num * size, c, r;
>  
> -	if (!swap_func) {
> -		if (size == 4 && alignment_ok(base, 4))
> -			swap_func = u32_swap;
> -		else if (size == 8 && alignment_ok(base, 8))
> -			swap_func = u64_swap;
> -		else
> -			swap_func = generic_swap;
> -	}
> -
>  	/* heapify */
>  	for ( ; i >= 0; i -= size) {
>  		for (r = i; r * 2 + size < n; r  = c) {
> @@ -103,4 +93,111 @@ void sort(void *base, size_t num, size_t size,
>  	}
>  }
>  
> +void introsort(void *base, size_t num, size_t size,

static?

> +	  int (*cmp_func)(const void *, const void *),
> +	  void (*swap_func)(void *, void *, int size),
> +	  unsigned int max_depth, unsigned int depth)
> +{
> +
> +	void *last = base + (num - 1) * size;
> +	void *pivot = base + ((num - 1) / 2) * size;
> +	void *i = base;
> +	void *j = last;

That's about 100 bytes per invocation.

> +	if (num <= 1)
> +		return;
> +
> +	/* switching to heapsort */

Please explain the reason for this in the comment.

> +	if (depth >= max_depth) {
> +		heapsort(base, num, size, cmp_func, swap_func);

Ditto.

> +		return;
> +	}
> +
> +	/* making pivot be the median of middle, first and last elements */

"Make"

(and why?)

> +	if ((cmp_func(pivot, base) >= 0 && cmp_func(base, last) >= 0)
> +		|| (cmp_func(last, base) >= 0 && cmp_func(base, pivot) >= 0)) {
> +		pivot = base;
> +	} else if ((cmp_func(pivot, last) >= 0 && cmp_func(last, base) >= 0)
> +		|| (cmp_func(base, last) >= 0 && cmp_func(last, pivot) >= 0)) {
> +		pivot = last;
> +	}
> +
> +	/* split array */
> +	while (true) {
> +		while (cmp_func(i, pivot) < 0 && i < last)
> +			i += size;
> +		while (cmp_func(j, pivot) > 0 && j > base)
> +			j -= size;
> +
> +		if (i >= j)
> +			break;
> +
> +		swap_func(i, j, size);
> +
> +		if (i == pivot)
> +			pivot = j;
> +		else if (j == pivot)
> +			pivot = i;
> +
> +		j -= size;
> +		i += size;
> +	}
> +
> +	/* continue for smaller parts */
> +	if (i < last)
> +		introsort(i, ((size_t)last - (size_t)i) / size + 1,
> +			size, cmp_func, swap_func, max_depth, depth + 1);
> +	if (base < j)
> +		introsort(base, ((size_t)j - (size_t)base) / size + 1,
> +			size, cmp_func, swap_func, max_depth, depth + 1);
> +}
> +
> +unsigned int log2_up(size_t val)

static?

> +{
> +	unsigned int log = 0;
> +	size_t current = 1;
> +
> +	unsigned int max_reachable_log = sizeof(val) * 8 - 1;
> +
> +	while (current < val) {
> +		current <<= 1;
> +		log++;
> +		if (log == max_reachable_log && current < val)
> +			return max_reachable_log + 1;
> +	}
> +
> +	return log;
> +}

Please add a comment explaining what this does.  Partly so that readers
(such as I) can understand why it can't utilize the existing log2().

> +
> +/**
> + * sort - sort an array of elements
> + * @base: pointer to data to sort
> + * @num: number of elements
> + * @size: size of each element
> + * @cmp_func: pointer to comparison function
> + * @swap_func: pointer to swap function or NULL
> + *
> + * This function does a introspective sort on the given array. You may provide a
> + * swap_func function optimized to your element type.
> + *
> + * The introspective sort use both qsort and heapsort,
> + * so it is faster than heapsort on average,

There's a non-ascii character after "heapsort".

> + * but it doesn't have the worst case unlike qsort

Unclear.  "like qsort"?

> + */
> +void sort(void *base, size_t num, size_t size,
> +	  int (*cmp_func)(const void *, const void *),
> +	  void (*swap_func)(void *, void *, int size))
> +{
> +	if (!swap_func) {
> +		if (size == 4 && alignment_ok(base, 4))
> +			swap_func = u32_swap;
> +		else if (size == 8 && alignment_ok(base, 8))
> +			swap_func = u64_swap;
> +		else
> +			swap_func = generic_swap;
> +	}
> +
> +	introsort(base, num, size, cmp_func, swap_func, log2_up(num) * 2, 1);
> +}
>  EXPORT_SYMBOL(sort);


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-02-05 20:15 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-05 18:24 [PATCH 1/1] Lib: sort.c: replaced heap sort algorithm with introspective sort Andrey Abramov
2019-02-05 20:15 ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.