* [PATCH v3] lib: optimize cpumask_next_and()
@ 2017-10-26 12:18 Clement Courbet
From: Clement Courbet @ 2017-10-26 12:18 UTC (permalink / raw)
  To: Arnd Bergmann, Rasmus Villemoes, Andrew Morton, Matthew Wilcox,
	Yury Norov
  Cc: Clement Courbet, Ingo Molnar, linux-arch, linux-kernel

We've measured that we spend ~0.6% of sys CPU time in cpumask_next_and().
It's essentially a joined iteration in search of a non-zero bit, which
is currently implemented as a lookup join: find a non-zero bit on the
lhs, then look up the rhs to see whether that bit is set there too.

Implement a direct join instead: find a non-zero bit on the
incrementally built join of the two masks. Direct benchmarking shows
that it's 1.17x to 14x faster, with a geometric mean of 2.1, on 32
CPUs. No impact on memory usage.
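
For illustration, a minimal userspace sketch of the direct join (the
helper name and shape are made up for this example; the real change is
the _find_next_bit() refactoring in the diff below):

```
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Find the first bit >= start that is set in (a & b), or nbits if
 * none. The join is built one word at a time, so a word that ANDs to
 * zero skips BITS_PER_LONG bits at once instead of probing per bit. */
static unsigned long next_and_bit(const unsigned long *a,
				  const unsigned long *b,
				  unsigned long nbits, unsigned long start)
{
	while (start < nbits) {
		unsigned long word = a[start / BITS_PER_LONG] &
				     b[start / BITS_PER_LONG];

		/* Mask off bits below 'start' in the first word. */
		word &= ~0UL << (start % BITS_PER_LONG);
		if (word) {
			start = start - start % BITS_PER_LONG +
				__builtin_ctzl(word);
			return start < nbits ? start : nbits;
		}
		start = start - start % BITS_PER_LONG + BITS_PER_LONG;
	}
	return nbits;
}
```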

Approximate benchmark code:

```
  unsigned long src1p[nr_cpumask_longs] = {pattern1};
  unsigned long src2p[nr_cpumask_longs] = {pattern2};
  for (/*a bunch of repetitions*/) {
    for (int n = -1; n <= nr_cpu_ids; ++n) {
      asm volatile("" : "+rm"(src1p)); // prevent any optimization
      asm volatile("" : "+rm"(src2p));
      unsigned long result = cpumask_next_and(n, src1p, src2p);
      asm volatile("" : "+rm"(result));
    }
  }
```

Results:
pattern1    pattern2     time_before/time_after
0x0000ffff  0x0000ffff   1.65
0x0000ffff  0x00005555   2.24
0x0000ffff  0x00001111   2.94
0x0000ffff  0x00000000   14.0
0x00005555  0x0000ffff   1.67
0x00005555  0x00005555   1.71
0x00005555  0x00001111   1.90
0x00005555  0x00000000   6.58
0x00001111  0x0000ffff   1.46
0x00001111  0x00005555   1.49
0x00001111  0x00001111   1.45
0x00001111  0x00000000   3.10
0x00000000  0x0000ffff   1.18
0x00000000  0x00005555   1.18
0x00000000  0x00001111   1.17
0x00000000  0x00000000   1.25
-----------------------------
               geo.mean  2.06
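
(For reference, the geometric mean is taken over the 16 ratios above:
(1.65 * 2.24 * ... * 1.25)^(1/16) ≈ 2.06.)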

Signed-off-by: Clement Courbet <courbet@google.com>
---
Changes in v2:
 - Refactored _find_next_common_bit into _find_next_bit, as suggested
   by Yury Norov. This has no adverse effects on the performance side,
   as the compiler successfully inlines the code.

Changes in v3:
 - Fix find_next_and_bit() declaration.
 - Synchronize _find_next_bit_le() with _find_next_bit().
 - Synchronize the code in tools/lib/find_bit.c.
 - Add find_next_and_bit to the guard code.
 - Fix invert value (bad sync with our internal tree, on which I'm
   doing the testing).

 include/asm-generic/bitops/find.h       | 16 ++++++++++
 include/linux/bitmap.h                  |  2 ++
 lib/cpumask.c                           |  9 +++---
 lib/find_bit.c                          | 55 ++++++++++++++++++++++++---------
 tools/include/asm-generic/bitops/find.h | 16 ++++++++++
 tools/lib/find_bit.c                    | 40 ++++++++++++++++++------
 6 files changed, 110 insertions(+), 28 deletions(-)

diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 998d4d544f18..130962f3a264 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
 		size, unsigned long offset);
 #endif
 
+#ifndef find_next_and_bit
+/**
+ * find_next_and_bit - find the next set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit.
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset);
+#endif
+
 #ifndef find_next_zero_bit
 /**
  * find_next_zero_bit - find the next cleared bit in a memory region
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 700cf5f67118..b4606bfda52f 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -77,6 +77,8 @@
  * find_first_bit(addr, nbits)		Position first set bit in *addr
  * find_next_zero_bit(addr, nbits, bit)	Position next zero bit in *addr >= bit
  * find_next_bit(addr, nbits, bit)	Position next set bit in *addr >= bit
+ * find_next_and_bit(addr1, addr2, nbits, bit)	Same as find_next_bit, but in
+ *						(*addr1 & *addr2)
  */
 
 /*
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8b1a1bd77539..5602223837fa 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
 int cpumask_next_and(int n, const struct cpumask *src1p,
 		     const struct cpumask *src2p)
 {
-	while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
-		if (cpumask_test_cpu(n, src2p))
-			break;
-	return n;
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
+		nr_cpumask_bits, n + 1);
 }
 EXPORT_SYMBOL(cpumask_next_and);
 
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 6ed74f78380c..dfcea66b84f4 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -21,22 +21,29 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 
-#if !defined(find_next_bit) || !defined(find_next_zero_bit)
+#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
+		!defined(find_next_and_bit)
 
 /*
- * This is a common helper function for find_next_bit and
- * find_next_zero_bit.  The difference is the "invert" argument, which
- * is XORed with each fetched word before searching it for one bits.
+ * This is a common helper function for find_next_bit, find_next_zero_bit, and
+ * find_next_and_bit. The differences are:
+ *  - The "invert" argument, which is XORed with each fetched word before
+ *    searching it for one bits.
+ *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static unsigned long _find_next_bit(const unsigned long *addr,
-		unsigned long nbits, unsigned long start, unsigned long invert)
+static unsigned long _find_next_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert)
 {
 	unsigned long tmp;
 
 	if (unlikely(start >= nbits))
 		return nbits;
 
-	tmp = addr[start / BITS_PER_LONG] ^ invert;
+	tmp = addr1[start / BITS_PER_LONG];
+	if (addr2)
+		tmp &= addr2[start / BITS_PER_LONG];
+	tmp ^= invert;
 
 	/* Handle 1st word. */
 	tmp &= BITMAP_FIRST_WORD_MASK(start);
@@ -47,7 +54,10 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 		if (start >= nbits)
 			return nbits;
 
-		tmp = addr[start / BITS_PER_LONG] ^ invert;
+		tmp = addr1[start / BITS_PER_LONG];
+		if (addr2)
+			tmp &= addr2[start / BITS_PER_LONG];
+		tmp ^= invert;
 	}
 
 	return min(start + __ffs(tmp), nbits);
@@ -61,7 +71,7 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 			    unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, 0UL);
+	return _find_next_bit(addr, NULL, size, offset, 0UL);
 }
 EXPORT_SYMBOL(find_next_bit);
 #endif
@@ -70,11 +80,21 @@ EXPORT_SYMBOL(find_next_bit);
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, ~0UL);
+	return _find_next_bit(addr, NULL, size, offset, ~0UL);
 }
 EXPORT_SYMBOL(find_next_zero_bit);
 #endif
 
+#if !defined(find_next_and_bit)
+unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	return _find_next_bit(addr1, addr2, size, offset, 0UL);
+}
+EXPORT_SYMBOL(find_next_and_bit);
+#endif
+
 #ifndef find_first_bit
 /*
  * Find the first set bit in a memory region.
@@ -146,15 +166,19 @@ static inline unsigned long ext2_swab(const unsigned long y)
 }
 
 #if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
-static unsigned long _find_next_bit_le(const unsigned long *addr,
-		unsigned long nbits, unsigned long start, unsigned long invert)
+static unsigned long _find_next_bit_le(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert)
 {
 	unsigned long tmp;
 
 	if (unlikely(start >= nbits))
 		return nbits;
 
-	tmp = addr[start / BITS_PER_LONG] ^ invert;
+	tmp = addr1[start / BITS_PER_LONG];
+	if (addr2)
+		tmp &= addr2[start / BITS_PER_LONG];
+	tmp ^= invert;
 
 	/* Handle 1st word. */
 	tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start));
@@ -165,7 +189,10 @@ static unsigned long _find_next_bit_le(const unsigned long *addr,
 		if (start >= nbits)
 			return nbits;
 
-		tmp = addr[start / BITS_PER_LONG] ^ invert;
+		tmp = addr1[start / BITS_PER_LONG];
+		if (addr2)
+			tmp &= addr2[start / BITS_PER_LONG];
+		tmp ^= invert;
 	}
 
 	return min(start + __ffs(ext2_swab(tmp)), nbits);
diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h
index 5538ecdc964a..435c7d002f33 100644
--- a/tools/include/asm-generic/bitops/find.h
+++ b/tools/include/asm-generic/bitops/find.h
@@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
 		size, unsigned long offset);
 #endif
 
+#ifndef find_next_and_bit
+/**
+ * find_next_and_bit - find the next set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit.
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset);
+#endif
+
 #ifndef find_next_zero_bit
 
 /**
diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c
index 42c15f906aac..e4af79229ab4 100644
--- a/tools/lib/find_bit.c
+++ b/tools/lib/find_bit.c
@@ -22,22 +22,29 @@
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
 
-#if !defined(find_next_bit)
+#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
+		!defined(find_next_and_bit)
 
 /*
- * This is a common helper function for find_next_bit and
- * find_next_zero_bit.  The difference is the "invert" argument, which
- * is XORed with each fetched word before searching it for one bits.
+ * This is a common helper function for find_next_bit, find_next_zero_bit, and
+ * find_next_and_bit. The differences are:
+ *  - The "invert" argument, which is XORed with each fetched word before
+ *    searching it for one bits.
+ *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static unsigned long _find_next_bit(const unsigned long *addr,
-		unsigned long nbits, unsigned long start, unsigned long invert)
+static unsigned long _find_next_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert)
 {
 	unsigned long tmp;
 
 	if (unlikely(start >= nbits))
 		return nbits;
 
-	tmp = addr[start / BITS_PER_LONG] ^ invert;
+	tmp = addr1[start / BITS_PER_LONG];
+	if (addr2)
+		tmp &= addr2[start / BITS_PER_LONG];
+	tmp ^= invert;
 
 	/* Handle 1st word. */
 	tmp &= BITMAP_FIRST_WORD_MASK(start);
@@ -48,7 +55,10 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 		if (start >= nbits)
 			return nbits;
 
-		tmp = addr[start / BITS_PER_LONG] ^ invert;
+		tmp = addr1[start / BITS_PER_LONG];
+		if (addr2)
+			tmp &= addr2[start / BITS_PER_LONG];
+		tmp ^= invert;
 	}
 
 	return min(start + __ffs(tmp), nbits);
@@ -62,7 +72,7 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 			    unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, 0UL);
+	return _find_next_bit(addr, NULL, size, offset, 0UL);
 }
 #endif
 
@@ -104,6 +114,16 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, ~0UL);
+	return _find_next_bit(addr, NULL, size, offset, ~0UL);
 }
 #endif
+
+#ifndef find_next_and_bit
+unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	return _find_next_bit(addr1, addr2, size, offset, 0UL);
+}
+#endif
+
-- 
2.15.0.rc2.357.g7e34df9404-goog


* Re: [PATCH v3] lib: optimize cpumask_next_and()
  2017-10-26 12:58 Alexey Dobriyan
@ 2017-10-28 13:24 ` Yury Norov
From: Yury Norov @ 2017-10-28 13:24 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: courbet, arnd, linux, akpm, mawilcox, mingo, linux-arch, linux-kernel

On Thu, Oct 26, 2017 at 02:58:00PM +0200, Alexey Dobriyan wrote:
> >  - Refactored _find_next_common_bit into _find_next_bit, as suggested
> >    by Yury Norov. This has no adverse effects on the performance side,
> >    as the compiler successfully inlines the code.
> 
> 1)
> Gentoo ships gcc 5.4.0, which doesn't inline this code on x86_64 defconfig
> (which has OPTIMIZE_INLINING).
> 
> 
> ffffffff813556c0 <find_next_bit>:
> ffffffff813556c0:       55                      push   rbp
> ffffffff813556c1:       48 89 d1                mov    rcx,rdx
> ffffffff813556c4:       45 31 c0                xor    r8d,r8d
> ffffffff813556c7:       48 89 f2                mov    rdx,rsi
> ffffffff813556ca:       31 f6                   xor    esi,esi
> ffffffff813556cc:       48 89 e5                mov    rbp,rsp
> ffffffff813556cf:       e8 7c ff ff ff          call   ffffffff81355650 <_find_next_bit>
> ffffffff813556d4:       5d                      pop    rbp
> ffffffff813556d5:       c3                      ret

GCC 7 for ARM64 doesn't inline it either. I wrote a test to measure
the effect of inlining:
http://www.spinics.net/lists/kernel/msg2635338.html
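
For context, each line in the results below comes from a loop of
roughly this shape (a simplified sketch of the linked test, assuming a
pre-filled bitmap; not a verbatim copy):

```
#include <linux/bitmap.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/timex.h>	/* get_cycles() */

#define BITMAP_LEN	(4096UL * 8 * 10)

static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata;

static int __init test_find_next_bit_sketch(void)
{
	unsigned long i, cnt;
	cycles_t cycles;

	/* Walk every set bit, counting iterations and elapsed cycles. */
	cycles = get_cycles();
	for (cnt = i = 0; i < BITMAP_LEN; cnt++)
		i = find_next_bit(bitmap, BITMAP_LEN, i) + 1;
	cycles = get_cycles() - cycles;
	pr_err("find_next_bit: %ld cycles, %ld iterations\n",
	       (long)cycles, (long)cnt);
	return 0;
}
```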

The performance impact of this patch without inlining: 

Before:
[   96.856195] Start testing find_bit() with random-filled bitmap
[   96.868322] find_next_bit: 34529 cycles, 16304 iterations
[   96.879525] find_next_zero_bit: 35771 cycles, 16465 iterations
[   96.891409] find_last_bit: 17444 cycles, 16304 iterations
[   96.914445] find_first_bit: 1219671 cycles, 16305 iterations
[   96.925802] Start testing find_bit() with sparse bitmap
[   96.936308] find_next_bit: 301 cycles, 66 iterations
[   96.946981] find_next_zero_bit: 70897 cycles, 32703 iterations
[   96.958694] find_last_bit: 286 cycles, 66 iterations
[   96.968710] find_first_bit: 5260 cycles, 66 iterations

After:
[  116.205594] Start testing find_bit() with random-filled bitmap
[  116.217621] find_next_bit: 24979 cycles, 16449 iterations
[  116.228719] find_next_zero_bit: 25666 cycles, 16320 iterations
[  116.240620] find_last_bit: 19407 cycles, 16449 iterations
[  116.268368] find_first_bit: 1690945 cycles, 16449 iterations
[  116.279718] Start testing find_bit() with sparse bitmap
[  116.290219] find_next_bit: 352 cycles, 66 iterations
[  116.300692] find_next_zero_bit: 50916 cycles, 32703 iterations
[  116.312400] find_last_bit: 295 cycles, 66 iterations
[  116.322427] find_first_bit: 6742 cycles, 66 iterations

And with inlining:

Before:
[  169.464229] Start testing find_bit() with random-filled bitmap
[  169.476191] find_next_bit: 17520 cycles, 16336 iterations
[  169.487210] find_next_zero_bit: 17622 cycles, 16433 iterations
[  169.499111] find_last_bit: 19272 cycles, 16335 iterations
[  169.519735] find_first_bit: 978657 cycles, 16337 iterations
[  169.530912] Start testing find_bit() with sparse bitmap
[  169.541414] find_next_bit: 252 cycles, 66 iterations
[  169.551726] find_next_zero_bit: 34554 cycles, 32703 iterations
[  169.563436] find_last_bit: 294 cycles, 66 iterations
[  169.573439] find_first_bit: 3964 cycles, 66 iterations

After:
[  191.191170] Start testing find_bit() with random-filled bitmap
[  191.203133] find_next_bit: 17530 cycles, 16346 iterations
[  191.214150] find_next_zero_bit: 17630 cycles, 16423 iterations
[  191.226037] find_last_bit: 17489 cycles, 16347 iterations
[  191.246672] find_first_bit: 979961 cycles, 16347 iterations
[  191.257849] Start testing find_bit() with sparse bitmap
[  191.268351] find_next_bit: 257 cycles, 66 iterations
[  191.278660] find_next_zero_bit: 34547 cycles, 32703 iterations
[  191.290370] find_last_bit: 292 cycles, 66 iterations
[  191.300376] find_first_bit: 4269 cycles, 66 iterations

I didn't investigate why the non-inlined version of this patch is
faster than the vanilla code, but the inlined one is faster still, and
is as fast as the inlined version of the existing code. I think we
should finally go with it.

It would be great if someone tested it on x86.

> 2)
> Making the "and" operation the centerpiece of this code is kind of meh;
> find_next_or_bit() will be hard to implement.

Not so hard actually. :)
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1521775.html
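
For a taste of how the helper generalizes, here is a hypothetical
sketch of the word-fetch step with an "or" knob added (an assumption
for illustration, not necessarily how the linked patch does it):

```
#include <stdbool.h>

/* Hypothetical: fetch one combined word. 'use_or' selects | over &;
 * 'invert' keeps the find_next_zero_bit trick working unchanged. */
static inline unsigned long fetch_word(const unsigned long *addr1,
				       const unsigned long *addr2,
				       unsigned long idx,
				       unsigned long invert, bool use_or)
{
	unsigned long tmp = addr1[idx];

	if (addr2)
		tmp = use_or ? tmp | addr2[idx] : tmp & addr2[idx];
	return tmp ^ invert;
}
```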

Yury


* Re: [PATCH v3] lib: optimize cpumask_next_and()
@ 2017-10-26 12:58 Alexey Dobriyan
  2017-10-28 13:24 ` Yury Norov
From: Alexey Dobriyan @ 2017-10-26 12:58 UTC (permalink / raw)
  To: courbet
  Cc: arnd, linux, akpm, mawilcox, ynorov, mingo, linux-arch, linux-kernel

>  - Refactored _find_next_common_bit into _find_next_bit, as suggested
>    by Yury Norov. This has no adverse effects on the performance side,
>    as the compiler successfully inlines the code.

1)
Gentoo ships gcc 5.4.0, which doesn't inline this code on x86_64 defconfig
(which has OPTIMIZE_INLINING).


ffffffff813556c0 <find_next_bit>:
ffffffff813556c0:       55                      push   rbp
ffffffff813556c1:       48 89 d1                mov    rcx,rdx
ffffffff813556c4:       45 31 c0                xor    r8d,r8d
ffffffff813556c7:       48 89 f2                mov    rdx,rsi
ffffffff813556ca:       31 f6                   xor    esi,esi
ffffffff813556cc:       48 89 e5                mov    rbp,rsp
ffffffff813556cf:       e8 7c ff ff ff          call   ffffffff81355650 <_find_next_bit>
ffffffff813556d4:       5d                      pop    rbp
ffffffff813556d5:       c3                      ret

2)
Making the "and" operation the centerpiece of this code is kind of meh;
find_next_or_bit() will be hard to implement.

