All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] perf bench: Add benchmark of find_next_bit
@ 2020-07-24  7:19 Ian Rogers
  2020-07-24 14:45 ` Andi Kleen
  2020-07-28 11:51 ` Arnaldo Carvalho de Melo
  0 siblings, 2 replies; 7+ messages in thread
From: Ian Rogers @ 2020-07-24  7:19 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Thomas Gleixner, Andi Kleen, linux-kernel
  Cc: Stephane Eranian, Ian Rogers

for_each_set_bit, or similar functions like for_each_cpu, may be hot
within the kernel. If many bits were set then one could imagine on
Intel a "bt" instruction with every bit may be faster than the function
call and word length find_next_bit logic. Add a benchmark to measure
this.

This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
option except for very small bitmaps.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/bench/Build            |   1 +
 tools/perf/bench/bench.h          |   1 +
 tools/perf/bench/find-bit-bench.c | 135 ++++++++++++++++++++++++++++++
 tools/perf/builtin-bench.c        |   1 +
 4 files changed, 138 insertions(+)
 create mode 100644 tools/perf/bench/find-bit-bench.c

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 768e408757a0..fb114bca3a8d 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -10,6 +10,7 @@ perf-y += epoll-wait.o
 perf-y += epoll-ctl.o
 perf-y += synthesize.o
 perf-y += kallsyms-parse.o
+perf-y += find-bit-bench.o
 
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 61cae4966cae..3291b0ddddfe 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
 int bench_sched_pipe(int argc, const char **argv);
 int bench_mem_memcpy(int argc, const char **argv);
 int bench_mem_memset(int argc, const char **argv);
+int bench_mem_find_bit(int argc, const char **argv);
 int bench_futex_hash(int argc, const char **argv);
 int bench_futex_wake(int argc, const char **argv);
 int bench_futex_wake_parallel(int argc, const char **argv);
diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c
new file mode 100644
index 000000000000..1aadbd9d7e86
--- /dev/null
+++ b/tools/perf/bench/find-bit-bench.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark find_next_bit and related bit operations.
+ *
+ * Copyright 2020 Google LLC.
+ */
+#include <stdlib.h>
+#include "bench.h"
+#include "../util/stat.h"
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+
+static unsigned int outer_iterations = 5;
+static unsigned int inner_iterations = 100000;
+
+static const struct option options[] = {
+	OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
+		"Number of outerer iterations used"),
+	OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
+		"Number of outerer iterations used"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = {
+	"perf bench mem find_bit <options>",
+	NULL
+};
+
+static unsigned int accumulator;
+static unsigned int use_of_val;
+
+static noinline void workload(int val)
+{
+	use_of_val += val;
+	accumulator++;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+static bool asm_test_bit(long nr, const unsigned long *addr)
+{
+	bool oldbit;
+
+	asm volatile("bt %2,%1"
+		     : "=@ccc" (oldbit)
+		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+#else
+#define asm_test_bit test_bit
+#endif
+
+static int do_for_each_set_bit(unsigned int num_bits)
+{
+	unsigned long *to_test = bitmap_alloc(num_bits);
+	struct timeval start, end, diff;
+	u64 runtime_us;
+	struct stats fb_time_stats, tb_time_stats;
+	double time_average, time_stddev;
+	unsigned int bit, i, j;
+	unsigned int set_bits, skip;
+	unsigned int old;
+
+	init_stats(&fb_time_stats);
+	init_stats(&tb_time_stats);
+
+	for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
+		bitmap_zero(to_test, num_bits);
+		skip = num_bits / set_bits;
+		for (i = 0; i < num_bits; i += skip)
+			set_bit(i, to_test);
+
+		for (i = 0; i < outer_iterations; i++) {
+			old = accumulator;
+			gettimeofday(&start, NULL);
+			for (j = 0; j < inner_iterations; j++) {
+				for_each_set_bit(bit, to_test, num_bits)
+					workload(bit);
+			}
+			gettimeofday(&end, NULL);
+			assert(old + (inner_iterations * set_bits) == accumulator);
+			timersub(&end, &start, &diff);
+			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+			update_stats(&fb_time_stats, runtime_us);
+
+			old = accumulator;
+			gettimeofday(&start, NULL);
+			for (j = 0; j < inner_iterations; j++) {
+				for (bit = 0; bit < num_bits; bit++) {
+					if (asm_test_bit(bit, to_test))
+						workload(bit);
+				}
+			}
+			gettimeofday(&end, NULL);
+			assert(old + (inner_iterations * set_bits) == accumulator);
+			timersub(&end, &start, &diff);
+			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+			update_stats(&tb_time_stats, runtime_us);
+		}
+
+		printf("%d operations %d bits set of %d bits\n",
+			inner_iterations, set_bits, num_bits);
+		time_average = avg_stats(&fb_time_stats);
+		time_stddev = stddev_stats(&fb_time_stats);
+		printf("  Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n",
+			time_average, time_stddev);
+		time_average = avg_stats(&tb_time_stats);
+		time_stddev = stddev_stats(&tb_time_stats);
+		printf("  Average test_bit loop took:    %.3f usec (+- %.3f usec)\n",
+			time_average, time_stddev);
+
+		if (use_of_val == accumulator)  /* Try to avoid compiler tricks. */
+			printf("\n");
+	}
+	bitmap_free(to_test);
+	return 0;
+}
+
+int bench_mem_find_bit(int argc, const char **argv)
+{
+	int err = 0, i;
+
+	argc = parse_options(argc, argv, options, bench_usage, 0);
+	if (argc) {
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	for (i = 1; i <= 2048; i <<= 1)
+		do_for_each_set_bit(i);
+
+	return err;
+}
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index cad31b1d3438..690eee1120a7 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -52,6 +52,7 @@ static struct bench sched_benchmarks[] = {
 static struct bench mem_benchmarks[] = {
 	{ "memcpy",	"Benchmark for memcpy() functions",		bench_mem_memcpy	},
 	{ "memset",	"Benchmark for memset() functions",		bench_mem_memset	},
+	{ "find_bit",	"Benchmark for find_bit() functions",		bench_mem_find_bit	},
 	{ "all",	"Run all memory access benchmarks",		NULL			},
 	{ NULL,		NULL,						NULL			}
 };
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-24  7:19 [PATCH] perf bench: Add benchmark of find_next_bit Ian Rogers
@ 2020-07-24 14:45 ` Andi Kleen
  2020-07-24 18:13   ` Ian Rogers
  2020-07-28 11:51 ` Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 7+ messages in thread
From: Andi Kleen @ 2020-07-24 14:45 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Thomas Gleixner, linux-kernel, Stephane Eranian

On Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers wrote:
> for_each_set_bit, or similar functions like for_each_cpu, may be hot
> within the kernel. If many bits were set then one could imagine on
> Intel a "bt" instruction with every bit may be faster than the function
> call and word length find_next_bit logic. Add a benchmark to measure
> this.
 
> This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> option except for very small bitmaps.

Small bitmaps is a common case in the kernel (e.g. cpu bitmaps) 

But the current code isn't that great for small bitmaps. It always looks horrific
when I look at PT traces or brstackinsn, especially since it was optimized
purely for code size at some point.

Probably would be better to have different implementations for
different sizes.

-Andi

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-24 14:45 ` Andi Kleen
@ 2020-07-24 18:13   ` Ian Rogers
  0 siblings, 0 replies; 7+ messages in thread
From: Ian Rogers @ 2020-07-24 18:13 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Thomas Gleixner, LKML, Stephane Eranian

On Fri, Jul 24, 2020 at 7:45 AM Andi Kleen <ak@linux.intel.com> wrote:
>
> On Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers wrote:
> > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > within the kernel. If many bits were set then one could imagine on
> > Intel a "bt" instruction with every bit may be faster than the function
> > call and word length find_next_bit logic. Add a benchmark to measure
> > this.
>
> > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > option except for very small bitmaps.
>
> Small bitmaps is a common case in the kernel (e.g. cpu bitmaps)
>
> But the current code isn't that great for small bitmaps. It always looks horrific
> when I look at PT traces or brstackinsn, especially since it was optimized
> purely for code size at some point.
>
> Probably would be better to have different implementations for
> different sizes.

Thanks Andi!

what I was kind of expecting was "bt [mem],reg; jae" to have a fixed
cost and the find_next_bit to be a big win when the majority of bits
in a bitmask were 0 but to lose if the majority of bits in the bitmask
were 1. So the ratio of 1s and 0s was going to matter, not so much the
bitmap size. For bitmaps smaller than 32 this was pretty much the
case:

1000000 operations 1 bits set of 1 bits
  Average for_each_set_bit took: 8368.640 usec (+- 103.009 usec)
  Average test_bit loop took:    4052.360 usec (+- 15.381 usec)
1000000 operations 1 bits set of 2 bits
  Average for_each_set_bit took: 8503.770 usec (+- 4.385 usec)
  Average test_bit loop took:    6103.570 usec (+- 0.514 usec)
1000000 operations 2 bits set of 2 bits
  Average for_each_set_bit took: 12754.485 usec (+- 301.342 usec)
  Average test_bit loop took:    6882.950 usec (+- 55.254 usec)
1000000 operations 1 bits set of 4 bits
  Average for_each_set_bit took: 8517.670 usec (+- 5.712 usec)
  Average test_bit loop took:    10485.580 usec (+- 1.386 usec)
1000000 operations 2 bits set of 4 bits
  Average for_each_set_bit took: 12931.060 usec (+- 312.882 usec)
  Average test_bit loop took:    11093.980 usec (+- 43.149 usec)
1000000 operations 4 bits set of 4 bits
  Average for_each_set_bit took: 19664.760 usec (+- 588.841 usec)
  Average test_bit loop took:    12217.583 usec (+- 96.287 usec)
1000000 operations 1 bits set of 8 bits
  Average for_each_set_bit took: 8501.810 usec (+- 5.250 usec)
  Average test_bit loop took:    18924.550 usec (+- 2.999 usec)
1000000 operations 2 bits set of 8 bits
  Average for_each_set_bit took: 12760.025 usec (+- 301.881 usec)
  Average test_bit loop took:    19726.540 usec (+- 56.878 usec)
1000000 operations 4 bits set of 8 bits
  Average for_each_set_bit took: 19151.163 usec (+- 560.053 usec)
  Average test_bit loop took:    20817.317 usec (+- 96.923 usec)
1000000 operations 8 bits set of 8 bits
  Average for_each_set_bit took: 29770.865 usec (+- 1012.050 usec)
  Average test_bit loop took:    22677.150 usec (+- 176.883 usec)
1000000 operations 1 bits set of 16 bits
  Average for_each_set_bit took: 8534.640 usec (+- 4.920 usec)
  Average test_bit loop took:    36451.930 usec (+- 9.763 usec)
1000000 operations 2 bits set of 16 bits
  Average for_each_set_bit took: 12797.065 usec (+- 302.176 usec)
  Average test_bit loop took:    37178.395 usec (+- 51.756 usec)
1000000 operations 4 bits set of 16 bits
  Average for_each_set_bit took: 18736.880 usec (+- 525.846 usec)
  Average test_bit loop took:    38306.897 usec (+- 98.527 usec)
1000000 operations 8 bits set of 16 bits
  Average for_each_set_bit took: 29257.765 usec (+- 993.813 usec)
  Average test_bit loop took:    40038.798 usec (+- 167.358 usec)
1000000 operations 16 bits set of 16 bits
  Average for_each_set_bit took: 46784.984 usec (+- 1759.075 usec)
  Average test_bit loop took:    43014.266 usec (+- 298.138 usec)

 but for larger bitmaps, for example, 2048 bits where all are set,
even though the bt was a single instruction and find_next_bit was
going to have to do a lot of work to just return true (function call,
bunch of shifts) I got:

1000000 operations 2048 bits set of 2048 bits
  Average for_each_set_bit took: 2218881.255 usec (+- 106485.020 usec)
  Average test_bit loop took:    4934851.515 usec (+- 18463.511 usec)

And so other than say for 4-bit and smaller bitmaps the current
find_next_bit approach works best. I don't see an obvious bug in the
benchmark so I am surprised at how bad bt performs.

Thanks,
Ian

> -Andi

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-24  7:19 [PATCH] perf bench: Add benchmark of find_next_bit Ian Rogers
  2020-07-24 14:45 ` Andi Kleen
@ 2020-07-28 11:51 ` Arnaldo Carvalho de Melo
  2020-07-29 19:59   ` Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 7+ messages in thread
From: Arnaldo Carvalho de Melo @ 2020-07-28 11:51 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Andi Kleen,
	linux-kernel, Stephane Eranian

Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> for_each_set_bit, or similar functions like for_each_cpu, may be hot
> within the kernel. If many bits were set then one could imagine on
> Intel a "bt" instruction with every bit may be faster than the function
> call and word length find_next_bit logic. Add a benchmark to measure
> this.

Thanks, applied.

- Arnaldo
 
> This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> option except for very small bitmaps.
> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/bench/Build            |   1 +
>  tools/perf/bench/bench.h          |   1 +
>  tools/perf/bench/find-bit-bench.c | 135 ++++++++++++++++++++++++++++++
>  tools/perf/builtin-bench.c        |   1 +
>  4 files changed, 138 insertions(+)
>  create mode 100644 tools/perf/bench/find-bit-bench.c
> 
> diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> index 768e408757a0..fb114bca3a8d 100644
> --- a/tools/perf/bench/Build
> +++ b/tools/perf/bench/Build
> @@ -10,6 +10,7 @@ perf-y += epoll-wait.o
>  perf-y += epoll-ctl.o
>  perf-y += synthesize.o
>  perf-y += kallsyms-parse.o
> +perf-y += find-bit-bench.o
>  
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
> diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
> index 61cae4966cae..3291b0ddddfe 100644
> --- a/tools/perf/bench/bench.h
> +++ b/tools/perf/bench/bench.h
> @@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
>  int bench_sched_pipe(int argc, const char **argv);
>  int bench_mem_memcpy(int argc, const char **argv);
>  int bench_mem_memset(int argc, const char **argv);
> +int bench_mem_find_bit(int argc, const char **argv);
>  int bench_futex_hash(int argc, const char **argv);
>  int bench_futex_wake(int argc, const char **argv);
>  int bench_futex_wake_parallel(int argc, const char **argv);
> diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c
> new file mode 100644
> index 000000000000..1aadbd9d7e86
> --- /dev/null
> +++ b/tools/perf/bench/find-bit-bench.c
> @@ -0,0 +1,135 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Benchmark find_next_bit and related bit operations.
> + *
> + * Copyright 2020 Google LLC.
> + */
> +#include <stdlib.h>
> +#include "bench.h"
> +#include "../util/stat.h"
> +#include <linux/bitmap.h>
> +#include <linux/bitops.h>
> +#include <linux/time64.h>
> +#include <subcmd/parse-options.h>
> +
> +static unsigned int outer_iterations = 5;
> +static unsigned int inner_iterations = 100000;
> +
> +static const struct option options[] = {
> +	OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
> +		"Number of outerer iterations used"),
> +	OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
> +		"Number of outerer iterations used"),
> +	OPT_END()
> +};
> +
> +static const char *const bench_usage[] = {
> +	"perf bench mem find_bit <options>",
> +	NULL
> +};
> +
> +static unsigned int accumulator;
> +static unsigned int use_of_val;
> +
> +static noinline void workload(int val)
> +{
> +	use_of_val += val;
> +	accumulator++;
> +}
> +
> +#if defined(__i386__) || defined(__x86_64__)
> +static bool asm_test_bit(long nr, const unsigned long *addr)
> +{
> +	bool oldbit;
> +
> +	asm volatile("bt %2,%1"
> +		     : "=@ccc" (oldbit)
> +		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> +
> +	return oldbit;
> +}
> +#else
> +#define asm_test_bit test_bit
> +#endif
> +
> +static int do_for_each_set_bit(unsigned int num_bits)
> +{
> +	unsigned long *to_test = bitmap_alloc(num_bits);
> +	struct timeval start, end, diff;
> +	u64 runtime_us;
> +	struct stats fb_time_stats, tb_time_stats;
> +	double time_average, time_stddev;
> +	unsigned int bit, i, j;
> +	unsigned int set_bits, skip;
> +	unsigned int old;
> +
> +	init_stats(&fb_time_stats);
> +	init_stats(&tb_time_stats);
> +
> +	for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
> +		bitmap_zero(to_test, num_bits);
> +		skip = num_bits / set_bits;
> +		for (i = 0; i < num_bits; i += skip)
> +			set_bit(i, to_test);
> +
> +		for (i = 0; i < outer_iterations; i++) {
> +			old = accumulator;
> +			gettimeofday(&start, NULL);
> +			for (j = 0; j < inner_iterations; j++) {
> +				for_each_set_bit(bit, to_test, num_bits)
> +					workload(bit);
> +			}
> +			gettimeofday(&end, NULL);
> +			assert(old + (inner_iterations * set_bits) == accumulator);
> +			timersub(&end, &start, &diff);
> +			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
> +			update_stats(&fb_time_stats, runtime_us);
> +
> +			old = accumulator;
> +			gettimeofday(&start, NULL);
> +			for (j = 0; j < inner_iterations; j++) {
> +				for (bit = 0; bit < num_bits; bit++) {
> +					if (asm_test_bit(bit, to_test))
> +						workload(bit);
> +				}
> +			}
> +			gettimeofday(&end, NULL);
> +			assert(old + (inner_iterations * set_bits) == accumulator);
> +			timersub(&end, &start, &diff);
> +			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
> +			update_stats(&tb_time_stats, runtime_us);
> +		}
> +
> +		printf("%d operations %d bits set of %d bits\n",
> +			inner_iterations, set_bits, num_bits);
> +		time_average = avg_stats(&fb_time_stats);
> +		time_stddev = stddev_stats(&fb_time_stats);
> +		printf("  Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n",
> +			time_average, time_stddev);
> +		time_average = avg_stats(&tb_time_stats);
> +		time_stddev = stddev_stats(&tb_time_stats);
> +		printf("  Average test_bit loop took:    %.3f usec (+- %.3f usec)\n",
> +			time_average, time_stddev);
> +
> +		if (use_of_val == accumulator)  /* Try to avoid compiler tricks. */
> +			printf("\n");
> +	}
> +	bitmap_free(to_test);
> +	return 0;
> +}
> +
> +int bench_mem_find_bit(int argc, const char **argv)
> +{
> +	int err = 0, i;
> +
> +	argc = parse_options(argc, argv, options, bench_usage, 0);
> +	if (argc) {
> +		usage_with_options(bench_usage, options);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	for (i = 1; i <= 2048; i <<= 1)
> +		do_for_each_set_bit(i);
> +
> +	return err;
> +}
> diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
> index cad31b1d3438..690eee1120a7 100644
> --- a/tools/perf/builtin-bench.c
> +++ b/tools/perf/builtin-bench.c
> @@ -52,6 +52,7 @@ static struct bench sched_benchmarks[] = {
>  static struct bench mem_benchmarks[] = {
>  	{ "memcpy",	"Benchmark for memcpy() functions",		bench_mem_memcpy	},
>  	{ "memset",	"Benchmark for memset() functions",		bench_mem_memset	},
> +	{ "find_bit",	"Benchmark for find_bit() functions",		bench_mem_find_bit	},
>  	{ "all",	"Run all memory access benchmarks",		NULL			},
>  	{ NULL,		NULL,						NULL			}
>  };
> -- 
> 2.28.0.rc0.142.g3c755180ce-goog
> 

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-28 11:51 ` Arnaldo Carvalho de Melo
@ 2020-07-29 19:59   ` Arnaldo Carvalho de Melo
  2020-07-29 20:44     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 7+ messages in thread
From: Arnaldo Carvalho de Melo @ 2020-07-29 19:59 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Andi Kleen,
	linux-kernel, Stephane Eranian

Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > within the kernel. If many bits were set then one could imagine on
> > Intel a "bt" instruction with every bit may be faster than the function
> > call and word length find_next_bit logic. Add a benchmark to measure
> > this.
> 
> Thanks, applied.
> 
> - Arnaldo
>  
> > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > option except for very small bitmaps.

> > +++ b/tools/perf/bench/find-bit-bench.c

> > +#if defined(__i386__) || defined(__x86_64__)
> > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > +{
> > +	bool oldbit;
> > +
> > +	asm volatile("bt %2,%1"
> > +		     : "=@ccc" (oldbit)
> > +		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > +
> > +	return oldbit;

Some old clang versions are not liking this:

clang version 3.8.0 (tags/RELEASE_380/final)
Target: x86_64-alpine-linux-musl
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-alpine-linux-musl/5.3.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-alpine-linux-musl/5.3.0
Selected GCC installation: /usr/bin/../lib/gcc/x86_64-alpine-linux-musl/5.3.0
Candidate multilib: .;@m64
Selected multilib: .;@m64
+ make ARCH= CROSS_COMPILE= EXTRA_CFLAGS= -C /git/linux/tools/perf O=/tmp/build/perf CC=clang


  CC       /tmp/build/perf/trace/beauty/pkey_alloc.o
  CC       /tmp/build/perf/tests/openat-syscall-tp-fields.o
bench/find-bit-bench.c:46:10: error: invalid output constraint '=@ccc' in asm
                     : "=@ccc" (oldbit)
                       ^
1 error generated.
mv: can't rename '/tmp/build/perf/bench/.find-bit-bench.o.tmp': No such file or directory
/git/linux/tools/build/Makefile.build:96: recipe for target '/tmp/build/perf/bench/find-bit-bench.o' failed
make[4]: *** [/tmp/build/perf/bench/find-bit-bench.o] Error 1
/git/linux/tools/build/Makefile.build:139: recipe for target 'bench' failed
make[3]: *** [bench] Error 2
make[3]: *** Waiting for unfinished jobs....
  MKDIR    /tmp/build/perf/arch/x86/tests/


Also:

clang version 3.8.1 (tags/RELEASE_381/final)
clang version 4.0.0 (tags/RELEASE_400/final)


- Arnaldo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-29 19:59   ` Arnaldo Carvalho de Melo
@ 2020-07-29 20:44     ` Arnaldo Carvalho de Melo
  2020-07-29 22:03       ` Ian Rogers
  0 siblings, 1 reply; 7+ messages in thread
From: Arnaldo Carvalho de Melo @ 2020-07-29 20:44 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Andi Kleen,
	linux-kernel, Stephane Eranian

Em Wed, Jul 29, 2020 at 04:59:18PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > > within the kernel. If many bits were set then one could imagine on
> > > Intel a "bt" instruction with every bit may be faster than the function
> > > call and word length find_next_bit logic. Add a benchmark to measure
> > > this.
> > 
> > Thanks, applied.

> > > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > > option except for very small bitmaps.
> 
> > > +++ b/tools/perf/bench/find-bit-bench.c
> 
> > > +#if defined(__i386__) || defined(__x86_64__)
> > > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > > +{
> > > +	bool oldbit;
> > > +
> > > +	asm volatile("bt %2,%1"
> > > +		     : "=@ccc" (oldbit)
> > > +		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > > +
> > > +	return oldbit;
> 
> Some old clang versions are not liking this:

Failed with:

  clang version 3.8.0 (tags/RELEASE_380/final)
  clang version 3.8.1 (tags/RELEASE_381/final)
  clang version 4.0.0 (tags/RELEASE_400/final)
  Alpine clang version 8.0.0 (tags/RELEASE_800/final) (based on LLVM 8.0.0)
  Alpine clang version 5.0.0 (tags/RELEASE_500/final) (based on LLVM 5.0.0)
  Alpine clang version 5.0.1 (tags/RELEASE_501/final) (based on LLVM 5.0.1)
  Alpine clang version 5.0.1 (tags/RELEASE_502/final) (based on LLVM 5.0.1)

Worked with:

  Alpine clang version 9.0.0 (https://git.alpinelinux.org/aports f7f0d2c2b8bcd6a5843401a9a702029556492689) (based on LLVM 9.0.0)
  Alpine clang version 10.0.0 (https://gitlab.alpinelinux.org/alpine/aports.git 7445adce501f8473efdb93b17b5eaf2f1445ed4c)
  Alpine clang version 10.0.0 (git://git.alpinelinux.org/aports 7445adce501f8473efdb93b17b5eaf2f1445ed4c)


Also failed for;

# grep FAIL dm.log/summary  | grep -v alpine
 alt:p8: FAIL
   clang version 3.8.0 (tags/RELEASE_380/final)
 alt:p9: FAIL
   clang version 7.0.1
 amazonlinux:1: FAIL
   clang version 3.6.2 (tags/RELEASE_362/final)
 amazonlinux:2: FAIL
   clang version 7.0.1 (Amazon Linux 2 7.0.1-1.amzn2.0.2)
#

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf bench: Add benchmark of find_next_bit
  2020-07-29 20:44     ` Arnaldo Carvalho de Melo
@ 2020-07-29 22:03       ` Ian Rogers
  0 siblings, 0 replies; 7+ messages in thread
From: Ian Rogers @ 2020-07-29 22:03 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Andi Kleen, LKML,
	Stephane Eranian

On Wed, Jul 29, 2020 at 1:44 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> Em Wed, Jul 29, 2020 at 04:59:18PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > > > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > > > within the kernel. If many bits were set then one could imagine on
> > > > Intel a "bt" instruction with every bit may be faster than the function
> > > > call and word length find_next_bit logic. Add a benchmark to measure
> > > > this.
> > >
> > > Thanks, applied.
>
> > > > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > > > option except for very small bitmaps.
> >
> > > > +++ b/tools/perf/bench/find-bit-bench.c
> >
> > > > +#if defined(__i386__) || defined(__x86_64__)
> > > > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > > > +{
> > > > + bool oldbit;
> > > > +
> > > > + asm volatile("bt %2,%1"
> > > > +              : "=@ccc" (oldbit)
> > > > +              : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > > > +
> > > > + return oldbit;
> >
> > Some old clang versions are not liking this:
>
> Failed with:
>
>   clang version 3.8.0 (tags/RELEASE_380/final)
>   clang version 3.8.1 (tags/RELEASE_381/final)
>   clang version 4.0.0 (tags/RELEASE_400/final)
>   Alpine clang version 8.0.0 (tags/RELEASE_800/final) (based on LLVM 8.0.0)
>   Alpine clang version 5.0.0 (tags/RELEASE_500/final) (based on LLVM 5.0.0)
>   Alpine clang version 5.0.1 (tags/RELEASE_501/final) (based on LLVM 5.0.1)
>   Alpine clang version 5.0.1 (tags/RELEASE_502/final) (based on LLVM 5.0.1)
>
> Worked with:
>
>   Alpine clang version 9.0.0 (https://git.alpinelinux.org/aports f7f0d2c2b8bcd6a5843401a9a702029556492689) (based on LLVM 9.0.0)
>   Alpine clang version 10.0.0 (https://gitlab.alpinelinux.org/alpine/aports.git 7445adce501f8473efdb93b17b5eaf2f1445ed4c)
>   Alpine clang version 10.0.0 (git://git.alpinelinux.org/aports 7445adce501f8473efdb93b17b5eaf2f1445ed4c)
>
>
> Also failed for;
>
> # grep FAIL dm.log/summary  | grep -v alpine
>  alt:p8: FAIL
>    clang version 3.8.0 (tags/RELEASE_380/final)
>  alt:p9: FAIL
>    clang version 7.0.1
>  amazonlinux:1: FAIL
>    clang version 3.6.2 (tags/RELEASE_362/final)
>  amazonlinux:2: FAIL
>    clang version 7.0.1 (Amazon Linux 2 7.0.1-1.amzn2.0.2)
> #

Thanks, I added a __GCC_ASM_FLAG_OUTPUTS__ guard:
https://lore.kernel.org/lkml/20200729220034.1337168-1-irogers@google.com/T/#u

Ian

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-07-29 22:03 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-24  7:19 [PATCH] perf bench: Add benchmark of find_next_bit Ian Rogers
2020-07-24 14:45 ` Andi Kleen
2020-07-24 18:13   ` Ian Rogers
2020-07-28 11:51 ` Arnaldo Carvalho de Melo
2020-07-29 19:59   ` Arnaldo Carvalho de Melo
2020-07-29 20:44     ` Arnaldo Carvalho de Melo
2020-07-29 22:03       ` Ian Rogers

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.