* [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters
@ 2024-03-15  5:18 Andrii Nakryiko
  2024-03-15  5:18 ` [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks Andrii Nakryiko
  2024-03-15 16:08 ` [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Alexei Starovoitov
  0 siblings, 2 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15  5:18 UTC (permalink / raw)
  To: bpf, ast, daniel, martin.lau; +Cc: andrii, kernel-team

When benchmarking with multiple threads (-pN, where N>1), we start
contending on a single atomic counter that both BPF trigger benchmarks
are using, as well as the "baseline" tests in user space (trig-base and
trig-uprobe-base benchmarks). As such, we start bottlenecking on
something completely irrelevant to the benchmark at hand.

Scale counting up by using per-CPU counters on the BPF side. On the
user space side we do the next best thing: hash the thread ID to
approximate per-CPU behavior. It seems to work quite well in practice.
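
For reference, the user-space side boils down to Fibonacci
(multiplicative) hashing of the thread ID into one of 256 counter
buckets; a standalone sketch of just that step (mirroring the
inc_counter() change in the diff below):

  /* sketch (not in the patch): map a thread ID into one of 256 slots */
  static unsigned int tid_to_slot(int tid)
  {
          unsigned int slot = 2654435769U * tid; /* ~2^32 / golden ratio */

          return slot >> 24; /* top 8 bits select the bucket: 0..255 */
  }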

To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
16, and 32 threads:
  - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
  - trig-base (get_pgid() syscall, atomic counter in user-space);
  - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
    counter on BPF side).

Command used:

  for b in uprobe-base base fentry; do \
    for p in 1 2 4 8 16 32; do \
      printf "%-11s %2d: %s\n" $b $p \
        "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
    done; \
  done

Before these changes, aggregate throughput across all threads doesn't
scale well with the number of threads; it actually falls sharply for
uprobe-base due to very high contention:

  uprobe-base  1:  138.998 ± 0.650M/s
  uprobe-base  2:   70.526 ± 1.147M/s
  uprobe-base  4:   63.114 ± 0.302M/s
  uprobe-base  8:   54.177 ± 0.138M/s
  uprobe-base 16:   45.439 ± 0.057M/s
  uprobe-base 32:   37.163 ± 0.242M/s
  base         1:   16.940 ± 0.182M/s
  base         2:   19.231 ± 0.105M/s
  base         4:   21.479 ± 0.038M/s
  base         8:   23.030 ± 0.037M/s
  base        16:   22.034 ± 0.004M/s
  base        32:   18.152 ± 0.013M/s
  fentry       1:   14.794 ± 0.054M/s
  fentry       2:   17.341 ± 0.055M/s
  fentry       4:   23.792 ± 0.024M/s
  fentry       8:   21.557 ± 0.047M/s
  fentry      16:   21.121 ± 0.004M/s
  fentry      32:   17.067 ± 0.023M/s

After these changes, we see almost perfect linear scaling, as expected.
The sub-linear scaling when going from 8 to 16 threads is interesting
and consistent on my test machine, but I haven't investigated what is
causing this peculiar slowdown (it shows up across all benchmarks and
could be due to hyperthreading effects, but I'm not sure).

  uprobe-base  1:  139.980 ± 0.648M/s
  uprobe-base  2:  270.244 ± 0.379M/s
  uprobe-base  4:  532.044 ± 1.519M/s
  uprobe-base  8: 1004.571 ± 3.174M/s
  uprobe-base 16: 1720.098 ± 0.744M/s
  uprobe-base 32: 3506.659 ± 8.549M/s
  base         1:   16.869 ± 0.071M/s
  base         2:   33.007 ± 0.092M/s
  base         4:   64.670 ± 0.203M/s
  base         8:  121.969 ± 0.210M/s
  base        16:  207.832 ± 0.112M/s
  base        32:  424.227 ± 1.477M/s
  fentry       1:   14.777 ± 0.087M/s
  fentry       2:   28.575 ± 0.146M/s
  fentry       4:   56.234 ± 0.176M/s
  fentry       8:  106.095 ± 0.385M/s
  fentry      16:  181.440 ± 0.032M/s
  fentry      32:  369.131 ± 0.693M/s

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
 .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index ace0d1011a8e..8fbc78d5f8a4 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,15 +1,45 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <unistd.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+/* adjust slot shift in inc_hits() if changing */
+#define MAX_BUCKETS 256
+
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
 	struct trigger_bench *skel;
 } ctx;
 
-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+	static __thread int tid = 0;
+	unsigned slot;
+
+	if (unlikely(tid == 0))
+		tid = gettid();
+
+	/* multiplicative hashing, it's fast */
+	slot = 2654435769U * tid;
+	slot >>= 24;
+
+	atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
+}
+
+static __always_inline long sum_and_reset_counters(struct counter *counters)
+{
+	int i;
+	long sum = 0;
+
+	for (i = 0; i < MAX_BUCKETS; i++)
+		sum += atomic_swap(&counters[i].value, 0);
+	return sum;
+}
 
 static void trigger_validate(void)
 {
@@ -23,14 +53,14 @@ static void *trigger_base_producer(void *input)
 {
 	while (true) {
 		(void)syscall(__NR_getpgid);
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }
 
 static void trigger_base_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&base_hits.value, 0);
+	res->hits = sum_and_reset_counters(base_hits);
 }
 
 static void *trigger_producer(void *input)
@@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
 
 static void trigger_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
 }
 
 static void setup_ctx(void)
@@ -164,7 +194,7 @@ static void *uprobe_base_producer(void *input)
 {
 	while (true) {
 		uprobe_target_nop();
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 5fda43901033..42ec202015ed 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -9,12 +9,27 @@
 
 char _license[] SEC("license") = "GPL";
 
-long hits = 0;
+#define CPU_MASK 255
+#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */
+
+/* matches struct counter in bench.h */
+struct counter {
+	long value;
+} __attribute__((aligned(128)));
+
+struct counter hits[MAX_CPUS];
+
+static __always_inline void inc_counter(void)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
+}
 
 SEC("tp/syscalls/sys_enter_getpgid")
 int bench_trigger_tp(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
@@ -22,69 +37,69 @@ SEC("raw_tp/sys_enter")
 int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
 {
 	if (id == __NR_getpgid)
-		__sync_add_and_fetch(&hits, 1);
+		inc_counter();
 	return 0;
 }
 
 SEC("kprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fexit/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fexit(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry_sleep(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fmodret(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return -22;
 }
 
 SEC("uprobe")
 int bench_trigger_uprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
-- 
2.43.0



* [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15  5:18 [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Andrii Nakryiko
@ 2024-03-15  5:18 ` Andrii Nakryiko
  2024-03-15  9:10   ` Jiri Olsa
  2024-03-15 16:03   ` Alexei Starovoitov
  2024-03-15 16:08 ` [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Alexei Starovoitov
  1 sibling, 2 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15  5:18 UTC (permalink / raw)
  To: bpf, ast, daniel, martin.lau; +Cc: andrii, kernel-team, Jiri Olsa

Existing kprobe/fentry triggering benchmarks have a 1-to-1 mapping
between one syscall execution and one BPF program run. While we use a
fast get_pgid() syscall, syscall overhead can still be non-trivial.

This patch adds a set of kprobe/fentry benchmarks that significantly
amortize the cost of the syscall relative to the actual BPF triggering
overhead. We do this by employing the BPF_PROG_TEST_RUN command to
trigger a "driver" raw_tp program, which runs a tight parameterized
loop calling a cheap BPF helper (bpf_get_smp_processor_id()), to which
the kprobe/fentry programs under benchmark are attached.

This way one bpf() syscall causes N executions of the BPF program
being benchmarked. N defaults to 100, but can be adjusted with the
--trig-batch-iters CLI argument.
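
For reference, the user-space producer side of these benchmarks is just
a tight test-run loop; a minimal sketch of what the diff below adds as
trigger_producer_fast():

  int fd = bpf_program__fd(ctx.skel->progs.trigger_driver);

  while (true)
          bpf_prog_test_run_opts(fd, NULL); /* 1 syscall -> batch_iters prog runs */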

Results speak for themselves:

$ ./run_bench_trigger.sh
uprobe-base         :  138.054 ± 0.556M/s
base                :   16.650 ± 0.123M/s
tp                  :   11.068 ± 0.100M/s
rawtp               :   14.087 ± 0.511M/s
kprobe              :    9.641 ± 0.027M/s
kprobe-multi        :   10.263 ± 0.061M/s
kretprobe           :    5.475 ± 0.028M/s
kretprobe-multi     :    5.703 ± 0.036M/s
fentry              :   14.544 ± 0.112M/s
fexit               :   10.637 ± 0.073M/s
fmodret             :   11.357 ± 0.061M/s
kprobe-fast         :   14.286 ± 0.377M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s
kretprobe-fast      :    7.646 ± 0.084M/s
kretprobe-multi-fast:    4.354 ± 0.066M/s
fentry-fast         :   31.475 ± 0.254M/s
fexit-fast          :   17.379 ± 0.195M/s

Note how the xxx-fast variants measure significantly higher
throughput, even though the in-kernel overhead is exactly the same:

fentry              :   14.544 ± 0.112M/s
fentry-fast         :   31.475 ± 0.254M/s

kprobe-multi        :   10.263 ± 0.061M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s

One huge and not yet explained deviation is the slowdown of
kretprobe-multi; we should look into that separately.

kretprobe           :    5.475 ± 0.028M/s
kretprobe-multi     :    5.703 ± 0.036M/s
kretprobe-fast      :    7.646 ± 0.084M/s
kretprobe-multi-fast:    4.354 ± 0.066M/s

Kprobe cases don't seem to have this illogical slowdown:

kprobe              :    9.641 ± 0.027M/s
kprobe-multi        :   10.263 ± 0.061M/s
kprobe-fast         :   14.286 ± 0.377M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s

Cc: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/bench.c           |  18 +++
 .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
 .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
 .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
 4 files changed, 201 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index b2b4c391eb0a..67212b89f876 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
+extern struct argp bench_trigger_fast_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
+	{ &bench_trigger_fast_argp, 0, "BPF triggering benchmark", 0 },
 	{},
 };
 
@@ -502,6 +504,12 @@ extern const struct bench bench_trig_fentry;
 extern const struct bench bench_trig_fexit;
 extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_kprobe_fast;
+extern const struct bench bench_trig_kretprobe_fast;
+extern const struct bench bench_trig_kprobe_multi_fast;
+extern const struct bench bench_trig_kretprobe_multi_fast;
+extern const struct bench bench_trig_fentry_fast;
+extern const struct bench bench_trig_fexit_fast;
 extern const struct bench bench_trig_uprobe_base;
 extern const struct bench bench_trig_uprobe_nop;
 extern const struct bench bench_trig_uretprobe_nop;
@@ -539,6 +547,7 @@ static const struct bench *benchs[] = {
 	&bench_rename_rawtp,
 	&bench_rename_fentry,
 	&bench_rename_fexit,
+	/* syscall-driven triggering benchmarks */
 	&bench_trig_base,
 	&bench_trig_tp,
 	&bench_trig_rawtp,
@@ -550,6 +559,14 @@ static const struct bench *benchs[] = {
 	&bench_trig_fexit,
 	&bench_trig_fentry_sleep,
 	&bench_trig_fmodret,
+	/* fast, mostly in-kernel triggers */
+	&bench_trig_kprobe_fast,
+	&bench_trig_kretprobe_fast,
+	&bench_trig_kprobe_multi_fast,
+	&bench_trig_kretprobe_multi_fast,
+	&bench_trig_fentry_fast,
+	&bench_trig_fexit_fast,
+	/* uprobes */
 	&bench_trig_uprobe_base,
 	&bench_trig_uprobe_nop,
 	&bench_trig_uretprobe_nop,
@@ -557,6 +574,7 @@ static const struct bench *benchs[] = {
 	&bench_trig_uretprobe_push,
 	&bench_trig_uprobe_ret,
 	&bench_trig_uretprobe_ret,
+	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
 	&bench_rb_custom,
 	&bench_pb_libbpf,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 8fbc78d5f8a4..d6c87180c887 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,11 +1,54 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #define _GNU_SOURCE
+#include <argp.h>
 #include <unistd.h>
+#include <stdint.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+static struct {
+	__u32 batch_iters;
+} args = {
+	.batch_iters = 100,
+};
+
+enum {
+	ARG_TRIG_BATCH_ITERS = 7000,
+};
+
+static const struct argp_option opts[] = {
+	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
+		"Number of in-kernel iterations per one driver test run"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_TRIG_BATCH_ITERS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid --trig-batch-iters value");
+			argp_usage(state);
+		}
+		args.batch_iters = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_trigger_fast_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
 /* adjust slot shift in inc_hits() if changing */
 #define MAX_BUCKETS 256
 
@@ -70,6 +113,16 @@ static void *trigger_producer(void *input)
 	return NULL;
 }
 
+static void *trigger_producer_fast(void *input)
+{
+	int fd = bpf_program__fd(ctx.skel->progs.trigger_driver);
+
+	while (true)
+		bpf_prog_test_run_opts(fd, NULL);
+
+	return NULL;
+}
+
 static void trigger_measure(struct bench_res *res)
 {
 	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
@@ -77,13 +130,23 @@ static void trigger_measure(struct bench_res *res)
 
 static void setup_ctx(void)
 {
+	int err;
+
 	setup_libbpf();
 
-	ctx.skel = trigger_bench__open_and_load();
+	ctx.skel = trigger_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
+
+	ctx.skel->rodata->batch_iters = args.batch_iters;
+
+	err = trigger_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
 }
 
 static void attach_bpf(struct bpf_program *prog)
@@ -157,6 +220,44 @@ static void trigger_fmodret_setup(void)
 	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
 }
 
+/* Fast, mostly in-kernel triggering setups */
+
+static void trigger_kprobe_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_fast);
+}
+
+static void trigger_kretprobe_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_fast);
+}
+
+static void trigger_kprobe_multi_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_fast);
+}
+
+static void trigger_kretprobe_multi_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_fast);
+}
+
+static void trigger_fentry_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fentry_fast);
+}
+
+static void trigger_fexit_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fexit_fast);
+}
+
 /* make sure call is not inlined and not avoided by compiler, so __weak and
  * inline asm volatile in the body of the function
  *
@@ -385,6 +486,26 @@ const struct bench bench_trig_fmodret = {
 	.report_final = hits_drops_report_final,
 };
 
+/* fast (staying mostly in kernel) kprobe/fentry benchmarks */
+#define BENCH_TRIG_FAST(KIND, NAME)					\
+const struct bench bench_trig_##KIND = {				\
+	.name = "trig-" NAME,						\
+	.setup = trigger_##KIND##_setup,				\
+	.producer_thread = trigger_producer_fast,			\
+	.measure = trigger_measure,					\
+	.report_progress = hits_drops_report_progress,			\
+	.report_final = hits_drops_report_final,			\
+	.argp = &bench_trigger_fast_argp,				\
+}
+
+BENCH_TRIG_FAST(kprobe_fast, "kprobe-fast");
+BENCH_TRIG_FAST(kretprobe_fast, "kretprobe-fast");
+BENCH_TRIG_FAST(kprobe_multi_fast, "kprobe-multi-fast");
+BENCH_TRIG_FAST(kretprobe_multi_fast, "kretprobe-multi-fast");
+BENCH_TRIG_FAST(fentry_fast, "fentry-fast");
+BENCH_TRIG_FAST(fexit_fast, "fexit-fast");
+
+/* uprobe benchmarks */
 const struct bench bench_trig_uprobe_base = {
 	.name = "trig-uprobe-base",
 	.setup = NULL, /* no uprobe/uretprobe is attached */
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index 78e83f243294..fee069ac930b 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -2,8 +2,12 @@
 
 set -eufo pipefail
 
-for i in base tp rawtp kprobe fentry fmodret
+for i in uprobe-base base tp rawtp \
+	 kprobe kprobe-multi kretprobe kretprobe-multi \
+	 fentry fexit fmodret \
+	 kprobe-fast kprobe-multi-fast kretprobe-fast kretprobe-multi-fast \
+	 fentry-fast fexit-fast
 do
 	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
-	printf "%-10s: %s\n" $i "$summary"
+	printf "%-20s: %s\n" $i "$summary"
 done
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 42ec202015ed..2886c2cb3570 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Facebook
-
 #include <linux/bpf.h>
 #include <asm/unistd.h>
 #include <bpf/bpf_helpers.h>
@@ -103,3 +102,58 @@ int bench_trigger_uprobe(void *ctx)
 	inc_counter();
 	return 0;
 }
+
+const volatile int batch_iters = 0;
+
+SEC("raw_tp")
+int trigger_driver(void *ctx)
+{
+	int i;
+
+	for (i = 0; i < batch_iters; i++)
+		(void)bpf_get_smp_processor_id(); /* attach here to benchmark */
+
+	return 0;
+}
+
+SEC("kprobe/bpf_get_smp_processor_id")
+int bench_trigger_kprobe_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kretprobe/bpf_get_smp_processor_id")
+int bench_trigger_kretprobe_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kprobe.multi/bpf_get_smp_processor_id")
+int bench_trigger_kprobe_multi_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kretprobe.multi/bpf_get_smp_processor_id")
+int bench_trigger_kretprobe_multi_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("fentry/bpf_get_smp_processor_id")
+int bench_trigger_fentry_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("fexit/bpf_get_smp_processor_id")
+int bench_trigger_fexit_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
-- 
2.43.0



* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15  5:18 ` [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks Andrii Nakryiko
@ 2024-03-15  9:10   ` Jiri Olsa
  2024-03-15 22:22     ` Andrii Nakryiko
  2024-03-15 16:03   ` Alexei Starovoitov
  1 sibling, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2024-03-15  9:10 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: bpf, ast, daniel, martin.lau, kernel-team

On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> one syscall execution and BPF program run. While we use a fast
> get_pgid() syscall, syscall overhead can still be non-trivial.
> 
> This patch adds kprobe/fentry set of benchmarks significantly amortizing
> the cost of syscall vs actual BPF triggering overhead. We do this by
> employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> which does a tight parameterized loop calling cheap BPF helper
> (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> attached for benchmarking.
> 
> This way 1 bpf() syscall causes N executions of BPF program being
> benchmarked. N defaults to 100, but can be adjusted with
> --trig-batch-iters CLI argument.
> 
> Results speak for themselves:
> 
> $ ./run_bench_trigger.sh
> uprobe-base         :  138.054 ± 0.556M/s
> base                :   16.650 ± 0.123M/s
> tp                  :   11.068 ± 0.100M/s
> rawtp               :   14.087 ± 0.511M/s
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> fentry              :   14.544 ± 0.112M/s
> fexit               :   10.637 ± 0.073M/s
> fmodret             :   11.357 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> fentry-fast         :   31.475 ± 0.254M/s
> fexit-fast          :   17.379 ± 0.195M/s
> 
> Note how xxx-fast variants are measured with significantly higher
> throughput, even though it's exactly the same in-kernel overhead:
> 
> fentry              :   14.544 ± 0.112M/s
> fentry-fast         :   31.475 ± 0.254M/s
> 
> kprobe-multi        :   10.263 ± 0.061M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> 
> One huge and not yet explained deviation is a slowdown of
> kretprobe-multi, we should look into that separately.
> 
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> 
> Kprobe cases don't seem to have this illogical slowdown:
> 
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s

hum, I see that as well:

uprobe-base         :  230.624 ± 0.527M/s 
base                :   16.320 ± 0.087M/s 
tp                  :   10.057 ± 0.122M/s 
rawtp               :   14.851 ± 0.300M/s 
kprobe              :   10.993 ± 0.104M/s 
kprobe-multi        :   11.053 ± 0.038M/s 
kretprobe           :    6.679 ± 0.015M/s 
kretprobe-multi     :    6.466 ± 0.015M/s 
fentry              :   14.949 ± 0.064M/s 
fexit               :   10.530 ± 1.275M/s 
fmodret             :   11.145 ± 0.245M/s 
kprobe-fast         :   20.080 ± 0.468M/s 
kprobe-multi-fast   :   17.603 ± 0.102M/s 
kretprobe-fast      :    9.943 ± 0.056M/s 
kretprobe-multi-fast:    5.185 ± 0.022M/s 
fentry-fast         :   46.697 ± 0.260M/s 
fexit-fast          :   19.250 ± 0.108M/s 


I even see a decline in kprobe-multi-fast:

  kprobe-fast         :   20.080 ± 0.468M/s 
  kprobe-multi-fast   :   17.603 ± 0.102M/s 

  kretprobe-fast      :    9.943 ± 0.056M/s 
  kretprobe-multi-fast:    5.185 ± 0.022M/s 


I've got some IBT-related code showing up in the perf profile,
and when I disabled it I got better results for kprobe-multi-fast,
but kretprobe-multi-fast is still bad:


uprobe-base         :  234.024 ± 0.225M/s 
base                :   16.383 ± 0.029M/s 
tp                  :    9.973 ± 0.017M/s 
rawtp               :   14.889 ± 0.047M/s 
kprobe              :   10.970 ± 0.011M/s 
kprobe-multi        :   11.640 ± 0.009M/s 
kretprobe           :    6.667 ± 0.005M/s 
kretprobe-multi     :    6.704 ± 0.005M/s 
fentry              :   14.968 ± 0.024M/s 
fexit               :   11.860 ± 0.012M/s 
fmodret             :   12.656 ± 0.036M/s 
kprobe-fast         :   20.340 ± 0.043M/s 
kprobe-multi-fast   :   21.203 ± 0.019M/s 
kretprobe-fast      :    9.956 ± 0.021M/s 
kretprobe-multi-fast:    5.611 ± 0.006M/s 
fentry-fast         :   46.825 ± 0.041M/s 
fexit-fast          :   19.746 ± 0.024M/s 

slightly better:

  kprobe-fast         :   20.340 ± 0.043M/s 
  kprobe-multi-fast   :   21.203 ± 0.019M/s 

still almost half the perf:
  kretprobe-fast      :    9.956 ± 0.021M/s 
  kretprobe-multi-fast:    5.611 ± 0.006M/s 

jirka

> 
> Cc: Jiri Olsa <jolsa@kernel.org>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  tools/testing/selftests/bpf/bench.c           |  18 +++
>  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
>  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
>  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
>  4 files changed, 201 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
> index b2b4c391eb0a..67212b89f876 100644
> --- a/tools/testing/selftests/bpf/bench.c
> +++ b/tools/testing/selftests/bpf/bench.c
> @@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp;
>  extern struct argp bench_hashmap_lookup_argp;
>  extern struct argp bench_local_storage_create_argp;
>  extern struct argp bench_htab_mem_argp;
> +extern struct argp bench_trigger_fast_argp;
>  
>  static const struct argp_child bench_parsers[] = {
>  	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
> @@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = {
>  	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
>  	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
>  	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
> +	{ &bench_trigger_fast_argp, 0, "BPF triggering benchmark", 0 },
>  	{},
>  };
>  
> @@ -502,6 +504,12 @@ extern const struct bench bench_trig_fentry;
>  extern const struct bench bench_trig_fexit;
>  extern const struct bench bench_trig_fentry_sleep;
>  extern const struct bench bench_trig_fmodret;
> +extern const struct bench bench_trig_kprobe_fast;
> +extern const struct bench bench_trig_kretprobe_fast;
> +extern const struct bench bench_trig_kprobe_multi_fast;
> +extern const struct bench bench_trig_kretprobe_multi_fast;
> +extern const struct bench bench_trig_fentry_fast;
> +extern const struct bench bench_trig_fexit_fast;
>  extern const struct bench bench_trig_uprobe_base;
>  extern const struct bench bench_trig_uprobe_nop;
>  extern const struct bench bench_trig_uretprobe_nop;
> @@ -539,6 +547,7 @@ static const struct bench *benchs[] = {
>  	&bench_rename_rawtp,
>  	&bench_rename_fentry,
>  	&bench_rename_fexit,
> +	/* syscall-driven triggering benchmarks */
>  	&bench_trig_base,
>  	&bench_trig_tp,
>  	&bench_trig_rawtp,
> @@ -550,6 +559,14 @@ static const struct bench *benchs[] = {
>  	&bench_trig_fexit,
>  	&bench_trig_fentry_sleep,
>  	&bench_trig_fmodret,
> +	/* fast, mostly in-kernel triggers */
> +	&bench_trig_kprobe_fast,
> +	&bench_trig_kretprobe_fast,
> +	&bench_trig_kprobe_multi_fast,
> +	&bench_trig_kretprobe_multi_fast,
> +	&bench_trig_fentry_fast,
> +	&bench_trig_fexit_fast,
> +	/* uprobes */
>  	&bench_trig_uprobe_base,
>  	&bench_trig_uprobe_nop,
>  	&bench_trig_uretprobe_nop,
> @@ -557,6 +574,7 @@ static const struct bench *benchs[] = {
>  	&bench_trig_uretprobe_push,
>  	&bench_trig_uprobe_ret,
>  	&bench_trig_uretprobe_ret,
> +	/* ringbuf/perfbuf benchmarks */
>  	&bench_rb_libbpf,
>  	&bench_rb_custom,
>  	&bench_pb_libbpf,
> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index 8fbc78d5f8a4..d6c87180c887 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -1,11 +1,54 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2020 Facebook */
>  #define _GNU_SOURCE
> +#include <argp.h>
>  #include <unistd.h>
> +#include <stdint.h>
>  #include "bench.h"
>  #include "trigger_bench.skel.h"
>  #include "trace_helpers.h"
>  
> +static struct {
> +	__u32 batch_iters;
> +} args = {
> +	.batch_iters = 100,
> +};
> +
> +enum {
> +	ARG_TRIG_BATCH_ITERS = 7000,
> +};
> +
> +static const struct argp_option opts[] = {
> +	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
> +		"Number of in-kernel iterations per one driver test run"},
> +	{},
> +};
> +
> +static error_t parse_arg(int key, char *arg, struct argp_state *state)
> +{
> +	long ret;
> +
> +	switch (key) {
> +	case ARG_TRIG_BATCH_ITERS:
> +		ret = strtol(arg, NULL, 10);
> +		if (ret < 1 || ret > UINT_MAX) {
> +			fprintf(stderr, "invalid --trig-batch-iters value");
> +			argp_usage(state);
> +		}
> +		args.batch_iters = ret;
> +		break;
> +	default:
> +		return ARGP_ERR_UNKNOWN;
> +	}
> +
> +	return 0;
> +}
> +
> +const struct argp bench_trigger_fast_argp = {
> +	.options = opts,
> +	.parser = parse_arg,
> +};
> +
>  /* adjust slot shift in inc_hits() if changing */
>  #define MAX_BUCKETS 256
>  
> @@ -70,6 +113,16 @@ static void *trigger_producer(void *input)
>  	return NULL;
>  }
>  
> +static void *trigger_producer_fast(void *input)
> +{
> +	int fd = bpf_program__fd(ctx.skel->progs.trigger_driver);
> +
> +	while (true)
> +		bpf_prog_test_run_opts(fd, NULL);
> +
> +	return NULL;
> +}
> +
>  static void trigger_measure(struct bench_res *res)
>  {
>  	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
> @@ -77,13 +130,23 @@ static void trigger_measure(struct bench_res *res)
>  
>  static void setup_ctx(void)
>  {
> +	int err;
> +
>  	setup_libbpf();
>  
> -	ctx.skel = trigger_bench__open_and_load();
> +	ctx.skel = trigger_bench__open();
>  	if (!ctx.skel) {
>  		fprintf(stderr, "failed to open skeleton\n");
>  		exit(1);
>  	}
> +
> +	ctx.skel->rodata->batch_iters = args.batch_iters;
> +
> +	err = trigger_bench__load(ctx.skel);
> +	if (err) {
> +		fprintf(stderr, "failed to open skeleton\n");
> +		exit(1);
> +	}
>  }
>  
>  static void attach_bpf(struct bpf_program *prog)
> @@ -157,6 +220,44 @@ static void trigger_fmodret_setup(void)
>  	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
>  }
>  
> +/* Fast, mostly in-kernel triggering setups */
> +
> +static void trigger_kprobe_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_fast);
> +}
> +
> +static void trigger_kretprobe_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_fast);
> +}
> +
> +static void trigger_kprobe_multi_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_fast);
> +}
> +
> +static void trigger_kretprobe_multi_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_fast);
> +}
> +
> +static void trigger_fentry_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_fentry_fast);
> +}
> +
> +static void trigger_fexit_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_fexit_fast);
> +}
> +
>  /* make sure call is not inlined and not avoided by compiler, so __weak and
>   * inline asm volatile in the body of the function
>   *
> @@ -385,6 +486,26 @@ const struct bench bench_trig_fmodret = {
>  	.report_final = hits_drops_report_final,
>  };
>  
> +/* fast (staying mostly in kernel) kprobe/fentry benchmarks */
> +#define BENCH_TRIG_FAST(KIND, NAME)					\
> +const struct bench bench_trig_##KIND = {				\
> +	.name = "trig-" NAME,						\
> +	.setup = trigger_##KIND##_setup,				\
> +	.producer_thread = trigger_producer_fast,			\
> +	.measure = trigger_measure,					\
> +	.report_progress = hits_drops_report_progress,			\
> +	.report_final = hits_drops_report_final,			\
> +	.argp = &bench_trigger_fast_argp,				\
> +}
> +
> +BENCH_TRIG_FAST(kprobe_fast, "kprobe-fast");
> +BENCH_TRIG_FAST(kretprobe_fast, "kretprobe-fast");
> +BENCH_TRIG_FAST(kprobe_multi_fast, "kprobe-multi-fast");
> +BENCH_TRIG_FAST(kretprobe_multi_fast, "kretprobe-multi-fast");
> +BENCH_TRIG_FAST(fentry_fast, "fentry-fast");
> +BENCH_TRIG_FAST(fexit_fast, "fexit-fast");
> +
> +/* uprobe benchmarks */
>  const struct bench bench_trig_uprobe_base = {
>  	.name = "trig-uprobe-base",
>  	.setup = NULL, /* no uprobe/uretprobe is attached */
> diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> index 78e83f243294..fee069ac930b 100755
> --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> @@ -2,8 +2,12 @@
>  
>  set -eufo pipefail
>  
> -for i in base tp rawtp kprobe fentry fmodret
> +for i in uprobe-base base tp rawtp \
> +	 kprobe kprobe-multi kretprobe kretprobe-multi \
> +	 fentry fexit fmodret \
> +	 kprobe-fast kprobe-multi-fast kretprobe-fast kretprobe-multi-fast \
> +	 fentry-fast fexit-fast
>  do
>  	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
> -	printf "%-10s: %s\n" $i "$summary"
> +	printf "%-20s: %s\n" $i "$summary"
>  done
> diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
> index 42ec202015ed..2886c2cb3570 100644
> --- a/tools/testing/selftests/bpf/progs/trigger_bench.c
> +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
> @@ -1,6 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
>  // Copyright (c) 2020 Facebook
> -
>  #include <linux/bpf.h>
>  #include <asm/unistd.h>
>  #include <bpf/bpf_helpers.h>
> @@ -103,3 +102,58 @@ int bench_trigger_uprobe(void *ctx)
>  	inc_counter();
>  	return 0;
>  }
> +
> +const volatile int batch_iters = 0;
> +
> +SEC("raw_tp")
> +int trigger_driver(void *ctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < batch_iters; i++)
> +		(void)bpf_get_smp_processor_id(); /* attach here to benchmark */
> +
> +	return 0;
> +}
> +
> +SEC("kprobe/bpf_get_smp_processor_id")
> +int bench_trigger_kprobe_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kretprobe/bpf_get_smp_processor_id")
> +int bench_trigger_kretprobe_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kprobe.multi/bpf_get_smp_processor_id")
> +int bench_trigger_kprobe_multi_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kretprobe.multi/bpf_get_smp_processor_id")
> +int bench_trigger_kretprobe_multi_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("fentry/bpf_get_smp_processor_id")
> +int bench_trigger_fentry_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("fexit/bpf_get_smp_processor_id")
> +int bench_trigger_fexit_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> -- 
> 2.43.0
> 


* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15  5:18 ` [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks Andrii Nakryiko
  2024-03-15  9:10   ` Jiri Olsa
@ 2024-03-15 16:03   ` Alexei Starovoitov
  2024-03-15 16:31     ` Andrii Nakryiko
  1 sibling, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2024-03-15 16:03 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Kernel Team, Jiri Olsa

On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> one syscall execution and BPF program run. While we use a fast
> get_pgid() syscall, syscall overhead can still be non-trivial.
>
> This patch adds kprobe/fentry set of benchmarks significantly amortizing
> the cost of syscall vs actual BPF triggering overhead. We do this by
> employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> which does a tight parameterized loop calling cheap BPF helper
> (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> attached for benchmarking.
>
> This way 1 bpf() syscall causes N executions of BPF program being
> benchmarked. N defaults to 100, but can be adjusted with
> --trig-batch-iters CLI argument.
>
> Results speak for themselves:
>
> $ ./run_bench_trigger.sh
> uprobe-base         :  138.054 ± 0.556M/s
> base                :   16.650 ± 0.123M/s

What's going on here? Why are the two bases so different?
I thought the "base" is what all other benchmarks
should be compared against.
The "base" is the theoretical maximum for all benches.
Or should uprobe* benches be compared with uprobe-base
while all other benches are compared with 'base'?
Probably not anymore due to this new approach.
The 'base' has kinda lost its value then.

> tp                  :   11.068 ± 0.100M/s
> rawtp               :   14.087 ± 0.511M/s
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> fentry              :   14.544 ± 0.112M/s
> fexit               :   10.637 ± 0.073M/s
> fmodret             :   11.357 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> fentry-fast         :   31.475 ± 0.254M/s
> fexit-fast          :   17.379 ± 0.195M/s

I think the "-fast" suffix doesn't really fit here.
It's a different way to benchmark fexit vs kprobe overhead.

I think the old approach should be replaced with the new one.
There is no value in keeping the old stuff around now that it's
clear that it's measuring much more than it should.

> One huge and not yet explained deviation is a slowdown of
> kretprobe-multi, we should look into that separately.
>
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s

That is weird indeed. Since it's a single-CPU run
it cannot be due to contention.
My gut feel is that something odd is here:
#ifdef CONFIG_X86_KERNEL_IBT
static unsigned long get_entry_ip(unsigned long fentry_ip)

Did you have it in your config?

> +SEC("raw_tp")
> +int trigger_driver(void *ctx)
> +{
> +       int i;
> +
> +       for (i = 0; i < batch_iters; i++)
> +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */

bpf_get_numa_node_id() is probably even faster and
not subject to DEBUG configs.


* Re: [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters
  2024-03-15  5:18 [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Andrii Nakryiko
  2024-03-15  5:18 ` [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks Andrii Nakryiko
@ 2024-03-15 16:08 ` Alexei Starovoitov
  2024-03-15 16:22   ` Andrii Nakryiko
  1 sibling, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2024-03-15 16:08 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Kernel Team

On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> When benchmarking with multiple threads (-pN, where N>1), we start
> contending on single atomic counter that both BPF trigger benchmarks are
> using, as well as "baseline" tests in user space (trig-base and
> trig-uprobe-base benchmarks). As such, we start bottlenecking on
> something completely irrelevant to benchmark at hand.
>
> Scale counting up by using per-CPU counters on BPF side. On use space
> side we do the next best thing: hash thread ID to approximate per-CPU
> behavior. It seems to work quite well in practice.
>
> To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
> 16, and 32 threads:
>   - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
>   - trig-base (get_pgid() syscall, atomic counter in user-space);
>   - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
>     counter on BPF side).
>
> Command used:
>
>   for b in uprobe-base base fentry; do \
>     for p in 1 2 4 8 16 32; do \
>       printf "%-11s %2d: %s\n" $b $p \
>         "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
>     done; \
>   done
>
> Before these changes, aggregate throughput across all threads doesn't
> scale well with number of threads, it actually even falls sharply for
> uprobe-base due to a very high contention:
>
>   uprobe-base  1:  138.998 ± 0.650M/s
>   uprobe-base  2:   70.526 ± 1.147M/s
>   uprobe-base  4:   63.114 ± 0.302M/s
>   uprobe-base  8:   54.177 ± 0.138M/s
>   uprobe-base 16:   45.439 ± 0.057M/s
>   uprobe-base 32:   37.163 ± 0.242M/s
>   base         1:   16.940 ± 0.182M/s
>   base         2:   19.231 ± 0.105M/s
>   base         4:   21.479 ± 0.038M/s
>   base         8:   23.030 ± 0.037M/s
>   base        16:   22.034 ± 0.004M/s
>   base        32:   18.152 ± 0.013M/s
>   fentry       1:   14.794 ± 0.054M/s
>   fentry       2:   17.341 ± 0.055M/s
>   fentry       4:   23.792 ± 0.024M/s
>   fentry       8:   21.557 ± 0.047M/s
>   fentry      16:   21.121 ± 0.004M/s
>   fentry      32:   17.067 ± 0.023M/s
>
> After these changes, we see almost perfect linear scaling, as expected.
> The sub-linear scaling when going from 8 to 16 threads is interesting
> and consistent on my test machine, but I haven't investigated what is
> causing it this peculiar slowdown (across all benchmarks, could be due
> to hyperthreading effects, not sure).
>
>   uprobe-base  1:  139.980 ± 0.648M/s
>   uprobe-base  2:  270.244 ± 0.379M/s
>   uprobe-base  4:  532.044 ± 1.519M/s
>   uprobe-base  8: 1004.571 ± 3.174M/s
>   uprobe-base 16: 1720.098 ± 0.744M/s
>   uprobe-base 32: 3506.659 ± 8.549M/s
>   base         1:   16.869 ± 0.071M/s
>   base         2:   33.007 ± 0.092M/s
>   base         4:   64.670 ± 0.203M/s
>   base         8:  121.969 ± 0.210M/s
>   base        16:  207.832 ± 0.112M/s
>   base        32:  424.227 ± 1.477M/s
>   fentry       1:   14.777 ± 0.087M/s
>   fentry       2:   28.575 ± 0.146M/s
>   fentry       4:   56.234 ± 0.176M/s
>   fentry       8:  106.095 ± 0.385M/s
>   fentry      16:  181.440 ± 0.032M/s
>   fentry      32:  369.131 ± 0.693M/s
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
>  .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
>  2 files changed, 62 insertions(+), 17 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index ace0d1011a8e..8fbc78d5f8a4 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -1,15 +1,45 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2020 Facebook */
> +#define _GNU_SOURCE
> +#include <unistd.h>
>  #include "bench.h"
>  #include "trigger_bench.skel.h"
>  #include "trace_helpers.h"
>
> +/* adjust slot shift in inc_hits() if changing */
> +#define MAX_BUCKETS 256
> +
>  /* BPF triggering benchmarks */
>  static struct trigger_ctx {
>         struct trigger_bench *skel;
>  } ctx;
>
> -static struct counter base_hits;
> +static struct counter base_hits[MAX_BUCKETS];
> +
> +static __always_inline void inc_counter(struct counter *counters)
> +{
> +       static __thread int tid = 0;
> +       unsigned slot;
> +
> +       if (unlikely(tid == 0))
> +               tid = gettid();
> +
> +       /* multiplicative hashing, it's fast */
> +       slot = 2654435769U * tid;
> +       slot >>= 24;
> +
> +       atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
> +}
> +
> +static __always_inline long sum_and_reset_counters(struct counter *counters)

Why __always_inline?
It's a slow path. Inlining or not shouldn't make any difference.

> +{
> +       int i;
> +       long sum = 0;
> +
> +       for (i = 0; i < MAX_BUCKETS; i++)
> +               sum += atomic_swap(&counters[i].value, 0);
> +       return sum;
> +}


* Re: [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters
  2024-03-15 16:08 ` [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Alexei Starovoitov
@ 2024-03-15 16:22   ` Andrii Nakryiko
  0 siblings, 0 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15 16:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Kernel Team

On Fri, Mar 15, 2024 at 9:08 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > When benchmarking with multiple threads (-pN, where N>1), we start
> > contending on single atomic counter that both BPF trigger benchmarks are
> > using, as well as "baseline" tests in user space (trig-base and
> > trig-uprobe-base benchmarks). As such, we start bottlenecking on
> > something completely irrelevant to benchmark at hand.
> >
> > Scale counting up by using per-CPU counters on BPF side. On use space
> > side we do the next best thing: hash thread ID to approximate per-CPU
> > behavior. It seems to work quite well in practice.
> >
> > To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
> > 16, and 32 threads:
> >   - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
> >   - trig-base (get_pgid() syscall, atomic counter in user-space);
> >   - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
> >     counter on BPF side).
> >
> > Command used:
> >
> >   for b in uprobe-base base fentry; do \
> >     for p in 1 2 4 8 16 32; do \
> >       printf "%-11s %2d: %s\n" $b $p \
> >         "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
> >     done; \
> >   done
> >
> > Before these changes, aggregate throughput across all threads doesn't
> > scale well with number of threads, it actually even falls sharply for
> > uprobe-base due to a very high contention:
> >
> >   uprobe-base  1:  138.998 ± 0.650M/s
> >   uprobe-base  2:   70.526 ± 1.147M/s
> >   uprobe-base  4:   63.114 ± 0.302M/s
> >   uprobe-base  8:   54.177 ± 0.138M/s
> >   uprobe-base 16:   45.439 ± 0.057M/s
> >   uprobe-base 32:   37.163 ± 0.242M/s
> >   base         1:   16.940 ± 0.182M/s
> >   base         2:   19.231 ± 0.105M/s
> >   base         4:   21.479 ± 0.038M/s
> >   base         8:   23.030 ± 0.037M/s
> >   base        16:   22.034 ± 0.004M/s
> >   base        32:   18.152 ± 0.013M/s
> >   fentry       1:   14.794 ± 0.054M/s
> >   fentry       2:   17.341 ± 0.055M/s
> >   fentry       4:   23.792 ± 0.024M/s
> >   fentry       8:   21.557 ± 0.047M/s
> >   fentry      16:   21.121 ± 0.004M/s
> >   fentry      32:   17.067 ± 0.023M/s
> >
> > After these changes, we see almost perfect linear scaling, as expected.
> > The sub-linear scaling when going from 8 to 16 threads is interesting
> > and consistent on my test machine, but I haven't investigated what is
> > causing it this peculiar slowdown (across all benchmarks, could be due
> > to hyperthreading effects, not sure).
> >
> >   uprobe-base  1:  139.980 ± 0.648M/s
> >   uprobe-base  2:  270.244 ± 0.379M/s
> >   uprobe-base  4:  532.044 ± 1.519M/s
> >   uprobe-base  8: 1004.571 ± 3.174M/s
> >   uprobe-base 16: 1720.098 ± 0.744M/s
> >   uprobe-base 32: 3506.659 ± 8.549M/s
> >   base         1:   16.869 ± 0.071M/s
> >   base         2:   33.007 ± 0.092M/s
> >   base         4:   64.670 ± 0.203M/s
> >   base         8:  121.969 ± 0.210M/s
> >   base        16:  207.832 ± 0.112M/s
> >   base        32:  424.227 ± 1.477M/s
> >   fentry       1:   14.777 ± 0.087M/s
> >   fentry       2:   28.575 ± 0.146M/s
> >   fentry       4:   56.234 ± 0.176M/s
> >   fentry       8:  106.095 ± 0.385M/s
> >   fentry      16:  181.440 ± 0.032M/s
> >   fentry      32:  369.131 ± 0.693M/s
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
> >  .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
> >  2 files changed, 62 insertions(+), 17 deletions(-)
> >
> > diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > index ace0d1011a8e..8fbc78d5f8a4 100644
> > --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > @@ -1,15 +1,45 @@
> >  // SPDX-License-Identifier: GPL-2.0
> >  /* Copyright (c) 2020 Facebook */
> > +#define _GNU_SOURCE
> > +#include <unistd.h>
> >  #include "bench.h"
> >  #include "trigger_bench.skel.h"
> >  #include "trace_helpers.h"
> >
> > +/* adjust slot shift in inc_hits() if changing */
> > +#define MAX_BUCKETS 256
> > +
> >  /* BPF triggering benchmarks */
> >  static struct trigger_ctx {
> >         struct trigger_bench *skel;
> >  } ctx;
> >
> > -static struct counter base_hits;
> > +static struct counter base_hits[MAX_BUCKETS];
> > +
> > +static __always_inline void inc_counter(struct counter *counters)
> > +{
> > +       static __thread int tid = 0;
> > +       unsigned slot;
> > +
> > +       if (unlikely(tid == 0))
> > +               tid = gettid();
> > +
> > +       /* multiplicative hashing, it's fast */
> > +       slot = 2654435769U * tid;
> > +       slot >>= 24;
> > +
> > +       atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
> > +}
> > +
> > +static __always_inline long sum_and_reset_counters(struct counter *counters)
>
> Why __always_inline ?
> It's a slow path. Inlining or not shouldn't make any difference.

It was a copy/paste of the inc_counter() signature originally; you are
right, it's not necessary.

>
> > +{
> > +       int i;
> > +       long sum = 0;
> > +
> > +       for (i = 0; i < MAX_BUCKETS; i++)
> > +               sum += atomic_swap(&counters[i].value, 0);
> > +       return sum;
> > +}


* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15 16:03   ` Alexei Starovoitov
@ 2024-03-15 16:31     ` Andrii Nakryiko
  2024-03-15 16:58       ` Andrii Nakryiko
  0 siblings, 1 reply; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15 16:31 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Kernel Team, Jiri Olsa

On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > one syscall execution and BPF program run. While we use a fast
> > get_pgid() syscall, syscall overhead can still be non-trivial.
> >
> > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > the cost of syscall vs actual BPF triggering overhead. We do this by
> > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > which does a tight parameterized loop calling cheap BPF helper
> > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > attached for benchmarking.
> >
> > This way 1 bpf() syscall causes N executions of BPF program being
> > benchmarked. N defaults to 100, but can be adjusted with
> > --trig-batch-iters CLI argument.
> >
> > Results speak for themselves:
> >
> > $ ./run_bench_trigger.sh
> > uprobe-base         :  138.054 ± 0.556M/s
> > base                :   16.650 ± 0.123M/s
>
> What's going on here? Why two bases are so different?
> I thought the "base" is what all other benchmarks
> should be compared against.
> The "base" is the theoretical maximum for all benchs.
> Or uprobe* benches should be compared with uprobe-base
> while all other benchs compared with 'base' ?
> Probably not anymore due to this new approach.
> The 'base' is kinda lost its value then.

Naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
the baseline compared to the previous trigger benchmarks, where we used
a syscall to trigger kprobe/fentry programs. uprobe-base is just a
user-space loop that does atomic_inc() in a tight loop. So uprobe-base
is basically a measure of how fast the CPU is, but it's unrealistic to
expect either fentry/kprobe to get close to it, and especially
unrealistic to expect uprobes to get close to it.

Naming suggestions are welcome, though.

I'm not sure what the "base" should be for the xxx-fast benchmarks.
Doing a counter loop in the driver BPF program, perhaps? Would you like
me to add a base-fast benchmark doing just that?
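
Something like this hypothetical trigger_count() driver, reusing
batch_iters and inc_counter() from the patch (sketch only, not part of
this series):

/* hypothetical 'base-fast' driver */
SEC("raw_tp")
int trigger_count(void *ctx)
{
	int i;

	for (i = 0; i < batch_iters; i++)
		inc_counter(); /* no helper call, nothing attached */

	return 0;
}

That would measure just the batched loop plus counting overhead.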


>
> > tp                  :   11.068 ± 0.100M/s
> > rawtp               :   14.087 ± 0.511M/s
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > fentry              :   14.544 ± 0.112M/s
> > fexit               :   10.637 ± 0.073M/s
> > fmodret             :   11.357 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> > fexit-fast          :   17.379 ± 0.195M/s
>
> I think the "-fast" suffix doesn't really fit here.
> It's a different way to benchmark fexit vs kprobe overhead.
>
> I think the old approach should be replaced with the new one.
> There is no value in keeping the old stuff around when
> now it's clear that it's measuring much more than it should.

I'd say we should keep them. Note that fmod_ret, tp, and raw_tp can't
be benchmarked the same way, so we don't have -fast variants for them.
If we get rid of the pre-existing fentry/kprobe benchmarks, we won't be
able to compare raw_tp/fmod_ret/fexit.s to them.

And to be clear, in practice I'm not comparing xxx-fast with xxx at
all. I just put all the numbers in this patch summary to show that we
indeed have a significant performance loss on the syscall.

In practice when comparing kprobe vs fentry, it should be kprobe-fast
vs fentry-fast.

>
> > One huge and not yet explained deviation is a slowdown of
> > kretprobe-multi, we should look into that separately.
> >
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
>
> That is weird indeed. Since it's a single cpu run
> it cannot be due to contention.
> My gut feel is that something odd is here:
> #ifdef CONFIG_X86_KERNEL_IBT
> static unsigned long get_entry_ip(unsigned long fentry_ip)
>
> Did you have it in your config?

No, this was without IBT. I do have an upcoming small patch to
eliminate an unnecessary copy_kernel_nofault() in the IBT path, but I
don't think this is it. It should be pretty clear from profiling where
we are losing CPU; I just didn't have the time/desire to look into it,
as I have tons of small things being worked on in parallel and didn't
want to add another one.

>
> > +SEC("raw_tp")
> > +int trigger_driver(void *ctx)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < batch_iters; i++)
> > +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
>
> bpf_get_numa_node_id() is probably even faster and
> not subject to DEBUG configs.

Sure, I can switch to that, np.
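
Something along these lines, I assume (sketch only, not necessarily the
final code):

  // SPDX-License-Identifier: GPL-2.0
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* adjustable from user space via --trig-batch-iters */
  const volatile int batch_iters = 100;

  SEC("raw_tp")
  int trigger_driver(void *ctx)
  {
          int i;

          for (i = 0; i < batch_iters; i++)
                  /* attach kprobe/fentry here; this helper is cheap and,
                   * per the suggestion above, not subject to DEBUG configs
                   */
                  (void)bpf_get_numa_node_id();

          return 0;
  }

  char _license[] SEC("license") = "GPL";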

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15 16:31     ` Andrii Nakryiko
@ 2024-03-15 16:58       ` Andrii Nakryiko
  2024-03-15 18:47         ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15 16:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Kernel Team, Jiri Olsa

On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > >
> > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > one syscall execution and BPF program run. While we use a fast
> > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > >
> > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > which does a tight parameterized loop calling cheap BPF helper
> > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > attached for benchmarking.
> > >
> > > This way 1 bpf() syscall causes N executions of BPF program being
> > > benchmarked. N defaults to 100, but can be adjusted with
> > > --trig-batch-iters CLI argument.
> > >
> > > Results speak for themselves:
> > >
> > > $ ./run_bench_trigger.sh
> > > uprobe-base         :  138.054 ± 0.556M/s
> > > base                :   16.650 ± 0.123M/s
> >
> > What's going on here? Why two bases are so different?
> > I thought the "base" is what all other benchmarks
> > should be compared against.
> > The "base" is the theoretical maximum for all benchs.
> > Or uprobe* benches should be compared with uprobe-base
> > while all other benchs compared with 'base' ?
> > Probably not anymore due to this new approach.
> > The 'base' is kinda lost its value then.
>
> naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> base compared to previous trigger benchmarks where we used syscall to
> trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> that does atomic_inc() in a tight loop. So uprobe-base is basically
> the measure of how fast CPU is, but it's unrealistic to expect either
> fentry/kprobe to get close, and especially it's unrealistic to expect
> uprobes to get close to it.
>
> Naming suggestions are welcome, though.
>
> I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> a counter loop in driver BPF program, perhaps? Would you like me to
> add base-fast benchmark doing just that?
>

How about this.

base -> base-syscall (i.e., "syscall-calling baseline")
uprobe-base -> base-user-loop (i.e., "user space-only baseline")
and then for "fast" baseline we add "base-kernel-loop" for
"kernel-side looping baseline"

Or we can use some naming based on "counting": base-syscall-count,
base-user-count, base-kernel-count?

Any preferences, anyone?

>
> >
> > > tp                  :   11.068 ± 0.100M/s
> > > rawtp               :   14.087 ± 0.511M/s
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > fentry              :   14.544 ± 0.112M/s
> > > fexit               :   10.637 ± 0.073M/s
> > > fmodret             :   11.357 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > > fexit-fast          :   17.379 ± 0.195M/s
> >
> > I think the "-fast" suffix doesn't really fit here.
> > It's a different way to benchmark fexit vs kprobe overhead.
> >
> > I think the old approach should be replaced with the new one.
> > There is no value in keeping the old stuff around when
> > now it's clear that it's measuring much more than it should.
>
> I'd say we should keep them. Note that fmod_ret, tp, and raw_tp can't
> be benchmarked the same way. So we don't have -fast variants for them.
> If we get rid of pre-existing fentry/kprobe benchmarks, we won't be
> able to compare raw_tp/fmod_ret/fexit.s to them.
>
> And to be clear, in practice I'm not comparing xxx-fast with xxx at
> all. I just put all the numbers in this patch summary to show that we
> indeed have a significant performance loss due to syscall overhead.
>
> In practice when comparing kprobe vs fentry, it should be kprobe-fast
> vs fentry-fast.
>
> >
> > > One huge and not yet explained deviation is a slowdown of
> > > kretprobe-multi, we should look into that separately.
> > >
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> >
> > That is weird indeed. Since it's a single cpu run
> > it cannot be due to contention.
> > My gut feel is that something odd is here:
> > #ifdef CONFIG_X86_KERNEL_IBT
> > static unsigned long get_entry_ip(unsigned long fentry_ip)
> >
> > Did you have it in your config?
>
> No, this was without IBT. I do have an upcoming small patch to
> eliminate an unnecessary copy_kernel_nofault() in the IBT path, but I
> don't think this is it. It should be pretty clear from profiling where
> we are losing CPU; I just didn't have the time/desire to look into it,
> as I have tons of small things being worked on in parallel and didn't
> want to add another one.
>
> >
> > > +SEC("raw_tp")
> > > +int trigger_driver(void *ctx)
> > > +{
> > > +       int i;
> > > +
> > > +       for (i = 0; i < batch_iters; i++)
> > > +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
> >
> > bpf_get_numa_node_id() is probably even faster and
> > not subject to DEBUG configs.
>
> Sure, I can switch to that, np.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15 16:58       ` Andrii Nakryiko
@ 2024-03-15 18:47         ` Alexei Starovoitov
  2024-03-15 21:07           ` Andrii Nakryiko
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2024-03-15 18:47 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Kernel Team, Jiri Olsa

On Fri, Mar 15, 2024 at 9:59 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > >
> > > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > > one syscall execution and BPF program run. While we use a fast
> > > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > > >
> > > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > > which does a tight parameterized loop calling cheap BPF helper
> > > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > > attached for benchmarking.
> > > >
> > > > This way 1 bpf() syscall causes N executions of BPF program being
> > > > benchmarked. N defaults to 100, but can be adjusted with
> > > > --trig-batch-iters CLI argument.
> > > >
> > > > Results speak for themselves:
> > > >
> > > > $ ./run_bench_trigger.sh
> > > > uprobe-base         :  138.054 ± 0.556M/s
> > > > base                :   16.650 ± 0.123M/s
> > >
> > > What's going on here? Why two bases are so different?
> > > I thought the "base" is what all other benchmarks
> > > should be compared against.
> > > The "base" is the theoretical maximum for all benchs.
> > > Or uprobe* benches should be compared with uprobe-base
> > > while all other benchs compared with 'base' ?
> > > Probably not anymore due to this new approach.
> > > The 'base' is kinda lost its value then.
> >
> > naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> > base compared to previous trigger benchmarks where we used syscall to
> > trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> > that does atomic_inc() in a tight loop. So uprobe-base is basically
> > the measure of how fast CPU is, but it's unrealistic to expect either
> > fentry/kprobe to get close, and especially it's unrealistic to expect
> > uprobes to get close to it.
> >
> > Naming suggestions are welcome, though.
> >
> > I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> > a counter loop in driver BPF program, perhaps? Would you like me to
> > add base-fast benchmark doing just that?
> >
>
> How about this.
>
> base -> base-syscall (i.e., "syscall-calling baseline")
> uprobe-base -> base-user-loop (i.e., "user space-only baseline")
> and then for "fast" baseline we add "base-kernel-loop" for
> "kernel-side looping baseline"

I think the "base" part in both doesn't fit.
Maybe
base -> syscall_loop
uprobe-base -> user_space_loop

since the first is only doing syscall in a loop
and the 2nd doesn't even go to the kernel.

>
> Or we can use some naming based on "counting": base-syscall-count,
> base-user-count, base-kernel-count?

Instead of fentry-fast, fentry-batch
sounds more accurate.

But in general it doesn't feel right to keep the existing
benchs, since we've discovered that syscall overhead
affects the measurement so much.
I think we should bite the bullet and use bpf_testmod in bench tool.
Then we can add a tracepoint there and call it in proper kernel loop,
then add fmodret-able empty function and call it in another loop.

Then the bench tool can trigger them via bpf->kfunc->that_looping_helper
and capture as accurately as possible the overhead of
tp, raw_tp, kprobe, fentry, fmodret.
And all of them will be comparable to each other.
Right now fmodret vs fentry are comparable, but both include the
syscall time add-on, which makes the comparison questionable.
Especially consider kernels with mitigations: syscall overhead is
even higher and the delta is smaller, to the point that fentry vs
fexit might measure as equal.
A benchmarking tool should be a gold standard and an accurate tool
for performance measurement.

The trick:
+       for (i = 0; i < batch_iters; i++)
+               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */

is neat, but since it works for fentry/kprobe only
we need testmod to generalize it for fmodret and tracepoints.
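
Roughly something like this on the testmod side (purely a sketch with
hypothetical names; tracepoint definition and kfunc registration
boilerplate omitted):

  noinline int bpf_testmod_bench_target(int i)
  {
          /* nearly-empty function for fentry/fexit/fmod_ret/kprobe to attach to */
          return i;
  }

  __bpf_kfunc void bpf_testmod_bench_trigger(int iters)
  {
          int i;

          for (i = 0; i < iters; i++) {
                  trace_bpf_testmod_bench(i);     /* hypothetical tracepoint */
                  bpf_testmod_bench_target(i);
          }
  }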

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15 18:47         ` Alexei Starovoitov
@ 2024-03-15 21:07           ` Andrii Nakryiko
  0 siblings, 0 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15 21:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Kernel Team, Jiri Olsa

On Fri, Mar 15, 2024 at 11:48 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:59 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > > >
> > > > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > > > one syscall execution and BPF program run. While we use a fast
> > > > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > > > >
> > > > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > > > which does a tight parameterized loop calling cheap BPF helper
> > > > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > > > attached for benchmarking.
> > > > >
> > > > > This way 1 bpf() syscall causes N executions of BPF program being
> > > > > benchmarked. N defaults to 100, but can be adjusted with
> > > > > --trig-batch-iters CLI argument.
> > > > >
> > > > > Results speak for themselves:
> > > > >
> > > > > $ ./run_bench_trigger.sh
> > > > > uprobe-base         :  138.054 ± 0.556M/s
> > > > > base                :   16.650 ± 0.123M/s
> > > >
> > > > What's going on here? Why two bases are so different?
> > > > I thought the "base" is what all other benchmarks
> > > > should be compared against.
> > > > The "base" is the theoretical maximum for all benchs.
> > > > Or uprobe* benches should be compared with uprobe-base
> > > > while all other benchs compared with 'base' ?
> > > > Probably not anymore due to this new approach.
> > > > The 'base' is kinda lost its value then.
> > >
> > > naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> > > base compared to previous trigger benchmarks where we used syscall to
> > > trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> > > that does atomic_inc() in a tight loop. So uprobe-base is basically
> > > the measure of how fast CPU is, but it's unrealistic to expect either
> > > fentry/kprobe to get close, and especially it's unrealistic to expect
> > > uprobes to get close to it.
> > >
> > > Naming suggestions are welcome, though.
> > >
> > > I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> > > a counter loop in driver BPF program, perhaps? Would you like me to
> > > add base-fast benchmark doing just that?
> > >
> >
> > How about this.
> >
> > base -> base-syscall (i.e., "syscall-calling baseline")
> > uprobe-base -> base-user-loop (i.e., "user space-only baseline")
> > and then for "fast" baseline we add "base-kernel-loop" for
> > "kernel-side looping baseline"
>
> I think the "base" part in both doesn't fit.
> Maybe
> base -> syscall_loop
> uprobe-base -> user_space_loop

ok

>
> since the first is only doing syscall in a loop
> and the 2nd doesn't even go to the kernel.
>
> >
> > Or we can use some naming based on "counting": base-syscall-count,
> > base-user-count, base-kernel-count?
>
> Instead of fentry-fast, fentry-batch
> sounds more accurate.
>

ok, -batch it is

> But in general it doesn't feel right to keep the existing
> benchs, since we've discovered that syscall overhead
> affects the measurement so much.
> I think we should bite the bullet and use bpf_testmod in bench tool.

This will worsen the logistics of using it for any benchmark so much
that it will be pretty much a useless tool.

I can try to find some other way to trigger a tracepoint and fmod_ret
without relying on custom modules, but I'd rather just remove those
benchmarks altogether than add a dependency on bpf_testmod.

> Then we can add a tracepoint there and call it in proper kernel loop,
> then add fmodret-able empty function and call it in another loop.
>
> Then the bench tool can trigger them via bpf->kfunc->that_looping_helper
> and capture as accurately as possible the overhead of
> tp, raw_tp, kprobe, fentry, fmodret.
> And all of them will be comparable to each other.
> Right now fmodret vs fentry are comparable, but both include the
> syscall time add-on, which makes the comparison questionable.
> Especially consider kernels with mitigations: syscall overhead is
> even higher and the delta is smaller, to the point that fentry vs
> fexit might measure as equal.

It's still useful to be able to do this, even if syscall overhead is
very measurable, because when profiling it's easy to filter out
syscall overhead and compare purely fentry vs fexit overheads.

Even with mitigations on, the differences between fentry and fexit are
measurable. And bench is a nice tool for generating non-stop work so
that profiling data can be captured conveniently, while a custom test
module would make that much harder.


> A benchmarking tool should be a gold standard and an accurate tool
> for performance measurement.
>

This was neither a goal, nor do I sign up for that aspiration :) It's
first and foremost a small and useful tool for doing local performance
optimization work when it comes to BPF-related functionality in the
kernel. And its portability is a huge part of this.

> The trick:
> +       for (i = 0; i < batch_iters; i++)
> +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
>
> is neat, but since it works for fentry/kprobe only
> we need testmod to generalize it for fmodret and tracepoints.

see above, I agree it would be nice to have (and I'll look for some
other ways to achieve this), but it is not really a required property
to make the benchmark useful

For now, I'll drop this patch and re-submit the first one with
__always_inline removed just to get those prerequisites landed.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15  9:10   ` Jiri Olsa
@ 2024-03-15 22:22     ` Andrii Nakryiko
  2024-03-17 20:38       ` Jiri Olsa
  0 siblings, 1 reply; 12+ messages in thread
From: Andrii Nakryiko @ 2024-03-15 22:22 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: Andrii Nakryiko, bpf, ast, daniel, martin.lau, kernel-team

On Fri, Mar 15, 2024 at 2:10 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > one syscall execution and BPF program run. While we use a fast
> > get_pgid() syscall, syscall overhead can still be non-trivial.
> >
> > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > the cost of syscall vs actual BPF triggering overhead. We do this by
> > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > which does a tight parameterized loop calling cheap BPF helper
> > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > attached for benchmarking.
> >
> > This way 1 bpf() syscall causes N executions of BPF program being
> > benchmarked. N defaults to 100, but can be adjusted with
> > --trig-batch-iters CLI argument.
> >
> > Results speak for themselves:
> >
> > $ ./run_bench_trigger.sh
> > uprobe-base         :  138.054 ± 0.556M/s
> > base                :   16.650 ± 0.123M/s
> > tp                  :   11.068 ± 0.100M/s
> > rawtp               :   14.087 ± 0.511M/s
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > fentry              :   14.544 ± 0.112M/s
> > fexit               :   10.637 ± 0.073M/s
> > fmodret             :   11.357 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> > fexit-fast          :   17.379 ± 0.195M/s
> >
> > Note how xxx-fast variants are measured with significantly higher
> > throughput, even though it's exactly the same in-kernel overhead:
> >
> > fentry              :   14.544 ± 0.112M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> >
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> >
> > One huge and not yet explained deviation is a slowdown of
> > kretprobe-multi, we should look into that separately.
> >
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> >
> > Kprobe cases don't seem to have this illogical slowdown:
> >
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
>
> hum, I see that as well:
>
> uprobe-base         :  230.624 ± 0.527M/s
> base                :   16.320 ± 0.087M/s
> tp                  :   10.057 ± 0.122M/s
> rawtp               :   14.851 ± 0.300M/s
> kprobe              :   10.993 ± 0.104M/s
> kprobe-multi        :   11.053 ± 0.038M/s
> kretprobe           :    6.679 ± 0.015M/s
> kretprobe-multi     :    6.466 ± 0.015M/s
> fentry              :   14.949 ± 0.064M/s
> fexit               :   10.530 ± 1.275M/s
> fmodret             :   11.145 ± 0.245M/s
> kprobe-fast         :   20.080 ± 0.468M/s
> kprobe-multi-fast   :   17.603 ± 0.102M/s
> kretprobe-fast      :    9.943 ± 0.056M/s
> kretprobe-multi-fast:    5.185 ± 0.022M/s
> fentry-fast         :   46.697 ± 0.260M/s
> fexit-fast          :   19.250 ± 0.108M/s
>
>
> I even see decline in kprobe-multi-fast:
>
>   kprobe-fast         :   20.080 ± 0.468M/s
>   kprobe-multi-fast   :   17.603 ± 0.102M/s
>
>   kretprobe-fast      :    9.943 ± 0.056M/s
>   kretprobe-multi-fast:    5.185 ± 0.022M/s
>
>
> I've got some IBT related code showing up in the perf profile
> and when I disabled it I got better results for kprobe-multi-fast
> but kretprobe-multi-fast is still bad

ok, I think it's a self-inflicted false alarm. Note how this benchmark
is using bpf_get_smp_processor_id() as an attach point? Well, guess
what, each triggered program is then calling
bpf_get_smp_processor_id() again to get the per-CPU counter to
increment. So this benchmark was effectively heavily testing recursion
protection, and I guess kretprobe hurts the most.
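
To illustrate, the attached benchmark program's shape is roughly this
(a sketch, not the exact trigger_bench.c code), so every execution
re-enters the very attach point being benchmarked:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  #define MAX_CPUS 4096

  /* hypothetical per-CPU-style counter layout, one padded slot per CPU */
  struct counter {
          long value;
  } __attribute__((aligned(128)));

  static struct counter hits[MAX_CPUS];

  SEC("fentry/bpf_get_smp_processor_id")
  int bench_trigger_fentry_batch(void *ctx)
  {
          /* this helper call lands right back on the attach point above */
          unsigned int cpu = bpf_get_smp_processor_id();

          __sync_fetch_and_add(&hits[cpu & (MAX_CPUS - 1)].value, 1);
          return 0;
  }

  char _license[] SEC("license") = "GPL";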

I realized that when I moved to bpf_get_numa_node_id() and saw much
higher numbers:

BEFORE
======
$ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
kprobe-batch        :   14.018 ± 0.333M/s
fentry-batch        :   31.470 ± 0.791M/s

AFTER
=====
$ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
kprobe-batch         :   19.391 ± 0.868M/s
fentry-batch         :   48.785 ± 0.932M/s

That's when I realized something is off here. Now results make a bit
more sense (machine on which I test is different, so absolute values
differ from before):

$ benchs/run_bench_trigger.sh kprobe kprobe-multi kprobe-batch
kprobe-multi-batch kretprobe kretprobe-multi kretprobe-batch
kretprobe-multi-batch
kprobe               :    8.114 ± 0.199M/s
kprobe-multi         :    8.739 ± 0.052M/s
kprobe-batch         :   19.379 ± 0.709M/s
kprobe-multi-batch   :   24.969 ± 0.226M/s
kretprobe            :    4.859 ± 0.027M/s
kretprobe-multi      :    4.940 ± 0.083M/s
kretprobe-batch      :    8.460 ± 0.044M/s
kretprobe-multi-batch:    8.761 ± 0.282M/s


>
>
> uprobe-base         :  234.024 ± 0.225M/s
> base                :   16.383 ± 0.029M/s
> tp                  :    9.973 ± 0.017M/s
> rawtp               :   14.889 ± 0.047M/s
> kprobe              :   10.970 ± 0.011M/s
> kprobe-multi        :   11.640 ± 0.009M/s
> kretprobe           :    6.667 ± 0.005M/s
> kretprobe-multi     :    6.704 ± 0.005M/s
> fentry              :   14.968 ± 0.024M/s
> fexit               :   11.860 ± 0.012M/s
> fmodret             :   12.656 ± 0.036M/s
> kprobe-fast         :   20.340 ± 0.043M/s
> kprobe-multi-fast   :   21.203 ± 0.019M/s
> kretprobe-fast      :    9.956 ± 0.021M/s
> kretprobe-multi-fast:    5.611 ± 0.006M/s
> fentry-fast         :   46.825 ± 0.041M/s
> fexit-fast          :   19.746 ± 0.024M/s
>
> slightly better:
>
>   kprobe-fast         :   20.340 ± 0.043M/s
>   kprobe-multi-fast   :   21.203 ± 0.019M/s
>
> still almost half perf:
>   kretprobe-fast      :    9.956 ± 0.021M/s
>   kretprobe-multi-fast:    5.611 ± 0.006M/s
>
> jirka
>
> >
> > Cc: Jiri Olsa <jolsa@kernel.org>
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  tools/testing/selftests/bpf/bench.c           |  18 +++
> >  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
> >  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
> >  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
> >  4 files changed, 201 insertions(+), 4 deletions(-)
> >

[...]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks
  2024-03-15 22:22     ` Andrii Nakryiko
@ 2024-03-17 20:38       ` Jiri Olsa
  0 siblings, 0 replies; 12+ messages in thread
From: Jiri Olsa @ 2024-03-17 20:38 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Jiri Olsa, Andrii Nakryiko, bpf, ast, daniel, martin.lau, kernel-team

On Fri, Mar 15, 2024 at 03:22:29PM -0700, Andrii Nakryiko wrote:
> On Fri, Mar 15, 2024 at 2:10 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > one syscall execution and BPF program run. While we use a fast
> > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > >
> > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > which does a tight parameterized loop calling cheap BPF helper
> > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > attached for benchmarking.
> > >
> > > This way 1 bpf() syscall causes N executions of BPF program being
> > > benchmarked. N defaults to 100, but can be adjusted with
> > > --trig-batch-iters CLI argument.
> > >
> > > Results speak for themselves:
> > >
> > > $ ./run_bench_trigger.sh
> > > uprobe-base         :  138.054 ± 0.556M/s
> > > base                :   16.650 ± 0.123M/s
> > > tp                  :   11.068 ± 0.100M/s
> > > rawtp               :   14.087 ± 0.511M/s
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > fentry              :   14.544 ± 0.112M/s
> > > fexit               :   10.637 ± 0.073M/s
> > > fmodret             :   11.357 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > > fexit-fast          :   17.379 ± 0.195M/s
> > >
> > > Note how xxx-fast variants are measured with significantly higher
> > > throughput, even though it's exactly the same in-kernel overhead:
> > >
> > > fentry              :   14.544 ± 0.112M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > >
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > >
> > > One huge and not yet explained deviation is a slowdown of
> > > kretprobe-multi, we should look into that separately.
> > >
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > >
> > > Kprobe cases don't seem to have this illogical slowdown:
> > >
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> >
> > hum, I see that as well:
> >
> > uprobe-base         :  230.624 ± 0.527M/s
> > base                :   16.320 ± 0.087M/s
> > tp                  :   10.057 ± 0.122M/s
> > rawtp               :   14.851 ± 0.300M/s
> > kprobe              :   10.993 ± 0.104M/s
> > kprobe-multi        :   11.053 ± 0.038M/s
> > kretprobe           :    6.679 ± 0.015M/s
> > kretprobe-multi     :    6.466 ± 0.015M/s
> > fentry              :   14.949 ± 0.064M/s
> > fexit               :   10.530 ± 1.275M/s
> > fmodret             :   11.145 ± 0.245M/s
> > kprobe-fast         :   20.080 ± 0.468M/s
> > kprobe-multi-fast   :   17.603 ± 0.102M/s
> > kretprobe-fast      :    9.943 ± 0.056M/s
> > kretprobe-multi-fast:    5.185 ± 0.022M/s
> > fentry-fast         :   46.697 ± 0.260M/s
> > fexit-fast          :   19.250 ± 0.108M/s
> >
> >
> > I even see decline in kprobe-multi-fast:
> >
> >   kprobe-fast         :   20.080 ± 0.468M/s
> >   kprobe-multi-fast   :   17.603 ± 0.102M/s
> >
> >   kretprobe-fast      :    9.943 ± 0.056M/s
> >   kretprobe-multi-fast:    5.185 ± 0.022M/s
> >
> >
> > I've got some IBT related code showing up in the perf profile
> > and when I disabled it I got better results for kprobe-multi-fast
> > but kretprobe-multi-fast is still bad
> 
> ok, I think it's a self-inflicted false alarm. Note how this benchmark
> is using bpf_get_smp_processor_id() as an attach point? Well, guess
> what, each triggered program is then calling
> bpf_get_smp_processor_id() again to get the per-CPU counter to
> increment. So this benchmark was effectively heavily testing recursion
> protection, and I guess kretprobe hurts the most.

heh, nice :-) that explains the bpf_prog_inc_misses_counter hits
I see in perf profile

     1.46%  bench    [kernel.vmlinux]                                              [k] bpf_prog_inc_misses_counter
            |
            ---bpf_prog_inc_misses_counter
               kprobe_multi_link_prog_run.isra.0
               fprobe_exit_handler
               rethook_trampoline_handler
               arch_rethook_trampoline_callback
               arch_rethook_trampoline
               bpf_prog_7790468e40a289ea_bench_trigger_kretprobe_multi_fast
               kprobe_multi_link_prog_run.isra.0
               fprobe_exit_handler
               rethook_trampoline_handler
               arch_rethook_trampoline_callback
               arch_rethook_trampoline
               bpf_prog_db14b3b08bc9e952_trigger_driver
               __bpf_prog_test_run_raw_tp
               bpf_prog_test_run_raw_tp
               __sys_bpf
               __x64_sys_bpf
               do_syscall_64
               entry_SYSCALL_64
               syscall
               start_thread
               clone3

jirka

> 
> I realized that when I moved to bpf_get_numa_node_id() and saw much
> higher numbers:
> 
> BEFORE
> ======
> $ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
> kprobe-batch        :   14.018 ± 0.333M/s
> fentry-batch        :   31.470 ± 0.791M/s
> 
> AFTER
> =====
> $ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
> kprobe-batch         :   19.391 ± 0.868M/s
> fentry-batch         :   48.785 ± 0.932M/s
> 
> That's when I realized something is off here. Now results make a bit
> more sense (machine on which I test is different, so absolute values
> differ from before):
> 
> $ benchs/run_bench_trigger.sh kprobe kprobe-multi kprobe-batch
> kprobe-multi-batch kretprobe kretprobe-multi kretprobe-batch
> kretprobe-multi-batch
> kprobe               :    8.114 ± 0.199M/s
> kprobe-multi         :    8.739 ± 0.052M/s
> kprobe-batch         :   19.379 ± 0.709M/s
> kprobe-multi-batch   :   24.969 ± 0.226M/s
> kretprobe            :    4.859 ± 0.027M/s
> kretprobe-multi      :    4.940 ± 0.083M/s
> kretprobe-batch      :    8.460 ± 0.044M/s
> kretprobe-multi-batch:    8.761 ± 0.282M/s
> 
> 
> >
> >
> > uprobe-base         :  234.024 ± 0.225M/s
> > base                :   16.383 ± 0.029M/s
> > tp                  :    9.973 ± 0.017M/s
> > rawtp               :   14.889 ± 0.047M/s
> > kprobe              :   10.970 ± 0.011M/s
> > kprobe-multi        :   11.640 ± 0.009M/s
> > kretprobe           :    6.667 ± 0.005M/s
> > kretprobe-multi     :    6.704 ± 0.005M/s
> > fentry              :   14.968 ± 0.024M/s
> > fexit               :   11.860 ± 0.012M/s
> > fmodret             :   12.656 ± 0.036M/s
> > kprobe-fast         :   20.340 ± 0.043M/s
> > kprobe-multi-fast   :   21.203 ± 0.019M/s
> > kretprobe-fast      :    9.956 ± 0.021M/s
> > kretprobe-multi-fast:    5.611 ± 0.006M/s
> > fentry-fast         :   46.825 ± 0.041M/s
> > fexit-fast          :   19.746 ± 0.024M/s
> >
> > slightly better:
> >
> >   kprobe-fast         :   20.340 ± 0.043M/s
> >   kprobe-multi-fast   :   21.203 ± 0.019M/s
> >
> > still almost half perf:
> >   kretprobe-fast      :    9.956 ± 0.021M/s
> >   kretprobe-multi-fast:    5.611 ± 0.006M/s
> >
> > jirka
> >
> > >
> > > Cc: Jiri Olsa <jolsa@kernel.org>
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > >  tools/testing/selftests/bpf/bench.c           |  18 +++
> > >  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
> > >  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
> > >  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
> > >  4 files changed, 201 insertions(+), 4 deletions(-)
> > >
> 
> [...]

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-03-17 20:39 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-15  5:18 [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Andrii Nakryiko
2024-03-15  5:18 ` [PATCH v2 bpf-next 2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks Andrii Nakryiko
2024-03-15  9:10   ` Jiri Olsa
2024-03-15 22:22     ` Andrii Nakryiko
2024-03-17 20:38       ` Jiri Olsa
2024-03-15 16:03   ` Alexei Starovoitov
2024-03-15 16:31     ` Andrii Nakryiko
2024-03-15 16:58       ` Andrii Nakryiko
2024-03-15 18:47         ` Alexei Starovoitov
2024-03-15 21:07           ` Andrii Nakryiko
2024-03-15 16:08 ` [PATCH v2 bpf-next 1/2] selftests/bpf: scale benchmark counting by using per-CPU counters Alexei Starovoitov
2024-03-15 16:22   ` Andrii Nakryiko
