From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
Cc: <kafai@fb.com>, <joannekoong@fb.com>,
	Andrii Nakryiko <andrii@kernel.org>
Subject: [PATCH RFC bpf-next 2/4] selftests/bpf: fix and optimize bloom filter bench
Date: Wed, 22 Sep 2021 13:32:22 -0700
Message-ID: <20210922203224.912809-3-andrii@kernel.org>
In-Reply-To: <20210922203224.912809-1-andrii@kernel.org>

Fix the inconsistent and highly variable measurement logic by using
always-increasing counters and reporting per-interval deltas instead of
reading and resetting the stats on every sample. Also reduce the amount
of accounting by incrementing the per-CPU stats once after the entire
loop rather than on each operation. Use rodata (const volatile) for the
bloom filter flag, and build bench files in optimized (-O2) mode.
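
The core of the measurement change is the monotonic-counter/delta
pattern sketched below. This is a minimal stand-alone illustration with
made-up names (percpu_hits, sample_hits), not the benchmark's actual
code; the real measure() change in the diff reads the per-CPU stats out
of the skeleton's BSS in the same way, avoiding the old read-then-reset
scheme, which can race with in-flight increments and make per-interval
numbers jittery:

	#include <stdio.h>

	#define NR_CPUS 4

	/* Producers (the BPF program in the real benchmark) only ever
	 * increment these counters; nothing ever resets them.
	 */
	static unsigned long percpu_hits[NR_CPUS];

	static long sample_hits(void)
	{
		static unsigned long last_total;
		unsigned long total = 0;
		long delta;
		int i;

		for (i = 0; i < NR_CPUS; i++)
			total += percpu_hits[i];

		delta = total - last_total;	/* hits during this interval */
		last_total = total;
		return delta;
	}

	int main(void)
	{
		percpu_hits[0] += 512;		/* simulate one producer batch */
		percpu_hits[2] += 512;
		printf("interval hits: %ld\n", sample_hits());	/* 1024 */

		percpu_hits[1] += 1024;
		printf("interval hits: %ld\n", sample_hits());	/* 1024 */
		return 0;
	}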

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |  2 +-
 .../bpf/benchs/bench_bloom_filter_map.c       | 32 +++++++++++++-
 .../selftests/bpf/progs/bloom_filter_map.c    | 44 ++++++++++++++-----
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 5dbaf7f512fd..f36f49e4d0f2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -515,7 +515,7 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 # Benchmark runner
 $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ)
 	$(call msg,CC,,$@)
-	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
+	$(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@
 $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
 $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
 $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
diff --git a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
index 7adf80be7292..4f53cd9fb099 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
@@ -248,7 +248,7 @@ static void hashmap_no_bloom_filter_setup(void)
 
 	ctx.skel = setup_skeleton();
 
-	ctx.skel->data->hashmap_use_bloom_filter = false;
+	ctx.skel->rodata->hashmap_use_bloom_filter = false;
 
 	populate_maps();
 
@@ -259,16 +259,26 @@ static void hashmap_no_bloom_filter_setup(void)
 	}
 }
 
+struct stat { __u32 stats[3]; };
+
 static void measure(struct bench_res *res)
 {
+	static long last_hits, last_drops, last_false_hits;
 	long total_hits = 0, total_drops = 0, total_false_hits = 0;
 	unsigned int nr_cpus = bpf_num_possible_cpus();
+	/*
 	BPF_DECLARE_PERCPU(__u64, zeroed_values);
 	BPF_DECLARE_PERCPU(__u64, false_hits);
 	BPF_DECLARE_PERCPU(__u64, drops);
 	BPF_DECLARE_PERCPU(__u64, hits);
 	int err, i, percpu_array_fd;
 	__u32 key;
+	*/
+	int i;
+	int hit_key, drop_key, false_hit_key;
+	hit_key = ctx.skel->rodata->hit_key;
+	drop_key = ctx.skel->rodata->drop_key;
+	false_hit_key = ctx.skel->rodata->false_hit_key;
 
 	if (ctx.skel->bss->error != 0) {
 		fprintf(stderr, "error (%d) when searching the bloom filter\n",
@@ -276,6 +286,7 @@ static void measure(struct bench_res *res)
 		exit(1);
 	}
 
+	/*
 	key = ctx.skel->rodata->hit_key;
 	percpu_array_fd = bpf_map__fd(ctx.skel->maps.percpu_array);
 	err = bpf_map_lookup_elem(percpu_array_fd, &key, hits);
@@ -300,20 +311,36 @@ static void measure(struct bench_res *res)
 			-errno);
 		exit(1);
 	}
+	*/
 
 	for (i = 0; i < nr_cpus; i++) {
+		struct stat *s = (void *)&ctx.skel->bss->percpu_stats[i];
+
+		total_hits += s->stats[hit_key];
+		total_drops += s->stats[drop_key];
+		total_false_hits += s->stats[false_hit_key];
+		/*
 		total_hits += bpf_percpu(hits, i);
 		total_drops += bpf_percpu(drops, i);
 		total_false_hits += bpf_percpu(false_hits, i);
+		*/
 	}
 
+	res->hits = total_hits - last_hits;
+	res->drops = total_drops - last_drops;
+	res->false_hits = total_false_hits - last_false_hits;
+
+	last_hits = total_hits;
+	last_drops = total_drops;
+	last_false_hits = total_false_hits;
+
+	/*
 	res->hits = total_hits;
 	res->drops = total_drops;
 	res->false_hits = total_false_hits;
 
 	memset(zeroed_values, 0, sizeof(zeroed_values));
 
-	/* zero out the percpu array */
 	key = ctx.skel->rodata->hit_key;
 	err = bpf_map_update_elem(percpu_array_fd, &key, zeroed_values, BPF_ANY);
 	if (err) {
@@ -332,6 +359,7 @@ static void measure(struct bench_res *res)
 		fprintf(stderr, "zeroing the percpu array failed: %d\n", -errno);
 		exit(1);
 	}
+	*/
 }
 
 static void *consumer(void *input)
diff --git a/tools/testing/selftests/bpf/progs/bloom_filter_map.c b/tools/testing/selftests/bpf/progs/bloom_filter_map.c
index 05f9706a5ba6..3ae2f9bb5968 100644
--- a/tools/testing/selftests/bpf/progs/bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/progs/bloom_filter_map.c
@@ -23,6 +23,7 @@ struct map_bloom_filter_type {
 	__uint(value_size, sizeof(__u64));
 	__uint(max_entries, 1000);
 	__uint(nr_hash_funcs, 3);
+//	__uint(map_flags, BPF_F_ZERO_SEED);
 } map_bloom_filter SEC(".maps");
 
 struct {
@@ -37,13 +38,19 @@ struct callback_ctx {
 	struct map_bloom_filter_type *map;
 };
 
+struct {
+	__u32 stats[3];
+} __attribute__((aligned(256))) percpu_stats[256];
+
 /* Tracks the number of hits, drops, and false hits */
+/*
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(max_entries, 3);
 	__type(key, __u32);
 	__type(value, __u64);
 } percpu_array SEC(".maps");
+*/
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -56,17 +63,23 @@ const __u32 hit_key  = 0;
 const __u32 drop_key  = 1;
 const __u32 false_hit_key = 2;
 
-bool hashmap_use_bloom_filter = true;
+const volatile bool hashmap_use_bloom_filter = true;
 
 int error = 0;
 
-static __always_inline void log_result(__u32 key)
+static __always_inline void log_result(__u32 key, __u32 val)
 {
+	__u32 cpu = bpf_get_smp_processor_id();
+
+	percpu_stats[cpu & 255].stats[key] += val;
+
+	/*
 	__u64 *count;
 
 	count = bpf_map_lookup_elem(&percpu_array, &key);
 	if (count)
-		*count += 1;
+		*count += val;
+	*/
 }
 
 static __u64
@@ -81,7 +94,7 @@ check_elem(struct bpf_map *map, __u32 *key, __u64 *val,
 		return 1; /* stop the iteration */
 	}
 
-	log_result(hit_key);
+	log_result(hit_key, 1);
 
 	return 0;
 }
@@ -121,37 +134,44 @@ int prog_bloom_filter_hashmap_lookup(void *ctx)
 {
 	__u64 *result;
 	int i, err;
-
 	union {
 		__u64 data64;
 		__u32 data32[2];
 	} val;
+	int hits = 0, drops = 0, false_hits = 0;
+	//bool custom_hit = false, noncustom_hit = false;
 
 	for (i = 0; i < 512; i++) {
-		val.data32[0] = bpf_get_prandom_u32();
-		val.data32[1] = bpf_get_prandom_u32();
+		val.data32[0] = /*i; */ bpf_get_prandom_u32();
+		val.data32[1] = /*i + 1;*/ bpf_get_prandom_u32();
 
-		if (hashmap_use_bloom_filter) {
+		if (hashmap_use_bloom_filter)
+		{
 			err = bpf_map_peek_elem(&map_bloom_filter, &val);
 			if (err) {
 				if (err != -ENOENT) {
 					error |= 3;
 					return 0;
 				}
-				log_result(drop_key);
+				hits++;
+				//noncustom_hit = true;
+				//__sync_fetch_and_add(&bloom_noncustom_hit, 1);
 				continue;
 			}
 		}
 
 		result = bpf_map_lookup_elem(&hashmap, &val);
 		if (result) {
-			log_result(hit_key);
+			hits++;
 		} else {
 			if (hashmap_use_bloom_filter)
-				log_result(false_hit_key);
-			log_result(drop_key);
+				false_hits++;
+			drops++;
 		}
 	}
+	log_result(hit_key, hits);
+	log_result(drop_key, drops);
+	log_result(false_hit_key, false_hits);
 
 	return 0;
 }
-- 
2.30.2


Thread overview:
2021-09-22 20:32 [PATCH RFC bpf-next 0/4] bpf_jhash_mem() and BPF Bloom filter implementation Andrii Nakryiko
2021-09-22 20:32 ` [PATCH RFC bpf-next 1/4] bpf: add bpf_jhash_mem BPF helper Andrii Nakryiko
2021-09-22 20:32 ` [PATCH RFC bpf-next 2/4] selftests/bpf: fix and optimize bloom filter bench Andrii Nakryiko [this message]
2021-09-22 20:32 ` [PATCH RFC bpf-next 3/4] selftests/bpf: implement custom Bloom filter purely in BPF Andrii Nakryiko
2021-09-22 20:32 ` [PATCH RFC bpf-next 4/4] selftests/bpf: integrate custom Bloom filter impl into benchs Andrii Nakryiko
