bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <netdev@vger.kernel.org>, <ast@fb.com>,
	<daniel@iogearbox.net>
Cc: <andrii@kernel.org>, <kernel-team@fb.com>,
	Andrii Nakryiko <andriin@fb.com>,
	Song Liu <songliubraving@fb.com>
Subject: [PATCH v2 bpf-next 03/11] libbpf: unify and speed up BTF string deduplication
Date: Wed, 4 Nov 2020 20:33:53 -0800	[thread overview]
Message-ID: <20201105043402.2530976-4-andrii@kernel.org> (raw)
In-Reply-To: <20201105043402.2530976-1-andrii@kernel.org>

From: Andrii Nakryiko <andriin@fb.com>

Revamp BTF dedup's string deduplication to match the approach of writable BTF
string management. This allows to transfer deduplicated strings index back to
BTF object after deduplication without expensive extra memory copying and hash
map re-construction. It also simplifies the code and speeds it up, because
hashmap-based string deduplication is faster than sort + unique approach.

Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/btf.c | 263 +++++++++++++++++---------------------------
 1 file changed, 98 insertions(+), 165 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 89fecfe5cb2b..fa1b147c63c6 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -90,6 +90,14 @@ struct btf {
 	struct hashmap *strs_hash;
 	/* whether strings are already deduplicated */
 	bool strs_deduped;
+	/* extra indirection layer to make strings hashmap work with stable
+	 * string offsets and ability to transparently choose between
+	 * btf->strs_data or btf_dedup->strs_data as a source of strings.
+	 * This is used for BTF strings dedup to transfer deduplicated strings
+	 * data back to struct btf without re-building strings index.
+	 */
+	void **strs_data_ptr;
+
 	/* BTF object FD, if loaded into kernel */
 	int fd;
 
@@ -1363,17 +1371,19 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 
 static size_t strs_hash_fn(const void *key, void *ctx)
 {
-	struct btf *btf = ctx;
-	const char *str = btf->strs_data + (long)key;
+	const struct btf *btf = ctx;
+	const char *strs = *btf->strs_data_ptr;
+	const char *str = strs + (long)key;
 
 	return str_hash(str);
 }
 
 static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx)
 {
-	struct btf *btf = ctx;
-	const char *str1 = btf->strs_data + (long)key1;
-	const char *str2 = btf->strs_data + (long)key2;
+	const struct btf *btf = ctx;
+	const char *strs = *btf->strs_data_ptr;
+	const char *str1 = strs + (long)key1;
+	const char *str2 = strs + (long)key2;
 
 	return strcmp(str1, str2) == 0;
 }
@@ -1418,6 +1428,9 @@ static int btf_ensure_modifiable(struct btf *btf)
 	memcpy(types, btf->types_data, btf->hdr->type_len);
 	memcpy(strs, btf->strs_data, btf->hdr->str_len);
 
+	/* make hashmap below use btf->strs_data as a source of strings */
+	btf->strs_data_ptr = &btf->strs_data;
+
 	/* build lookup index for all strings */
 	hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf);
 	if (IS_ERR(hash)) {
@@ -2824,19 +2837,11 @@ struct btf_dedup {
 	size_t hypot_cap;
 	/* Various option modifying behavior of algorithm */
 	struct btf_dedup_opts opts;
-};
-
-struct btf_str_ptr {
-	const char *str;
-	__u32 new_off;
-	bool used;
-};
-
-struct btf_str_ptrs {
-	struct btf_str_ptr *ptrs;
-	const char *data;
-	__u32 cnt;
-	__u32 cap;
+	/* temporary strings deduplication state */
+	void *strs_data;
+	size_t strs_cap;
+	size_t strs_len;
+	struct hashmap* strs_hash;
 };
 
 static long hash_combine(long h, long value)
@@ -3063,64 +3068,41 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx)
 	return 0;
 }
 
-static int str_sort_by_content(const void *a1, const void *a2)
-{
-	const struct btf_str_ptr *p1 = a1;
-	const struct btf_str_ptr *p2 = a2;
-
-	return strcmp(p1->str, p2->str);
-}
-
-static int str_sort_by_offset(const void *a1, const void *a2)
-{
-	const struct btf_str_ptr *p1 = a1;
-	const struct btf_str_ptr *p2 = a2;
-
-	if (p1->str != p2->str)
-		return p1->str < p2->str ? -1 : 1;
-	return 0;
-}
-
-static int btf_dedup_str_ptr_cmp(const void *str_ptr, const void *pelem)
-{
-	const struct btf_str_ptr *p = pelem;
-
-	if (str_ptr != p->str)
-		return (const char *)str_ptr < p->str ? -1 : 1;
-	return 0;
-}
-
-static int btf_str_mark_as_used(__u32 *str_off_ptr, void *ctx)
+static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx)
 {
-	struct btf_str_ptrs *strs;
-	struct btf_str_ptr *s;
+	struct btf_dedup *d = ctx;
+	long old_off, new_off, len;
+	const char *s;
+	void *p;
+	int err;
 
 	if (*str_off_ptr == 0)
 		return 0;
 
-	strs = ctx;
-	s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt,
-		    sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp);
-	if (!s)
-		return -EINVAL;
-	s->used = true;
-	return 0;
-}
+	s = btf__str_by_offset(d->btf, *str_off_ptr);
+	len = strlen(s) + 1;
 
-static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx)
-{
-	struct btf_str_ptrs *strs;
-	struct btf_str_ptr *s;
+	new_off = d->strs_len;
+	p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len);
+	if (!p)
+		return -ENOMEM;
 
-	if (*str_off_ptr == 0)
-		return 0;
+	memcpy(p, s, len);
 
-	strs = ctx;
-	s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt,
-		    sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp);
-	if (!s)
-		return -EINVAL;
-	*str_off_ptr = s->new_off;
+	/* Now attempt to add the string, but only if the string with the same
+	 * contents doesn't exist already (HASHMAP_ADD strategy). If such
+	 * string exists, we'll get its offset in old_off (that's old_key).
+	 */
+	err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off,
+			      HASHMAP_ADD, (const void **)&old_off, NULL);
+	if (err == -EEXIST) {
+		*str_off_ptr = old_off;
+	} else if (err) {
+		return err;
+	} else {
+		*str_off_ptr = new_off;
+		d->strs_len += len;
+	}
 	return 0;
 }
 
@@ -3137,118 +3119,69 @@ static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx)
  */
 static int btf_dedup_strings(struct btf_dedup *d)
 {
-	char *start = d->btf->strs_data;
-	char *end = start + d->btf->hdr->str_len;
-	char *p = start, *tmp_strs = NULL;
-	struct btf_str_ptrs strs = {
-		.cnt = 0,
-		.cap = 0,
-		.ptrs = NULL,
-		.data = start,
-	};
-	int i, j, err = 0, grp_idx;
-	bool grp_used;
+	char *s;
+	int err;
 
 	if (d->btf->strs_deduped)
 		return 0;
 
-	/* build index of all strings */
-	while (p < end) {
-		if (strs.cnt + 1 > strs.cap) {
-			struct btf_str_ptr *new_ptrs;
-
-			strs.cap += max(strs.cnt / 2, 16U);
-			new_ptrs = libbpf_reallocarray(strs.ptrs, strs.cap, sizeof(strs.ptrs[0]));
-			if (!new_ptrs) {
-				err = -ENOMEM;
-				goto done;
-			}
-			strs.ptrs = new_ptrs;
-		}
-
-		strs.ptrs[strs.cnt].str = p;
-		strs.ptrs[strs.cnt].used = false;
-
-		p += strlen(p) + 1;
-		strs.cnt++;
-	}
-
-	/* temporary storage for deduplicated strings */
-	tmp_strs = malloc(d->btf->hdr->str_len);
-	if (!tmp_strs) {
-		err = -ENOMEM;
-		goto done;
-	}
-
-	/* mark all used strings */
-	strs.ptrs[0].used = true;
-	err = btf_for_each_str_off(d, btf_str_mark_as_used, &strs);
-	if (err)
-		goto done;
-
-	/* sort strings by context, so that we can identify duplicates */
-	qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_content);
+	s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1);
+	if (!s)
+		return -ENOMEM;
+	/* initial empty string */
+	s[0] = 0;
+	d->strs_len = 1;
 
-	/*
-	 * iterate groups of equal strings and if any instance in a group was
-	 * referenced, emit single instance and remember new offset
+	/* temporarily switch to use btf_dedup's strs_data for strings for hash
+	 * functions; later we'll just transfer hashmap to struct btf as is,
+	 * along the strs_data
 	 */
-	p = tmp_strs;
-	grp_idx = 0;
-	grp_used = strs.ptrs[0].used;
-	/* iterate past end to avoid code duplication after loop */
-	for (i = 1; i <= strs.cnt; i++) {
-		/*
-		 * when i == strs.cnt, we want to skip string comparison and go
-		 * straight to handling last group of strings (otherwise we'd
-		 * need to handle last group after the loop w/ duplicated code)
-		 */
-		if (i < strs.cnt &&
-		    !strcmp(strs.ptrs[i].str, strs.ptrs[grp_idx].str)) {
-			grp_used = grp_used || strs.ptrs[i].used;
-			continue;
-		}
+	d->btf->strs_data_ptr = &d->strs_data;
 
-		/*
-		 * this check would have been required after the loop to handle
-		 * last group of strings, but due to <= condition in a loop
-		 * we avoid that duplication
-		 */
-		if (grp_used) {
-			int new_off = p - tmp_strs;
-			__u32 len = strlen(strs.ptrs[grp_idx].str);
-
-			memmove(p, strs.ptrs[grp_idx].str, len + 1);
-			for (j = grp_idx; j < i; j++)
-				strs.ptrs[j].new_off = new_off;
-			p += len + 1;
-		}
-
-		if (i < strs.cnt) {
-			grp_idx = i;
-			grp_used = strs.ptrs[i].used;
-		}
+	d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf);
+	if (IS_ERR(d->strs_hash)) {
+		err = PTR_ERR(d->strs_hash);
+		d->strs_hash = NULL;
+		goto err_out;
 	}
 
-	/* replace original strings with deduped ones */
-	d->btf->hdr->str_len = p - tmp_strs;
-	memmove(start, tmp_strs, d->btf->hdr->str_len);
-	end = start + d->btf->hdr->str_len;
-
-	/* restore original order for further binary search lookups */
-	qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_offset);
+	/* insert empty string; we won't be looking it up during strings
+	 * dedup, but it's good to have it for generic BTF string lookups
+	 */
+	err = hashmap__insert(d->strs_hash, (void *)0, (void *)0,
+			      HASHMAP_ADD, NULL, NULL);
+	if (err)
+		goto err_out;
 
 	/* remap string offsets */
-	err = btf_for_each_str_off(d, btf_str_remap_offset, &strs);
+	err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d);
 	if (err)
-		goto done;
+		goto err_out;
 
-	d->btf->hdr->str_len = end - start;
+	/* replace BTF string data and hash with deduped ones */
+	free(d->btf->strs_data);
+	hashmap__free(d->btf->strs_hash);
+	d->btf->strs_data = d->strs_data;
+	d->btf->strs_data_cap = d->strs_cap;
+	d->btf->hdr->str_len = d->strs_len;
+	d->btf->strs_hash = d->strs_hash;
+	/* now point strs_data_ptr back to btf->strs_data */
+	d->btf->strs_data_ptr = &d->btf->strs_data;
+
+	d->strs_data = d->strs_hash = NULL;
+	d->strs_len = d->strs_cap = 0;
 	d->btf->strs_deduped = true;
+	return 0;
+
+err_out:
+	free(d->strs_data);
+	hashmap__free(d->strs_hash);
+	d->strs_data = d->strs_hash = NULL;
+	d->strs_len = d->strs_cap = 0;
+
+	/* restore strings pointer for existing d->btf->strs_hash back */
+	d->btf->strs_data_ptr = &d->strs_data;
 
-done:
-	free(tmp_strs);
-	free(strs.ptrs);
 	return err;
 }
 
-- 
2.24.1


  parent reply	other threads:[~2020-11-05  4:34 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-05  4:33 [PATCH v2 bpf-next 00/11] libbpf: split BTF support Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 01/11] libbpf: factor out common operations in BTF writing APIs Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 02/11] selftest/bpf: relax btf_dedup test checks Andrii Nakryiko
2020-11-05  4:33 ` Andrii Nakryiko [this message]
2020-11-05  4:33 ` [PATCH v2 bpf-next 04/11] libbpf: implement basic split BTF support Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 05/11] selftests/bpf: add split BTF basic test Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 06/11] selftests/bpf: add checking of raw type dump in BTF writer APIs selftests Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 07/11] libbpf: fix BTF data layout checks and allow empty BTF Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 08/11] libbpf: support BTF dedup of split BTFs Andrii Nakryiko
2020-11-05  4:33 ` [PATCH v2 bpf-next 09/11] libbpf: accomodate DWARF/compiler bug with duplicated identical arrays Andrii Nakryiko
2020-11-05  4:34 ` [PATCH v2 bpf-next 10/11] selftests/bpf: add split BTF dedup selftests Andrii Nakryiko
2020-11-05  4:34 ` [PATCH v2 bpf-next 11/11] tools/bpftool: add bpftool support for split BTF Andrii Nakryiko
2020-11-05  9:52 ` [PATCH v2 bpf-next 00/11] libbpf: split BTF support Jesper Dangaard Brouer
2020-11-05 19:16   ` Andrii Nakryiko
2020-11-05 19:38     ` Saeed Mahameed
2020-11-05 20:02       ` Andrii Nakryiko
2020-11-06  2:50 ` patchwork-bot+netdevbpf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201105043402.2530976-4-andrii@kernel.org \
    --to=andrii@kernel.org \
    --cc=andriin@fb.com \
    --cc=ast@fb.com \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=netdev@vger.kernel.org \
    --cc=songliubraving@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).