* [PATCH dwarves v2 1/3] dwarf_loader: permits flexible HASHTAGS__BITS
2021-03-28 6:16 [PATCH dwarves v2 0/3] permit merging all dwarf cu's for clang lto built binary Yonghong Song
@ 2021-03-28 6:16 ` Yonghong Song
2021-03-28 6:16 ` [PATCH dwarves v2 2/3] dwarf_loader: factor out common code to initialize a cu Yonghong Song
2021-03-28 6:17 ` [PATCH dwarves v2 3/3] dwarf_loader: permit merging all dwarf cu's for clang lto built binary Yonghong Song
2 siblings, 0 replies; 5+ messages in thread
From: Yonghong Song @ 2021-03-28 6:16 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, dwarves
Cc: Alexei Starovoitov, Andrii Nakryiko, Bill Wendling, bpf, kernel-team
Currently, types/tags hash table has fixed HASHTAGS__BITS = 15.
That means the number of buckets will be 1UL << 15 = 32768.
In my experiments, a thin-LTO built vmlinux has roughly 9M entries
in types table and 5.2M entries in tags table. So the number
of buckets is too less for an efficient lookup. This patch
refactored the code to allow the number of buckets to be changed.
In addition, currently hashtags__fn(key) return value is
assigned to uint16_t. Change to uint32_t as in a later patch
the number of hashtag bits can be increased to be more than 16.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
dwarf_loader.c | 48 +++++++++++++++++++++++++++++++++++++-----------
1 file changed, 37 insertions(+), 11 deletions(-)
diff --git a/dwarf_loader.c b/dwarf_loader.c
index c106919..5a1e860 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -50,7 +50,12 @@ struct strings *strings;
#define DW_FORM_implicit_const 0x21
#endif
-#define hashtags__fn(key) hash_64(key, HASHTAGS__BITS)
+static uint32_t hashtags__bits = 15;
+
+static uint32_t hashtags__fn(Dwarf_Off key)
+{
+ return hash_64(key, hashtags__bits);
+}
bool no_bitfield_type_recode = true;
@@ -102,9 +107,6 @@ static void dwarf_tag__set_spec(struct dwarf_tag *dtag, dwarf_off_ref spec)
*(dwarf_off_ref *)(dtag + 1) = spec;
}
-#define HASHTAGS__BITS 15
-#define HASHTAGS__SIZE (1UL << HASHTAGS__BITS)
-
#define obstack_chunk_alloc malloc
#define obstack_chunk_free free
@@ -118,22 +120,41 @@ static void *obstack_zalloc(struct obstack *obstack, size_t size)
}
struct dwarf_cu {
- struct hlist_head hash_tags[HASHTAGS__SIZE];
- struct hlist_head hash_types[HASHTAGS__SIZE];
+ struct hlist_head *hash_tags;
+ struct hlist_head *hash_types;
struct obstack obstack;
struct cu *cu;
struct dwarf_cu *type_unit;
};
-static void dwarf_cu__init(struct dwarf_cu *dcu)
+static int dwarf_cu__init(struct dwarf_cu *dcu)
{
+ uint64_t hashtags_size = 1UL << hashtags__bits;
+ dcu->hash_tags = malloc(sizeof(struct hlist_head) * hashtags_size);
+ if (!dcu->hash_tags)
+ return -ENOMEM;
+
+ dcu->hash_types = malloc(sizeof(struct hlist_head) * hashtags_size);
+ if (!dcu->hash_types) {
+ free(dcu->hash_tags);
+ return -ENOMEM;
+ }
+
unsigned int i;
- for (i = 0; i < HASHTAGS__SIZE; ++i) {
+ for (i = 0; i < hashtags_size; ++i) {
INIT_HLIST_HEAD(&dcu->hash_tags[i]);
INIT_HLIST_HEAD(&dcu->hash_types[i]);
}
obstack_init(&dcu->obstack);
dcu->type_unit = NULL;
+ return 0;
+}
+
+static void dwarf_cu__delete(struct cu *cu)
+{
+ struct dwarf_cu *dcu = cu->priv;
+ free(dcu->hash_tags);
+ free(dcu->hash_types);
}
static void hashtags__hash(struct hlist_head *hashtable,
@@ -151,7 +172,7 @@ static struct dwarf_tag *hashtags__find(const struct hlist_head *hashtable,
struct dwarf_tag *tpos;
struct hlist_node *pos;
- uint16_t bucket = hashtags__fn(id);
+ uint32_t bucket = hashtags__fn(id);
const struct hlist_head *head = hashtable + bucket;
hlist_for_each_entry(tpos, pos, head, hash_node) {
@@ -2429,7 +2450,9 @@ static int cus__load_debug_types(struct cus *cus, struct conf_load *conf,
}
cu->little_endian = ehdr.e_ident[EI_DATA] == ELFDATA2LSB;
- dwarf_cu__init(dcup);
+ if (dwarf_cu__init(dcup) != 0)
+ return DWARF_CB_ABORT;
+
dcup->cu = cu;
/* Funny hack. */
dcup->type_unit = dcup;
@@ -2521,7 +2544,9 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
struct dwarf_cu dcu;
- dwarf_cu__init(&dcu);
+ if (dwarf_cu__init(&dcu) != 0)
+ return DWARF_CB_ABORT;
+
dcu.cu = cu;
dcu.type_unit = type_cu ? &type_dcu : NULL;
cu->priv = &dcu;
@@ -2672,5 +2697,6 @@ struct debug_fmt_ops dwarf__ops = {
.tag__decl_file = dwarf_tag__decl_file,
.tag__decl_line = dwarf_tag__decl_line,
.tag__orig_id = dwarf_tag__orig_id,
+ .cu__delete = dwarf_cu__delete,
.has_alignment_info = true,
};
--
2.30.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH dwarves v2 2/3] dwarf_loader: factor out common code to initialize a cu
2021-03-28 6:16 [PATCH dwarves v2 0/3] permit merging all dwarf cu's for clang lto built binary Yonghong Song
2021-03-28 6:16 ` [PATCH dwarves v2 1/3] dwarf_loader: permits flexible HASHTAGS__BITS Yonghong Song
@ 2021-03-28 6:16 ` Yonghong Song
2021-03-28 6:17 ` [PATCH dwarves v2 3/3] dwarf_loader: permit merging all dwarf cu's for clang lto built binary Yonghong Song
2 siblings, 0 replies; 5+ messages in thread
From: Yonghong Song @ 2021-03-28 6:16 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, dwarves
Cc: Alexei Starovoitov, Andrii Nakryiko, Bill Wendling, bpf, kernel-team
Both cus__load_debug_types() and cus__load_module()
created new cu's followed by initialization. The
initialization codes are identical so let us refactor
into a common function which can be used later as
well when dealing with merging cu's.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
dwarf_loader.c | 45 ++++++++++++++++++++-------------------------
1 file changed, 20 insertions(+), 25 deletions(-)
diff --git a/dwarf_loader.c b/dwarf_loader.c
index 5a1e860..aa6372a 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -2411,6 +2411,23 @@ static int finalize_cu_immediately(struct cus *cus, struct cu *cu,
return lsk;
}
+static int cu__set_common(struct cu *cu, struct conf_load *conf,
+ Dwfl_Module *mod, Elf *elf)
+{
+ cu->uses_global_strings = true;
+ cu->elf = elf;
+ cu->dwfl = mod;
+ cu->extra_dbg_info = conf ? conf->extra_dbg_info : 0;
+ cu->has_addr_info = conf ? conf->get_addr_info : 0;
+
+ GElf_Ehdr ehdr;
+ if (gelf_getehdr(elf, &ehdr) == NULL)
+ return DWARF_CB_ABORT;
+
+ cu->little_endian = ehdr.e_ident[EI_DATA] == ELFDATA2LSB;
+ return 0;
+}
+
static int cus__load_debug_types(struct cus *cus, struct conf_load *conf,
Dwfl_Module *mod, Dwarf *dw, Elf *elf,
const char *filename,
@@ -2434,22 +2451,11 @@ static int cus__load_debug_types(struct cus *cus, struct conf_load *conf,
cu = cu__new("", pointer_size, build_id,
build_id_len, filename);
- if (cu == NULL) {
+ if (cu == NULL ||
+ cu__set_common(cu, conf, mod, elf) != 0) {
return DWARF_CB_ABORT;
}
- cu->uses_global_strings = true;
- cu->elf = elf;
- cu->dwfl = mod;
- cu->extra_dbg_info = conf ? conf->extra_dbg_info : 0;
- cu->has_addr_info = conf ? conf->get_addr_info : 0;
-
- GElf_Ehdr ehdr;
- if (gelf_getehdr(elf, &ehdr) == NULL) {
- return DWARF_CB_ABORT;
- }
- cu->little_endian = ehdr.e_ident[EI_DATA] == ELFDATA2LSB;
-
if (dwarf_cu__init(dcup) != 0)
return DWARF_CB_ABORT;
@@ -2528,19 +2534,8 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
const char *name = attr_string(cu_die, DW_AT_name);
struct cu *cu = cu__new(name ?: "", pointer_size,
build_id, build_id_len, filename);
- if (cu == NULL)
- return DWARF_CB_ABORT;
- cu->uses_global_strings = true;
- cu->elf = elf;
- cu->dwfl = mod;
- cu->extra_dbg_info = conf ? conf->extra_dbg_info : 0;
- cu->has_addr_info = conf ? conf->get_addr_info : 0;
-
- GElf_Ehdr ehdr;
- if (gelf_getehdr(elf, &ehdr) == NULL) {
+ if (cu == NULL || cu__set_common(cu, conf, mod, elf) != 0)
return DWARF_CB_ABORT;
- }
- cu->little_endian = ehdr.e_ident[EI_DATA] == ELFDATA2LSB;
struct dwarf_cu dcu;
--
2.30.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH dwarves v2 3/3] dwarf_loader: permit merging all dwarf cu's for clang lto built binary
2021-03-28 6:16 [PATCH dwarves v2 0/3] permit merging all dwarf cu's for clang lto built binary Yonghong Song
2021-03-28 6:16 ` [PATCH dwarves v2 1/3] dwarf_loader: permits flexible HASHTAGS__BITS Yonghong Song
2021-03-28 6:16 ` [PATCH dwarves v2 2/3] dwarf_loader: factor out common code to initialize a cu Yonghong Song
@ 2021-03-28 6:17 ` Yonghong Song
2021-03-28 20:18 ` Yonghong Song
2 siblings, 1 reply; 5+ messages in thread
From: Yonghong Song @ 2021-03-28 6:17 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, dwarves
Cc: Alexei Starovoitov, Andrii Nakryiko, Bill Wendling, bpf, kernel-team
For vmlinux built with clang thin-lto or lto, there exist
cross cu type references. For example, the below can happen:
compile unit 1:
tag 10: type A
compile unit 2:
...
refer to type A (tag 10 in compile unit 1)
I only checked a few but have seen type A may be a simple type
like "unsigned char" or a complex type like an array of base types.
To resolve this issue, the tag DW_AT_producer of the first
DW_TAG_compile_unit is checked. If the binary is built
with clang lto, all debuginfo dwarf cu's will be merged
into one pahole cu which will resolve the above
cross-cu tag reference issue. To test whether a binary
is built with clang lto or not, The "clang version"
and "-flto" will be checked against DW_AT_producer string
for the first 5 debuginfo cu's. The reason is that
a few linux files disabled lto for various reasons.
Merging cu's will create a single cu with lots of types, tags
and functions. For example with clang thin-lto built vmlinux,
I saw 9M entries in types table, 5.2M in tags table. The
below are pahole wallclock time for different hashbits:
command line: time pahole -J vmlinux
# of hashbits wallclock time in seconds
15 460
16 255
17 131
18 97
19 75
20 69
21 64
22 62
23 58
24 64
The problem is with hashtags__find(), esp. the loop
uint32_t bucket = hashtags__fn(id);
const struct hlist_head *head = hashtable + bucket;
hlist_for_each_entry(tpos, pos, head, hash_node) {
if (tpos->id == id)
return tpos;
}
Say we have 9M types and (1 << 15) buckets, that means each bucket
will have roughly 64 elements. So each lookup will traverse
the loop 32 iterations on average.
If we have 1 << 21 buckets, then each buckets will have 4 elements,
and the average number of loop iterations for hashtags__find()
will be 2.
Note that the number of hashbits 24 makes performance worse
than 23. The reason could be that 23 hashbits can cover 8M
buckets (close to 9M for the number of entries in types table).
Higher number of hash bits allocates more memory and becomes
less cache efficient compared to 23 hashbits.
This patch picks # of hashbits 21 as the starting value
and will try to allocate memory based on that, if memory
allocation fails, we will go with less hashbits until
we reach hashbits 15 which is the default for
non merge-cu case.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
dwarf_loader.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 120 insertions(+)
diff --git a/dwarf_loader.c b/dwarf_loader.c
index aa6372a..1e63ca9 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -51,6 +51,7 @@ struct strings *strings;
#endif
static uint32_t hashtags__bits = 15;
+static uint32_t max_hashtags__bits = 21;
static uint32_t hashtags__fn(Dwarf_Off key)
{
@@ -2484,6 +2485,115 @@ static int cus__load_debug_types(struct cus *cus, struct conf_load *conf,
return 0;
}
+static bool cus__merging_cu(Dwarf *dw)
+{
+ uint8_t pointer_size, offset_size;
+ Dwarf_Off off = 0, noff;
+ size_t cuhl;
+ int cnt = 0;
+
+ /*
+ * Just checking the first cu is not enough.
+ * In linux, some C files may have LTO is disabled, e.g.,
+ * e242db40be27 x86, vdso: disable LTO only for vDSO
+ * d2dcd3e37475 x86, cpu: disable LTO for cpu.c
+ * Fortunately, disabling LTO for a particular file in a LTO build
+ * is rather an exception. Iterating 5 cu's to check whether
+ * LTO is used or not should be enough.
+ */
+ while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
+ &offset_size) == 0) {
+ Dwarf_Die die_mem;
+ Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem);
+
+ if (cu_die == NULL)
+ break;
+
+ if (++cnt > 5)
+ break;
+
+ const char *producer = attr_string(cu_die, DW_AT_producer);
+ if (strstr(producer, "clang version") != NULL &&
+ strstr(producer, "-flto") != NULL)
+ return true;
+
+ off = noff;
+ }
+
+ return false;
+}
+
+static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
+ Dwfl_Module *mod, Dwarf *dw, Elf *elf,
+ const char *filename,
+ const unsigned char *build_id,
+ int build_id_len,
+ struct dwarf_cu *type_dcu)
+{
+ uint8_t pointer_size, offset_size;
+ struct dwarf_cu *dcu = NULL;
+ Dwarf_Off off = 0, noff;
+ struct cu *cu = NULL;
+ size_t cuhl;
+
+ while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
+ &offset_size) == 0) {
+ Dwarf_Die die_mem;
+ Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem);
+
+ if (cu_die == NULL)
+ break;
+
+ if (cu == NULL) {
+ cu = cu__new("", pointer_size, build_id, build_id_len,
+ filename);
+ if (cu == NULL || cu__set_common(cu, conf, mod, elf) != 0)
+ return 1;
+
+ dcu = malloc(sizeof(struct dwarf_cu));
+ if (dcu == NULL)
+ return 1;
+
+ /* Merged cu tends to need a lot more memory.
+ * Let us start with max_hashtags__bits and
+ * go down to find a proper hashtag bit value.
+ */
+ uint32_t default_hbits = hashtags__bits;
+ for (hashtags__bits = max_hashtags__bits;
+ hashtags__bits >= default_hbits;
+ hashtags__bits--) {
+ if (dwarf_cu__init(dcu) == 0)
+ break;
+ }
+ if (hashtags__bits < default_hbits)
+ return 1;
+
+ dcu->cu = cu;
+ dcu->type_unit = type_dcu;
+ cu->priv = dcu;
+ cu->dfops = &dwarf__ops;
+ cu->language = attr_numeric(cu_die, DW_AT_language);
+ }
+
+ Dwarf_Die child;
+ if (dwarf_child(cu_die, &child) == 0) {
+ if (die__process_unit(&child, cu) != 0)
+ return 1;
+ }
+
+ off = noff;
+ }
+
+ /* process merged cu */
+ if (cu__recode_dwarf_types(cu) != LSK__KEEPIT)
+ return 1;
+ if (finalize_cu_immediately(cus, cu, dcu, conf)
+ == LSK__STOP_LOADING)
+ return 1;
+
+ return 0;
+}
+
static int cus__load_module(struct cus *cus, struct conf_load *conf,
Dwfl_Module *mod, Dwarf *dw, Elf *elf,
const char *filename)
@@ -2518,6 +2628,15 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
}
}
+ if (cus__merging_cu(dw)) {
+ res = cus__merge_and_process_cu(cus, conf, mod, dw, elf, filename,
+ build_id, build_id_len,
+ type_cu ? &type_dcu : NULL);
+ if (res)
+ return res;
+ goto out;
+ }
+
while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
&offset_size) == 0) {
Dwarf_Die die_mem;
@@ -2557,6 +2676,7 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
off = noff;
}
+out:
if (type_lsk == LSK__DELETE)
cu__delete(type_cu);
--
2.30.2
^ permalink raw reply related [flat|nested] 5+ messages in thread