dwarves.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole
@ 2022-01-20  1:08 Kui-Feng Lee
  2022-01-20  1:08 ` [PATCH dwarves 1/2] dwarf_loader: Prepare and pass per-thread data to worker threads Kui-Feng Lee
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Kui-Feng Lee @ 2022-01-20  1:08 UTC (permalink / raw)
  To: dwarves, arnaldo.melo; +Cc: ast, daniel, andrii, Kui-Feng Lee

Creating an instance of btf for each worker thread allows
steal-function provided by pahole to add type info on multiple threads
without a lock.  The main thread merges the results of worker threads
to the primary instance.

Copying data from per-thread btf instances to the primary instance is
expensive now.  However, there is a patch landed at the bpf-next
repository. [1] With the patch for bpf-next and this patch, they drop
total runtime to 5.4s from 6.0s with "-j4" on my device to generate
BTF for Linux.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=d81283d27266

Kui-Feng Lee (2):
  dwarf_loader: Prepare and pass per-thread data to worker threads.
  pahole: Use per-thread btf instances to avoid mutex locking.

 btf_encoder.c  |   5 +++
 btf_encoder.h  |   2 +
 btf_loader.c   |   2 +-
 ctf_loader.c   |   2 +-
 dwarf_loader.c |  58 ++++++++++++++++++------
 dwarves.h      |   9 +++-
 pahole.c       | 120 ++++++++++++++++++++++++++++++++++++++++++++++---
 pdwtags.c      |   3 +-
 pfunct.c       |   4 +-
 9 files changed, 180 insertions(+), 25 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH dwarves 1/2] dwarf_loader: Prepare and pass per-thread data to worker threads.
  2022-01-20  1:08 [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Kui-Feng Lee
@ 2022-01-20  1:08 ` Kui-Feng Lee
  2022-01-20  1:08 ` [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking Kui-Feng Lee
  2022-01-20 19:59 ` [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Andrii Nakryiko
  2 siblings, 0 replies; 6+ messages in thread
From: Kui-Feng Lee @ 2022-01-20  1:08 UTC (permalink / raw)
  To: dwarves, arnaldo.melo; +Cc: ast, daniel, andrii, Kui-Feng Lee

Add interfaces to allow users of dwarf_loader to prepare and pass
per-thread data to steal-functions running on worker threads.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 btf_loader.c   |  2 +-
 ctf_loader.c   |  2 +-
 dwarf_loader.c | 58 +++++++++++++++++++++++++++++++++++++++-----------
 dwarves.h      |  9 ++++++--
 pahole.c       |  3 ++-
 pdwtags.c      |  3 ++-
 pfunct.c       |  4 +++-
 7 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/btf_loader.c b/btf_loader.c
index 7a5b16ff393e..b61cadd55127 100644
--- a/btf_loader.c
+++ b/btf_loader.c
@@ -624,7 +624,7 @@ static int cus__load_btf(struct cus *cus, struct conf_load *conf, const char *fi
 	 * The app stole this cu, possibly deleting it,
 	 * so forget about it
 	 */
-	if (conf && conf->steal && conf->steal(cu, conf))
+	if (conf && conf->steal && conf->steal(cu, conf, NULL))
 		return 0;
 
 	cus__add(cus, cu);
diff --git a/ctf_loader.c b/ctf_loader.c
index 7c34739afdce..de6d4dbfce97 100644
--- a/ctf_loader.c
+++ b/ctf_loader.c
@@ -722,7 +722,7 @@ int ctf__load_file(struct cus *cus, struct conf_load *conf,
 	 * The app stole this cu, possibly deleting it,
 	 * so forget about it
 	 */
-	if (conf && conf->steal && conf->steal(cu, conf))
+	if (conf && conf->steal && conf->steal(cu, conf, NULL))
 		return 0;
 
 	cus__add(cus, cu);
diff --git a/dwarf_loader.c b/dwarf_loader.c
index e30b03c1c541..ed415d2db11f 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -2682,18 +2682,18 @@ static int class_member__cache_byte_size(struct tag *tag, struct cu *cu,
 	return 0;
 }
 
-static int cu__finalize(struct cu *cu, struct conf_load *conf)
+static int cu__finalize(struct cu *cu, struct conf_load *conf, void *thr_data)
 {
 	cu__for_all_tags(cu, class_member__cache_byte_size, conf);
 	if (conf && conf->steal) {
-		return conf->steal(cu, conf);
+		return conf->steal(cu, conf, thr_data);
 	}
 	return LSK__KEEPIT;
 }
 
-static int cus__finalize(struct cus *cus, struct cu *cu, struct conf_load *conf)
+static int cus__finalize(struct cus *cus, struct cu *cu, struct conf_load *conf, void *thr_data)
 {
-	int lsk = cu__finalize(cu, conf);
+	int lsk = cu__finalize(cu, conf, thr_data);
 	switch (lsk) {
 	case LSK__DELETE:
 		cu__delete(cu);
@@ -2862,7 +2862,13 @@ struct dwarf_cus {
 	struct dwarf_cu	    *type_dcu;
 };
 
-static int dwarf_cus__create_and_process_cu(struct dwarf_cus *dcus, Dwarf_Die *cu_die, uint8_t pointer_size)
+struct dwarf_thread {
+	struct dwarf_cus	*dcus;
+	void			*data;
+};
+
+static int dwarf_cus__create_and_process_cu(struct dwarf_cus *dcus, Dwarf_Die *cu_die,
+					    uint8_t pointer_size, void *thr_data)
 {
 	/*
 	 * DW_AT_name in DW_TAG_compile_unit can be NULL, first seen in:
@@ -2884,7 +2890,7 @@ static int dwarf_cus__create_and_process_cu(struct dwarf_cus *dcus, Dwarf_Die *c
 	cu->dfops = &dwarf__ops;
 
 	if (die__process_and_recode(cu_die, cu, dcus->conf) != 0 ||
-	    cus__finalize(dcus->cus, cu, dcus->conf) == LSK__STOP_LOADING)
+	    cus__finalize(dcus->cus, cu, dcus->conf, thr_data) == LSK__STOP_LOADING)
 		return DWARF_CB_ABORT;
 
        return DWARF_CB_OK;
@@ -2918,7 +2924,8 @@ out_unlock:
 
 static void *dwarf_cus__process_cu_thread(void *arg)
 {
-	struct dwarf_cus *dcus = arg;
+	struct dwarf_thread *dthr = arg;
+	struct dwarf_cus *dcus = dthr->dcus;
 	uint8_t pointer_size, offset_size;
 	Dwarf_Die die_mem, *cu_die;
 
@@ -2926,11 +2933,13 @@ static void *dwarf_cus__process_cu_thread(void *arg)
 		if (cu_die == NULL)
 			break;
 
-		if (dwarf_cus__create_and_process_cu(dcus, cu_die, pointer_size) == DWARF_CB_ABORT)
+		if (dwarf_cus__create_and_process_cu(dcus, cu_die,
+						     pointer_size, dthr->data) == DWARF_CB_ABORT)
 			goto out_abort;
 	}
 
-	if (dcus->conf->thread_exit && dcus->conf->thread_exit() != 0)
+	if (dcus->conf->thread_exit &&
+	    dcus->conf->thread_exit(dcus->conf, dthr->data) != 0)
 		goto out_abort;
 
 	return (void *)DWARF_CB_OK;
@@ -2941,10 +2950,25 @@ out_abort:
 static int dwarf_cus__threaded_process_cus(struct dwarf_cus *dcus)
 {
 	pthread_t threads[dcus->conf->nr_jobs];
+	struct dwarf_thread dthr[dcus->conf->nr_jobs];
+	void *thread_data[dcus->conf->nr_jobs];
+	int res;
 	int i;
 
+	if (dcus->conf->threads_prepare) {
+		res = dcus->conf->threads_prepare(dcus->conf, dcus->conf->nr_jobs, thread_data);
+		if (res != 0)
+			return res;
+	} else
+		memset(thread_data, 0, sizeof(void *) * dcus->conf->nr_jobs);
+
 	for (i = 0; i < dcus->conf->nr_jobs; ++i) {
-		dcus->error = pthread_create(&threads[i], NULL, dwarf_cus__process_cu_thread, dcus);
+		dthr[i].dcus = dcus;
+		dthr[i].data = thread_data[i];
+
+		dcus->error = pthread_create(&threads[i], NULL,
+					     dwarf_cus__process_cu_thread,
+					     &dthr[i]);
 		if (dcus->error)
 			goto out_join;
 	}
@@ -2960,6 +2984,13 @@ out_join:
 			dcus->error = (long)res;
 	}
 
+	if (dcus->conf->threads_collect) {
+		res = dcus->conf->threads_collect(dcus->conf, dcus->conf->nr_jobs,
+						  thread_data, dcus->error);
+		if (dcus->error == 0)
+			dcus->error = res;
+	}
+
 	return dcus->error;
 }
 
@@ -2976,7 +3007,8 @@ static int __dwarf_cus__process_cus(struct dwarf_cus *dcus)
 		if (cu_die == NULL)
 			break;
 
-		if (dwarf_cus__create_and_process_cu(dcus, cu_die, pointer_size) == DWARF_CB_ABORT)
+		if (dwarf_cus__create_and_process_cu(dcus, cu_die,
+						     pointer_size, NULL) == DWARF_CB_ABORT)
 			return DWARF_CB_ABORT;
 
 		dcus->off = noff;
@@ -3070,7 +3102,7 @@ static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
 	if (cu__resolve_func_ret_types(cu) != LSK__KEEPIT)
 		goto out_abort;
 
-	if (cus__finalize(cus, cu, conf) == LSK__STOP_LOADING)
+	if (cus__finalize(cus, cu, conf, NULL) == LSK__STOP_LOADING)
 		goto out_abort;
 
 	return 0;
@@ -3102,7 +3134,7 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
 	}
 
 	if (type_cu != NULL) {
-		type_lsk = cu__finalize(type_cu, conf);
+		type_lsk = cu__finalize(type_cu, conf, NULL);
 		if (type_lsk == LSK__KEEPIT) {
 			cus__add(cus, type_cu);
 		}
diff --git a/dwarves.h b/dwarves.h
index 52d162d67456..de152f9b64cf 100644
--- a/dwarves.h
+++ b/dwarves.h
@@ -48,8 +48,9 @@ struct conf_fprintf;
  */
 struct conf_load {
 	enum load_steal_kind	(*steal)(struct cu *cu,
-					 struct conf_load *conf);
-	int			(*thread_exit)(void);
+					 struct conf_load *conf,
+					 void *thr_data);
+	int			(*thread_exit)(struct conf_load *conf, void *thr_data);
 	void			*cookie;
 	char			*format_path;
 	int			nr_jobs;
@@ -70,6 +71,10 @@ struct conf_load {
 	const char		*kabi_prefix;
 	struct btf		*base_btf;
 	struct conf_fprintf	*conf_fprintf;
+	int			(*threads_prepare)(struct conf_load *conf, int nr_threads,
+						   void **thr_data);
+	int			(*threads_collect)(struct conf_load *conf, int nr_threads,
+						   void **thr_data, int error);
 };
 
 /** struct conf_fprintf - hints to the __fprintf routines
diff --git a/pahole.c b/pahole.c
index f3a51cb2fe74..f3eeaaca4cdf 100644
--- a/pahole.c
+++ b/pahole.c
@@ -2799,7 +2799,8 @@ out:
 static struct type_instance *header;
 
 static enum load_steal_kind pahole_stealer(struct cu *cu,
-					   struct conf_load *conf_load)
+					   struct conf_load *conf_load,
+					   void *thr_data)
 {
 	int ret = LSK__DELETE;
 
diff --git a/pdwtags.c b/pdwtags.c
index 2b5ba1bf6745..8b1d6f1c96cb 100644
--- a/pdwtags.c
+++ b/pdwtags.c
@@ -72,7 +72,8 @@ static int cu__emit_tags(struct cu *cu)
 }
 
 static enum load_steal_kind pdwtags_stealer(struct cu *cu,
-					    struct conf_load *conf_load __maybe_unused)
+					    struct conf_load *conf_load __maybe_unused,
+					    void *thr_data __maybe_unused)
 {
 	cu__emit_tags(cu);
 	return LSK__DELETE;
diff --git a/pfunct.c b/pfunct.c
index 5485622e639b..314915b774f4 100644
--- a/pfunct.c
+++ b/pfunct.c
@@ -489,7 +489,9 @@ int elf_symtabs__show(char *filenames[])
 	return EXIT_SUCCESS;
 }
 
-static enum load_steal_kind pfunct_stealer(struct cu *cu, struct conf_load *conf_load __maybe_unused)
+static enum load_steal_kind pfunct_stealer(struct cu *cu,
+					   struct conf_load *conf_load __maybe_unused,
+					   void *thr_data __maybe_unused)
 {
 
 	if (function_name) {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking.
  2022-01-20  1:08 [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Kui-Feng Lee
  2022-01-20  1:08 ` [PATCH dwarves 1/2] dwarf_loader: Prepare and pass per-thread data to worker threads Kui-Feng Lee
@ 2022-01-20  1:08 ` Kui-Feng Lee
  2022-01-20 20:16   ` Andrii Nakryiko
  2022-01-20 19:59 ` [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Andrii Nakryiko
  2 siblings, 1 reply; 6+ messages in thread
From: Kui-Feng Lee @ 2022-01-20  1:08 UTC (permalink / raw)
  To: dwarves, arnaldo.melo; +Cc: ast, daniel, andrii, Kui-Feng Lee

Create an instance of btf for each worker thread, and add type info to
the local btf instance in the steal-function of pahole without mutex
acquiring.  Once finished with all worker threads, merge all
per-thread btf instances to the primary btf instance.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 btf_encoder.c |   5 +++
 btf_encoder.h |   2 +
 pahole.c      | 117 +++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/btf_encoder.c b/btf_encoder.c
index 9d015f304e92..96a6c1d5fba0 100644
--- a/btf_encoder.c
+++ b/btf_encoder.c
@@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
 out:
 	return err;
 }
+
+struct btf *btf_encoder__get_btf(struct btf_encoder *encoder)
+{
+	return encoder->btf;
+}
diff --git a/btf_encoder.h b/btf_encoder.h
index f133b0d7674d..e141ec2e677d 100644
--- a/btf_encoder.h
+++ b/btf_encoder.h
@@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
 
 struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
 
+struct btf *btf_encoder__get_btf(struct btf_encoder *encoder);
+
 #endif /* _BTF_ENCODER_H_ */
diff --git a/pahole.c b/pahole.c
index f3eeaaca4cdf..56bd7dc4e62e 100644
--- a/pahole.c
+++ b/pahole.c
@@ -2798,6 +2798,73 @@ out:
 
 static struct type_instance *header;
 
+struct thread_data {
+	struct btf *btf;
+	struct btf_encoder *encoder;
+};
+
+static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
+{
+	int i;
+	struct thread_data *threads =
+		(struct thread_data *)calloc(sizeof(struct thread_data), nr_threads);
+
+	for (i = 0; i < nr_threads; i++)
+		thr_data[i] = threads + i;
+
+	return 0;
+}
+
+static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
+{
+	struct thread_data *thread = (struct thread_data *)thr_data;
+
+	if (thread == NULL)
+		return 0;
+
+	/*
+	 * It will call dedup here in the future once we have fixed
+	 * the known performance regression of libbpf.
+	 */
+
+	return 0;
+}
+
+static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
+				  int error)
+{
+	struct thread_data **threads = (struct thread_data **)thr_data;
+	int i;
+	int err = 0;
+
+	if (error == 0) {
+		for (i = 0; i < nr_threads; i++) {
+			/*
+			 * Merge cotnent of the btf instances of
+			 * worker threads to the btf instance of the
+			 * primary btf_encoder.
+			 */
+			if (!threads[i]->btf || threads[i]->encoder == btf_encoder)
+				continue; /* The primary btf_encoder */
+			err = btf__add_btf(btf_encoder__get_btf(btf_encoder), threads[i]->btf);
+			if (err < 0)
+				goto out;
+			err = 0;
+			btf_encoder__delete(threads[i]->encoder);
+			threads[i]->encoder = NULL;
+		}
+	}
+
+out:
+	for (i = 0; i < nr_threads; i++) {
+		if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
+			btf_encoder__delete(threads[i]->encoder);
+	}
+	free(threads[0]);
+
+	return err;
+}
+
 static enum load_steal_kind pahole_stealer(struct cu *cu,
 					   struct conf_load *conf_load,
 					   void *thr_data)
@@ -2819,8 +2886,8 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
 
 	if (btf_encode) {
 		static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
+		struct btf_encoder *encoder;
 
-		pthread_mutex_lock(&btf_lock);
 		/*
 		 * FIXME:
 		 *
@@ -2828,21 +2895,58 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
 		 * point we'll have cu->elf setup...
 		 */
 		if (!btf_encoder) {
-			btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
-						       btf_encode_force, btf_gen_floats, global_verbose);
+			int new_btf_encoder = 0;
+
+			pthread_mutex_lock(&btf_lock);
+			if (!btf_encoder) {
+				/*
+				 * btf_encoder is the primary encoder.
+				 * And, it is used by the thread
+				 * create it.
+				 */
+				btf_encoder = btf_encoder__new(cu, detached_btf_filename,
+							       conf_load->base_btf,
+							       skip_encoding_btf_vars,
+							       btf_encode_force,
+							       btf_gen_floats, global_verbose);
+				new_btf_encoder = 1;
+			}
+			pthread_mutex_unlock(&btf_lock);
 			if (btf_encoder == NULL) {
 				ret = LSK__STOP_LOADING;
 				goto out_btf;
 			}
+			if (new_btf_encoder && thr_data) {
+				struct thread_data *thread = (struct thread_data *)thr_data;
+
+				thread->encoder = btf_encoder;
+				thread->btf = btf_encoder__get_btf(btf_encoder);
+			}
 		}
 
-		if (btf_encoder__encode_cu(btf_encoder, cu)) {
+		if (thr_data) {
+			struct thread_data *thread = (struct thread_data *)thr_data;
+
+			if (thread->encoder == NULL) {
+				thread->encoder =
+					btf_encoder__new(cu, detached_btf_filename,
+							 NULL,
+							 skip_encoding_btf_vars,
+							 btf_encode_force,
+							 btf_gen_floats,
+							 global_verbose);
+				thread->btf = btf_encoder__get_btf(thread->encoder);
+			}
+			encoder = thread->encoder;
+		} else
+			encoder = btf_encoder;
+
+		if (btf_encoder__encode_cu(encoder, cu)) {
 			fprintf(stderr, "Encountered error while encoding BTF.\n");
 			exit(1);
 		}
 		ret = LSK__DELETE;
 out_btf:
-		pthread_mutex_unlock(&btf_lock);
 		return ret;
 	}
 #if 0
@@ -3207,6 +3311,9 @@ int main(int argc, char *argv[])
 	memset(tab, ' ', sizeof(tab) - 1);
 
 	conf_load.steal = pahole_stealer;
+	conf_load.thread_exit = pahole_thread_exit;
+	conf_load.threads_prepare = pahole_threads_prepare;
+	conf_load.threads_collect = pahole_threads_collect;
 
 	// Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
 	if (conf.header_type && !class_name && prettify_input) {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole
  2022-01-20  1:08 [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Kui-Feng Lee
  2022-01-20  1:08 ` [PATCH dwarves 1/2] dwarf_loader: Prepare and pass per-thread data to worker threads Kui-Feng Lee
  2022-01-20  1:08 ` [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking Kui-Feng Lee
@ 2022-01-20 19:59 ` Andrii Nakryiko
  2022-01-20 20:46   ` Arnaldo Carvalho de Melo
  2 siblings, 1 reply; 6+ messages in thread
From: Andrii Nakryiko @ 2022-01-20 19:59 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: dwarves, Arnaldo Carvalho de Melo, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, bpf

+cc bpf@vger.kernel.org

On Wed, Jan 19, 2022 at 5:08 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Creating an instance of btf for each worker thread allows
> steal-function provided by pahole to add type info on multiple threads
> without a lock.  The main thread merges the results of worker threads
> to the primary instance.
>
> Copying data from per-thread btf instances to the primary instance is
> expensive now.  However, there is a patch landed at the bpf-next
> repository. [1] With the patch for bpf-next and this patch, they drop
> total runtime to 5.4s from 6.0s with "-j4" on my device to generate
> BTF for Linux.

Just a few more data points. I've tried this locally with 40 cores,
both with and without the libbpf's btf__add_btf() optimization.

BASELINE NON-PARALLEL
=====================
$ time ./pahole -J ~/linux-build/default/vmlinux
./pahole -J ~/linux-build/default/vmlinux  11.17s user 0.66s system
99% cpu 11.832 total

BASELINE PARALLEL
=================
$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
./pahole -j40 -J ~/linux-build/default/vmlinux  13.85s user 0.75s
system 290% cpu 5.023 total

THESE PATCHES WITHOUT LIBBPF SPEED-UP
=====================================
$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
./pahole -j40 -J ~/linux-build/default/vmlinux  25.94s user 1.15s
system 685% cpu 3.954 total

THESE PATCHES WITH LATEST LIBBPF SPEED-UP
=========================================
$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
./pahole -j40 -J ~/linux-build/default/vmlinux  27.49s user 1.08s
system 858% cpu 3.328 total


So on 40 cores, it's a speed up from 11.8 seconds non-parallel, to 5s
parallel without Kui-Feng's changes, to 4s with Kui-Feng's changes, to
3.3s after libbpf update (I did it locally, will sync this to Github
today).

4x speed up, not bad!

But parallel mode is not currently enabled in kernel build, let's
enable parallel mode and save those seconds during the kernel build!

>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=d81283d27266
>
> Kui-Feng Lee (2):
>   dwarf_loader: Prepare and pass per-thread data to worker threads.
>   pahole: Use per-thread btf instances to avoid mutex locking.
>
>  btf_encoder.c  |   5 +++
>  btf_encoder.h  |   2 +
>  btf_loader.c   |   2 +-
>  ctf_loader.c   |   2 +-
>  dwarf_loader.c |  58 ++++++++++++++++++------
>  dwarves.h      |   9 +++-
>  pahole.c       | 120 ++++++++++++++++++++++++++++++++++++++++++++++---
>  pdwtags.c      |   3 +-
>  pfunct.c       |   4 +-
>  9 files changed, 180 insertions(+), 25 deletions(-)
>
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking.
  2022-01-20  1:08 ` [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking Kui-Feng Lee
@ 2022-01-20 20:16   ` Andrii Nakryiko
  0 siblings, 0 replies; 6+ messages in thread
From: Andrii Nakryiko @ 2022-01-20 20:16 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: dwarves, Arnaldo Carvalho de Melo, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko

On Wed, Jan 19, 2022 at 5:09 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Create an instance of btf for each worker thread, and add type info to
> the local btf instance in the steal-function of pahole without mutex
> acquiring.  Once finished with all worker threads, merge all
> per-thread btf instances to the primary btf instance.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---

Just a bunch of nits, I'll relay to Arnaldo to verify the logic is sound

>  btf_encoder.c |   5 +++
>  btf_encoder.h |   2 +
>  pahole.c      | 117 +++++++++++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/btf_encoder.c b/btf_encoder.c
> index 9d015f304e92..96a6c1d5fba0 100644
> --- a/btf_encoder.c
> +++ b/btf_encoder.c
> @@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
>  out:
>         return err;
>  }
> +
> +struct btf *btf_encoder__get_btf(struct btf_encoder *encoder)

nit: btf_encoder__btf() is how we'd call this in libbpf, not sure
which naming convention Arnaldo prefers

> +{
> +       return encoder->btf;
> +}
> diff --git a/btf_encoder.h b/btf_encoder.h
> index f133b0d7674d..e141ec2e677d 100644
> --- a/btf_encoder.h
> +++ b/btf_encoder.h
> @@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
>
>  struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
>
> +struct btf *btf_encoder__get_btf(struct btf_encoder *encoder);
> +
>  #endif /* _BTF_ENCODER_H_ */
> diff --git a/pahole.c b/pahole.c
> index f3eeaaca4cdf..56bd7dc4e62e 100644
> --- a/pahole.c
> +++ b/pahole.c
> @@ -2798,6 +2798,73 @@ out:
>
>  static struct type_instance *header;
>
> +struct thread_data {
> +       struct btf *btf;
> +       struct btf_encoder *encoder;
> +};
> +
> +static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
> +{
> +       int i;
> +       struct thread_data *threads =
> +               (struct thread_data *)calloc(sizeof(struct thread_data), nr_threads);

nit: split variable declaration and initialization, if it's not
fitting on a single line? You also don't need casting of calloc()
result in C.

> +
> +       for (i = 0; i < nr_threads; i++)
> +               thr_data[i] = threads + i;
> +
> +       return 0;
> +}
> +
> +static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
> +{
> +       struct thread_data *thread = (struct thread_data *)thr_data;

same, casting not needed

> +
> +       if (thread == NULL)

nit: !thread

> +               return 0;
> +
> +       /*
> +        * It will call dedup here in the future once we have fixed
> +        * the known performance regression of libbpf.
> +        */

uhm... it's not a regression, it's BTF dedup algorithm not really
supporting partial dedup and subsequent dedup. I'd drop this comment
for now, it just adds confusion, IMO.

> +
> +       return 0;
> +}
> +
> +static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
> +                                 int error)
> +{
> +       struct thread_data **threads = (struct thread_data **)thr_data;
> +       int i;
> +       int err = 0;
> +
> +       if (error == 0) {

reduce nesting:

if (error)
    goto out;

> +               for (i = 0; i < nr_threads; i++) {
> +                       /*
> +                        * Merge cotnent of the btf instances of

typo: content

> +                        * worker threads to the btf instance of the
> +                        * primary btf_encoder.
> +                        */
> +                       if (!threads[i]->btf || threads[i]->encoder == btf_encoder)
> +                               continue; /* The primary btf_encoder */
> +                       err = btf__add_btf(btf_encoder__get_btf(btf_encoder), threads[i]->btf);

if you do btf__add_btf() in pahole_thread_exit() (under mutex, but
that's fine), you can offload some time spent in btf__Add_btf() to
worker threads. Have you tried that? I bet you'll get a bit of speed
up that way as well.

> +                       if (err < 0)
> +                               goto out;
> +                       err = 0;

move this to after the for loop

> +                       btf_encoder__delete(threads[i]->encoder);
> +                       threads[i]->encoder = NULL;
> +               }
> +       }
> +
> +out:
> +       for (i = 0; i < nr_threads; i++) {
> +               if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
> +                       btf_encoder__delete(threads[i]->encoder);
> +       }
> +       free(threads[0]);
> +
> +       return err;
> +}
> +
>  static enum load_steal_kind pahole_stealer(struct cu *cu,
>                                            struct conf_load *conf_load,
>                                            void *thr_data)
> @@ -2819,8 +2886,8 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
>
>         if (btf_encode) {
>                 static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
> +               struct btf_encoder *encoder;
>
> -               pthread_mutex_lock(&btf_lock);
>                 /*
>                  * FIXME:
>                  *
> @@ -2828,21 +2895,58 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
>                  * point we'll have cu->elf setup...
>                  */
>                 if (!btf_encoder) {
> -                       btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
> -                                                      btf_encode_force, btf_gen_floats, global_verbose);
> +                       int new_btf_encoder = 0;

this is boolean variable, use bool

> +
> +                       pthread_mutex_lock(&btf_lock);
> +                       if (!btf_encoder) {
> +                               /*
> +                                * btf_encoder is the primary encoder.
> +                                * And, it is used by the thread
> +                                * create it.
> +                                */
> +                               btf_encoder = btf_encoder__new(cu, detached_btf_filename,
> +                                                              conf_load->base_btf,
> +                                                              skip_encoding_btf_vars,
> +                                                              btf_encode_force,
> +                                                              btf_gen_floats, global_verbose);
> +                               new_btf_encoder = 1;
> +                       }
> +                       pthread_mutex_unlock(&btf_lock);
>                         if (btf_encoder == NULL) {
>                                 ret = LSK__STOP_LOADING;
>                                 goto out_btf;
>                         }
> +                       if (new_btf_encoder && thr_data) {
> +                               struct thread_data *thread = (struct thread_data *)thr_data;
> +
> +                               thread->encoder = btf_encoder;
> +                               thread->btf = btf_encoder__get_btf(btf_encoder);

maybe do this initialization right after you created btf_encode above
(under lock is fine, there is nothing slow here)

> +                       }
>                 }
>
> -               if (btf_encoder__encode_cu(btf_encoder, cu)) {
> +               if (thr_data) {
> +                       struct thread_data *thread = (struct thread_data *)thr_data;
> +
> +                       if (thread->encoder == NULL) {
> +                               thread->encoder =
> +                                       btf_encoder__new(cu, detached_btf_filename,
> +                                                        NULL,
> +                                                        skip_encoding_btf_vars,
> +                                                        btf_encode_force,
> +                                                        btf_gen_floats,
> +                                                        global_verbose);
> +                               thread->btf = btf_encoder__get_btf(thread->encoder);
> +                       }
> +                       encoder = thread->encoder;
> +               } else
> +                       encoder = btf_encoder;
> +
> +               if (btf_encoder__encode_cu(encoder, cu)) {
>                         fprintf(stderr, "Encountered error while encoding BTF.\n");
>                         exit(1);
>                 }
>                 ret = LSK__DELETE;
>  out_btf:
> -               pthread_mutex_unlock(&btf_lock);
>                 return ret;
>         }
>  #if 0
> @@ -3207,6 +3311,9 @@ int main(int argc, char *argv[])
>         memset(tab, ' ', sizeof(tab) - 1);
>
>         conf_load.steal = pahole_stealer;
> +       conf_load.thread_exit = pahole_thread_exit;
> +       conf_load.threads_prepare = pahole_threads_prepare;
> +       conf_load.threads_collect = pahole_threads_collect;
>
>         // Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
>         if (conf.header_type && !class_name && prettify_input) {
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole
  2022-01-20 19:59 ` [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Andrii Nakryiko
@ 2022-01-20 20:46   ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 6+ messages in thread
From: Arnaldo Carvalho de Melo @ 2022-01-20 20:46 UTC (permalink / raw)
  To: Andrii Nakryiko, Kui-Feng Lee
  Cc: dwarves, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf



On January 20, 2022 4:59:08 PM GMT-03:00, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>+cc bpf@vger.kernel.org
>
>On Wed, Jan 19, 2022 at 5:08 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>>
>> Creating an instance of btf for each worker thread allows
>> steal-function provided by pahole to add type info on multiple threads
>> without a lock.  The main thread merges the results of worker threads
>> to the primary instance.
>>
>> Copying data from per-thread btf instances to the primary instance is
>> expensive now.  However, there is a patch landed at the bpf-next
>> repository. [1] With the patch for bpf-next and this patch, they drop
>> total runtime to 5.4s from 6.0s with "-j4" on my device to generate
>> BTF for Linux.
>
>Just a few more data points. I've tried this locally with 40 cores,
>both with and without the libbpf's btf__add_btf() optimization.
>
>BASELINE NON-PARALLEL
>=====================
>$ time ./pahole -J ~/linux-build/default/vmlinux
>./pahole -J ~/linux-build/default/vmlinux  11.17s user 0.66s system
>99% cpu 11.832 total
>
>BASELINE PARALLEL
>=================
>$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
>./pahole -j40 -J ~/linux-build/default/vmlinux  13.85s user 0.75s
>system 290% cpu 5.023 total
>
>THESE PATCHES WITHOUT LIBBPF SPEED-UP
>=====================================
>$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
>./pahole -j40 -J ~/linux-build/default/vmlinux  25.94s user 1.15s
>system 685% cpu 3.954 total
>
>THESE PATCHES WITH LATEST LIBBPF SPEED-UP
>=========================================
>$ time ./pahole -j40 -J ~/linux-build/default/vmlinux
>./pahole -j40 -J ~/linux-build/default/vmlinux  27.49s user 1.08s
>system 858% cpu 3.328 total
>
>
>So on 40 cores, it's a speed up from 11.8 seconds non-parallel, to 5s
>parallel without Kui-Feng's changes, to 4s with Kui-Feng's changes, to
>3.3s after libbpf update (I did it locally, will sync this to Github
>today).
>
>4x speed up, not bad!

That's indeed excellent! From the limited review I've made so far it looks good, takes advantage of the leg work I did, I'll just break down the patches a bit more, look at the review comments from you and repost, I'll not change the patches, just split them a bit more, tomorrow morning.

- Arnaldo
>
>But parallel mode is not currently enabled in kernel build, let's
>enable parallel mode and save those seconds during the kernel build!



>
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=d81283d27266
>>
>> Kui-Feng Lee (2):
>>   dwarf_loader: Prepare and pass per-thread data to worker threads.
>>   pahole: Use per-thread btf instances to avoid mutex locking.
>>
>>  btf_encoder.c  |   5 +++
>>  btf_encoder.h  |   2 +
>>  btf_loader.c   |   2 +-
>>  ctf_loader.c   |   2 +-
>>  dwarf_loader.c |  58 ++++++++++++++++++------
>>  dwarves.h      |   9 +++-
>>  pahole.c       | 120 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  pdwtags.c      |   3 +-
>>  pfunct.c       |   4 +-
>>  9 files changed, 180 insertions(+), 25 deletions(-)
>>
>> --
>> 2.30.2
>>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-01-20 20:46 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-20  1:08 [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Kui-Feng Lee
2022-01-20  1:08 ` [PATCH dwarves 1/2] dwarf_loader: Prepare and pass per-thread data to worker threads Kui-Feng Lee
2022-01-20  1:08 ` [PATCH dwarves 2/2] pahole: Use per-thread btf instances to avoid mutex locking Kui-Feng Lee
2022-01-20 20:16   ` Andrii Nakryiko
2022-01-20 19:59 ` [PATCH dwarves 0/2] Parallelize BTF type info generating of pahole Andrii Nakryiko
2022-01-20 20:46   ` Arnaldo Carvalho de Melo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).