All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram
@ 2024-02-28  0:52 Namhyung Kim
  2024-02-28  0:52 ` [PATCH 1/4] perf annotate: Add a hashmap " Namhyung Kim
                   ` (5 more replies)
  0 siblings, 6 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  0:52 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Andi Kleen

Hello,

This is another series of memory optimization in perf annotate.

When perf annotate (or perf report/top with TUI) processes samples, it
needs to save the sample period (overhead) at instruction level.  For
now, it allocates an array to do that for the whole symbol when it
hits any new symbol.  This comes with a lot of waste since samples can
be very few and instructions span to multiple bytes.

For example, when a sample hits symbol 'foo' that has size of 100 and
that's the only sample falls into the symbol.  Then it needs to
allocate a symbol histogram (sym_hist) and the its size would be

  16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616

But actually it just needs 32 (header + sym_hist_entry) bytes.  Things
get worse if the symbol size is bigger (and it doesn't have many
samples in different places).  Also note that it needs separate
histogram for each event.

Let's split the sym_hist_entry and have it in a hash table so that it
can allocate only necessary entries.

No functional change intended.

Thanks,
Namhyung


Namhyung Kim (4):
  perf annotate: Add a hashmap for symbol histogram
  perf annotate: Calculate instruction overhead using hashmap
  perf annotate: Remove sym_hist.addr[] array
  perf annotate: Add comments in the data structures

 tools/perf/ui/gtk/annotate.c |  14 ++++-
 tools/perf/util/annotate.c   | 114 ++++++++++++++++++++++-------------
 tools/perf/util/annotate.h   |  86 +++++++++++++++++++++++---
 3 files changed, 158 insertions(+), 56 deletions(-)

-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] perf annotate: Add a hashmap for symbol histogram
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
@ 2024-02-28  0:52 ` Namhyung Kim
  2024-02-28  1:19   ` Ian Rogers
  2024-02-28  0:52 ` [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap Namhyung Kim
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  0:52 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Andi Kleen

Now symbol histogram uses an array to save per-offset sample counts.
But it wastes a lot of memory if the symbol has a few samples only.
Add a hashmap to save values only for actual samples.

For now, it has duplicate histogram (one in the existing array and
another in the new hash map).  Once it can convert to use the hash
in all places, we can get rid of the array later.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate.c | 40 +++++++++++++++++++++++++++++++++++++-
 tools/perf/util/annotate.h |  2 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 107b264fa41e..7a70e4d35c9b 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -38,6 +38,7 @@
 #include "arch/common.h"
 #include "namespaces.h"
 #include "thread.h"
+#include "hashmap.h"
 #include <regex.h>
 #include <linux/bitops.h>
 #include <linux/kernel.h>
@@ -863,6 +864,17 @@ bool arch__is(struct arch *arch, const char *name)
 	return !strcmp(arch->name, name);
 }
 
+/* symbol histogram: key = offset << 16 | evsel->core.idx */
+static size_t sym_hist_hash(long key, void *ctx __maybe_unused)
+{
+	return (key >> 16) + (key & 0xffff);
+}
+
+static bool sym_hist_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+	return key1 == key2;
+}
+
 static struct annotated_source *annotated_source__new(void)
 {
 	struct annotated_source *src = zalloc(sizeof(*src));
@@ -877,6 +889,8 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
 {
 	if (src == NULL)
 		return;
+
+	hashmap__free(src->samples);
 	zfree(&src->histograms);
 	free(src);
 }
@@ -909,6 +923,14 @@ static int annotated_source__alloc_histograms(struct annotated_source *src,
 	src->sizeof_sym_hist = sizeof_sym_hist;
 	src->nr_histograms   = nr_hists;
 	src->histograms	     = calloc(nr_hists, sizeof_sym_hist) ;
+
+	if (src->histograms == NULL)
+		return -1;
+
+	src->samples = hashmap__new(sym_hist_hash, sym_hist_equal, NULL);
+	if (src->samples == NULL)
+		zfree(&src->histograms);
+
 	return src->histograms ? 0 : -1;
 }
 
@@ -920,6 +942,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
 	if (notes->src != NULL) {
 		memset(notes->src->histograms, 0,
 		       notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+		hashmap__clear(notes->src->samples);
 	}
 	if (notes->branch && notes->branch->cycles_hist) {
 		memset(notes->branch->cycles_hist, 0,
@@ -983,8 +1006,10 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
 				      struct perf_sample *sample)
 {
 	struct symbol *sym = ms->sym;
+	long hash_key;
 	unsigned offset;
 	struct sym_hist *h;
+	struct sym_hist_entry *entry;
 
 	pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map__unmap_ip(ms->map, addr));
 
@@ -1002,15 +1027,28 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
 			 __func__, __LINE__, sym->name, sym->start, addr, sym->end, sym->type == STT_FUNC);
 		return -ENOMEM;
 	}
+
+	hash_key = offset << 16 | evidx;
+	if (!hashmap__find(src->samples, hash_key, &entry)) {
+		entry = zalloc(sizeof(*entry));
+		if (entry == NULL)
+			return -ENOMEM;
+
+		if (hashmap__add(src->samples, hash_key, entry) < 0)
+			return -ENOMEM;
+	}
+
 	h->nr_samples++;
 	h->addr[offset].nr_samples++;
 	h->period += sample->period;
 	h->addr[offset].period += sample->period;
+	entry->nr_samples++;
+	entry->period += sample->period;
 
 	pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
 		  ", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
 		  sym->start, sym->name, addr, addr - sym->start, evidx,
-		  h->addr[offset].nr_samples, h->addr[offset].period);
+		  entry->nr_samples, entry->period);
 	return 0;
 }
 
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 94435607c958..a2b0c8210740 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -12,6 +12,7 @@
 #include "symbol_conf.h"
 #include "mutex.h"
 #include "spark.h"
+#include "hashmap.h"
 
 struct hist_browser_timer;
 struct hist_entry;
@@ -280,6 +281,7 @@ struct annotated_source {
 	size_t			sizeof_sym_hist;
 	struct sym_hist		*histograms;
 	struct annotation_line	**offsets;
+	struct hashmap	   	*samples;
 	int    			nr_histograms;
 	int			nr_entries;
 	int			nr_asm_entries;
-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
  2024-02-28  0:52 ` [PATCH 1/4] perf annotate: Add a hashmap " Namhyung Kim
@ 2024-02-28  0:52 ` Namhyung Kim
  2024-02-28  5:43   ` Namhyung Kim
  2024-02-28  0:52 ` [PATCH 3/4] perf annotate: Remove sym_hist.addr[] array Namhyung Kim
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  0:52 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Andi Kleen

Use annotated_source.samples hashmap instead of addr array in the
struct sym_hist.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/ui/gtk/annotate.c | 14 +++++++++---
 tools/perf/util/annotate.c   | 44 ++++++++++++++++++++++++------------
 tools/perf/util/annotate.h   | 11 +++++++++
 3 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 394861245fd3..93ce3d47e47e 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -28,21 +28,29 @@ static const char *const col_names[] = {
 static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
 				 struct disasm_line *dl, int evidx)
 {
+	struct annotation *notes;
 	struct sym_hist *symhist;
+	struct sym_hist_entry *entry;
 	double percent = 0.0;
 	const char *markup;
 	int ret = 0;
+	u64 nr_samples = 0;
 
 	strcpy(buf, "");
 
 	if (dl->al.offset == (s64) -1)
 		return 0;
 
-	symhist = annotation__histogram(symbol__annotation(sym), evidx);
-	if (!symbol_conf.event_group && !symhist->addr[dl->al.offset].nr_samples)
+	notes = symbol__annotation(sym);
+	symhist = annotation__histogram(notes, evidx);
+	entry = annotated_source__hist_entry(notes->src, evidx, dl->al.offset);
+	if (entry)
+		nr_samples = entry->nr_samples;
+
+	if (!symbol_conf.event_group && nr_samples == 0)
 		return 0;
 
-	percent = 100.0 * symhist->addr[dl->al.offset].nr_samples / symhist->nr_samples;
+	percent = 100.0 * nr_samples / symhist->nr_samples;
 
 	markup = perf_gtk__get_percent_color(percent);
 	if (markup)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 7a70e4d35c9b..e7859f756252 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2368,17 +2368,25 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 	return err;
 }
 
-static void calc_percent(struct sym_hist *sym_hist,
-			 struct hists *hists,
+static void calc_percent(struct annotation *notes,
+			 struct evsel *evsel,
 			 struct annotation_data *data,
 			 s64 offset, s64 end)
 {
+	struct hists *hists = evsel__hists(evsel);
+	int evidx = evsel->core.idx;
+	struct sym_hist *sym_hist = annotation__histogram(notes, evidx);
 	unsigned int hits = 0;
 	u64 period = 0;
 
 	while (offset < end) {
-		hits   += sym_hist->addr[offset].nr_samples;
-		period += sym_hist->addr[offset].period;
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry) {
+			hits   += entry->nr_samples;
+			period += entry->period;
+		}
 		++offset;
 	}
 
@@ -2415,16 +2423,13 @@ static void annotation__calc_percent(struct annotation *notes,
 		end  = next ? next->offset : len;
 
 		for_each_group_evsel(evsel, leader) {
-			struct hists *hists = evsel__hists(evsel);
 			struct annotation_data *data;
-			struct sym_hist *sym_hist;
 
 			BUG_ON(i >= al->data_nr);
 
-			sym_hist = annotation__histogram(notes, evsel->core.idx);
 			data = &al->data[i++];
 
-			calc_percent(sym_hist, hists, data, al->offset, end);
+			calc_percent(notes, evsel, data, al->offset, end);
 		}
 	}
 }
@@ -2619,14 +2624,19 @@ static void print_summary(struct rb_root *root, const char *filename)
 
 static void symbol__annotate_hits(struct symbol *sym, struct evsel *evsel)
 {
+	int evidx = evsel->core.idx;
 	struct annotation *notes = symbol__annotation(sym);
-	struct sym_hist *h = annotation__histogram(notes, evsel->core.idx);
+	struct sym_hist *h = annotation__histogram(notes, evidx);
 	u64 len = symbol__size(sym), offset;
 
-	for (offset = 0; offset < len; ++offset)
-		if (h->addr[offset].nr_samples != 0)
+	for (offset = 0; offset < len; ++offset) {
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry && entry->nr_samples != 0)
 			printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
-			       sym->start + offset, h->addr[offset].nr_samples);
+			       sym->start + offset, entry->nr_samples);
+	}
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
 }
 
@@ -2855,8 +2865,14 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 
 	h->nr_samples = 0;
 	for (offset = 0; offset < len; ++offset) {
-		h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
-		h->nr_samples += h->addr[offset].nr_samples;
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry == NULL)
+			continue;
+
+		entry->nr_samples = entry->nr_samples * 7 / 8;
+		h->nr_samples += entry->nr_samples;
 	}
 }
 
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index a2b0c8210740..3362980a5d3d 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -356,6 +356,17 @@ static inline struct sym_hist *annotation__histogram(struct annotation *notes, i
 	return annotated_source__histogram(notes->src, idx);
 }
 
+static inline struct sym_hist_entry *
+annotated_source__hist_entry(struct annotated_source *src, int idx, u64 offset)
+{
+	struct sym_hist_entry *entry;
+	long key = offset << 16 | idx;
+
+	if (!hashmap__find(src->samples, key, &entry))
+		return NULL;
+	return entry;
+}
+
 static inline struct annotation *symbol__annotation(struct symbol *sym)
 {
 	return (void *)sym - symbol_conf.priv_size;
-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/4] perf annotate: Remove sym_hist.addr[] array
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
  2024-02-28  0:52 ` [PATCH 1/4] perf annotate: Add a hashmap " Namhyung Kim
  2024-02-28  0:52 ` [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap Namhyung Kim
@ 2024-02-28  0:52 ` Namhyung Kim
  2024-02-28  0:52 ` [PATCH 4/4] perf annotate: Add comments in the data structures Namhyung Kim
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  0:52 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Andi Kleen

It's not used anymore and the code is coverted to use a hash map.  Now
sym_hist has a static size, so no need to have sizeof_sym_hist in the
struct annotated_source.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate.c | 36 +++++-------------------------------
 tools/perf/util/annotate.h |  4 +---
 2 files changed, 6 insertions(+), 34 deletions(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e7859f756252..43b204815020 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -896,33 +896,10 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
 }
 
 static int annotated_source__alloc_histograms(struct annotated_source *src,
-					      size_t size, int nr_hists)
+					      int nr_hists)
 {
-	size_t sizeof_sym_hist;
-
-	/*
-	 * Add buffer of one element for zero length symbol.
-	 * When sample is taken from first instruction of
-	 * zero length symbol, perf still resolves it and
-	 * shows symbol name in perf report and allows to
-	 * annotate it.
-	 */
-	if (size == 0)
-		size = 1;
-
-	/* Check for overflow when calculating sizeof_sym_hist */
-	if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(struct sym_hist_entry))
-		return -1;
-
-	sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(struct sym_hist_entry));
-
-	/* Check for overflow in zalloc argument */
-	if (sizeof_sym_hist > SIZE_MAX / nr_hists)
-		return -1;
-
-	src->sizeof_sym_hist = sizeof_sym_hist;
 	src->nr_histograms   = nr_hists;
-	src->histograms	     = calloc(nr_hists, sizeof_sym_hist) ;
+	src->histograms	     = calloc(nr_hists, sizeof(*src->histograms));
 
 	if (src->histograms == NULL)
 		return -1;
@@ -941,7 +918,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
 	annotation__lock(notes);
 	if (notes->src != NULL) {
 		memset(notes->src->histograms, 0,
-		       notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+		       notes->src->nr_histograms * sizeof(*notes->src->histograms));
 		hashmap__clear(notes->src->samples);
 	}
 	if (notes->branch && notes->branch->cycles_hist) {
@@ -1039,9 +1016,7 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
 	}
 
 	h->nr_samples++;
-	h->addr[offset].nr_samples++;
 	h->period += sample->period;
-	h->addr[offset].period += sample->period;
 	entry->nr_samples++;
 	entry->period += sample->period;
 
@@ -1094,8 +1069,7 @@ struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists)
 
 	if (notes->src->histograms == NULL) {
 alloc_histograms:
-		annotated_source__alloc_histograms(notes->src, symbol__size(sym),
-						   nr_hists);
+		annotated_source__alloc_histograms(notes->src, nr_hists);
 	}
 
 	return notes->src;
@@ -2854,7 +2828,7 @@ void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 
-	memset(h, 0, notes->src->sizeof_sym_hist);
+	memset(h, 0, sizeof(*notes->src->histograms) * notes->src->nr_histograms);
 }
 
 void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 3362980a5d3d..4bdc70a9d376 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -242,7 +242,6 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel);
 struct sym_hist {
 	u64		      nr_samples;
 	u64		      period;
-	struct sym_hist_entry addr[];
 };
 
 struct cyc_hist {
@@ -278,7 +277,6 @@ struct cyc_hist {
  */
 struct annotated_source {
 	struct list_head	source;
-	size_t			sizeof_sym_hist;
 	struct sym_hist		*histograms;
 	struct annotation_line	**offsets;
 	struct hashmap	   	*samples;
@@ -348,7 +346,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m
 
 static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx)
 {
-	return ((void *)src->histograms) + (src->sizeof_sym_hist * idx);
+	return &src->histograms[idx];
 }
 
 static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/4] perf annotate: Add comments in the data structures
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
                   ` (2 preceding siblings ...)
  2024-02-28  0:52 ` [PATCH 3/4] perf annotate: Remove sym_hist.addr[] array Namhyung Kim
@ 2024-02-28  0:52 ` Namhyung Kim
  2024-02-28  1:20 ` [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Ian Rogers
  2024-03-04 13:58 ` Arnaldo Carvalho de Melo
  5 siblings, 0 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  0:52 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Andi Kleen

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate.h | 69 ++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 4bdc70a9d376..13cc659e508c 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -239,11 +239,42 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r
 size_t disasm__fprintf(struct list_head *head, FILE *fp);
 void symbol__calc_percent(struct symbol *sym, struct evsel *evsel);
 
+/**
+ * struct sym_hist - symbol histogram information for an event
+ *
+ * @nr_samples: Total number of samples.
+ * @period: Sum of sample periods.
+ */
 struct sym_hist {
 	u64		      nr_samples;
 	u64		      period;
 };
 
+/**
+ * struct cyc_hist - (CPU) cycle histogram for a basic block
+ *
+ * @start: Start address of current block (if known).
+ * @cycles: Sum of cycles for the longest basic block.
+ * @cycles_aggr: Total cycles for this address.
+ * @cycles_max: Max cycles for this address.
+ * @cycles_min: Min cycles for this address.
+ * @cycles_spark: History of cycles for the longest basic block.
+ * @num: Number of samples for the longest basic block.
+ * @num_aggr: Total number of samples for this address.
+ * @have_start: Whether the current branch info has a start address.
+ * @reset: Number of resets due to a different start address.
+ *
+ * If sample has branch_stack and cycles info, it can construct basic blocks
+ * between two adjacent branches.  It'd have start and end addresses but
+ * sometimes the start address may not be available.  So the cycles are
+ * accounted at the end address.  If multiple basic blocks end at the same
+ * address, it will take the longest one.
+ *
+ * The @start, @cycles, @cycles_spark and @num fields are used for the longest
+ * block only.  Other fields are used for all cases.
+ *
+ * See __symbol__account_cycles().
+ */
 struct cyc_hist {
 	u64	start;
 	u64	cycles;
@@ -258,18 +289,24 @@ struct cyc_hist {
 	u16	reset;
 };
 
-/** struct annotated_source - symbols with hits have this attached as in sannotation
+/**
+ * struct annotated_source - symbols with hits have this attached as in annotation
  *
- * @histograms: Array of addr hit histograms per event being monitored
- * nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
+ * @source: List head for annotated_line (embeded in disasm_line).
+ * @histograms: Array of symbol histograms per event to maintain the total number
+ * 		of samples and period.
+ * @nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
  * 		  we have more than a group in a evlist, where we will want
  * 		  to see each group separately, that is why symbol__annotate2()
  * 		  sets src->nr_histograms to evsel->nr_members.
- * @lines: If 'print_lines' is specified, per source code line percentages
- * @source: source parsed from a disassembler like objdump -dS
- * @cyc_hist: Average cycles per basic block
+ * @offsets: Array of annotation_line to be accessed by offset.
+ * @samples: Hash map of sym_hist_entry.  Keyed by event index and offset in symbol.
+ * @nr_entries: Number of annotated_line in the source list.
+ * @nr_asm_entries: Number of annotated_line with actual asm instruction in the
+ * 		    source list.
+ * @max_line_len: Maximum length of objdump output in an annotated_line.
  *
- * lines is allocated, percentages calculated and all sorted by percentage
+ * disasm_lines are allocated, percentages calculated and all sorted by percentage
  * when the annotation is about to be presented, so the percentages are for
  * one of the entries in the histogram array, i.e. for the event/counter being
  * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
@@ -286,6 +323,24 @@ struct annotated_source {
 	u16			max_line_len;
 };
 
+/**
+ * struct annotated_branch - basic block and IPC information for a symbol.
+ *
+ * @hit_cycles: Total executed cycles.
+ * @hit_insn: Total number of instructions executed.
+ * @total_insn: Number of instructions in the function.
+ * @cover_insn: Number of distinct, actually executed instructions.
+ * @cycles_hist: Array of cyc_hist for each instruction.
+ * @max_coverage: Maximum number of covered basic block (used for block-range).
+ *
+ * This struct is used by two different codes when the sample has branch stack
+ * and cycles information.  annotation__compute_ipc() calculates average IPC
+ * using @hit_insn / @hit_cycles.  The actual coverage can be calculated using
+ * @cover_insn / @total_insn.  The @cycles_hist can give IPC for each (longest)
+ * basic block ends at the given address.
+ * process_basic_block() calculates coverage of instructions (or basic blocks)
+ * in the function.
+ */
 struct annotated_branch {
 	u64			hit_cycles;
 	u64			hit_insn;
-- 
2.44.0.rc1.240.g4c46232300-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] perf annotate: Add a hashmap for symbol histogram
  2024-02-28  0:52 ` [PATCH 1/4] perf annotate: Add a hashmap " Namhyung Kim
@ 2024-02-28  1:19   ` Ian Rogers
  2024-02-28  5:21     ` Namhyung Kim
  0 siblings, 1 reply; 12+ messages in thread
From: Ian Rogers @ 2024-02-28  1:19 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Adrian Hunter,
	Peter Zijlstra, Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Tue, Feb 27, 2024 at 4:52 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Now symbol histogram uses an array to save per-offset sample counts.
> But it wastes a lot of memory if the symbol has a few samples only.
> Add a hashmap to save values only for actual samples.
>
> For now, it has duplicate histogram (one in the existing array and
> another in the new hash map).  Once it can convert to use the hash
> in all places, we can get rid of the array later.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/util/annotate.c | 40 +++++++++++++++++++++++++++++++++++++-
>  tools/perf/util/annotate.h |  2 ++
>  2 files changed, 41 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> index 107b264fa41e..7a70e4d35c9b 100644
> --- a/tools/perf/util/annotate.c
> +++ b/tools/perf/util/annotate.c
> @@ -38,6 +38,7 @@
>  #include "arch/common.h"
>  #include "namespaces.h"
>  #include "thread.h"
> +#include "hashmap.h"
>  #include <regex.h>
>  #include <linux/bitops.h>
>  #include <linux/kernel.h>
> @@ -863,6 +864,17 @@ bool arch__is(struct arch *arch, const char *name)
>         return !strcmp(arch->name, name);
>  }
>
> +/* symbol histogram: key = offset << 16 | evsel->core.idx */
> +static size_t sym_hist_hash(long key, void *ctx __maybe_unused)
> +{
> +       return (key >> 16) + (key & 0xffff);
> +}
> +
> +static bool sym_hist_equal(long key1, long key2, void *ctx __maybe_unused)
> +{
> +       return key1 == key2;
> +}
> +
>  static struct annotated_source *annotated_source__new(void)
>  {
>         struct annotated_source *src = zalloc(sizeof(*src));
> @@ -877,6 +889,8 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
>  {
>         if (src == NULL)
>                 return;
> +
> +       hashmap__free(src->samples);
>         zfree(&src->histograms);
>         free(src);
>  }
> @@ -909,6 +923,14 @@ static int annotated_source__alloc_histograms(struct annotated_source *src,
>         src->sizeof_sym_hist = sizeof_sym_hist;
>         src->nr_histograms   = nr_hists;
>         src->histograms      = calloc(nr_hists, sizeof_sym_hist) ;
> +
> +       if (src->histograms == NULL)
> +               return -1;
> +
> +       src->samples = hashmap__new(sym_hist_hash, sym_hist_equal, NULL);
> +       if (src->samples == NULL)
> +               zfree(&src->histograms);
> +
>         return src->histograms ? 0 : -1;
>  }
>
> @@ -920,6 +942,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
>         if (notes->src != NULL) {
>                 memset(notes->src->histograms, 0,
>                        notes->src->nr_histograms * notes->src->sizeof_sym_hist);
> +               hashmap__clear(notes->src->samples);
>         }
>         if (notes->branch && notes->branch->cycles_hist) {
>                 memset(notes->branch->cycles_hist, 0,
> @@ -983,8 +1006,10 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
>                                       struct perf_sample *sample)
>  {
>         struct symbol *sym = ms->sym;
> +       long hash_key;
>         unsigned offset;
>         struct sym_hist *h;
> +       struct sym_hist_entry *entry;
>
>         pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map__unmap_ip(ms->map, addr));
>
> @@ -1002,15 +1027,28 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
>                          __func__, __LINE__, sym->name, sym->start, addr, sym->end, sym->type == STT_FUNC);
>                 return -ENOMEM;
>         }
> +
> +       hash_key = offset << 16 | evidx;
> +       if (!hashmap__find(src->samples, hash_key, &entry)) {
> +               entry = zalloc(sizeof(*entry));
> +               if (entry == NULL)
> +                       return -ENOMEM;
> +
> +               if (hashmap__add(src->samples, hash_key, entry) < 0)
> +                       return -ENOMEM;
> +       }
> +
>         h->nr_samples++;
>         h->addr[offset].nr_samples++;
>         h->period += sample->period;
>         h->addr[offset].period += sample->period;
> +       entry->nr_samples++;
> +       entry->period += sample->period;
>
>         pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
>                   ", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
>                   sym->start, sym->name, addr, addr - sym->start, evidx,
> -                 h->addr[offset].nr_samples, h->addr[offset].period);
> +                 entry->nr_samples, entry->period);
>         return 0;
>  }
>
> diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
> index 94435607c958..a2b0c8210740 100644
> --- a/tools/perf/util/annotate.h
> +++ b/tools/perf/util/annotate.h
> @@ -12,6 +12,7 @@
>  #include "symbol_conf.h"
>  #include "mutex.h"
>  #include "spark.h"
> +#include "hashmap.h"

nit: This could just be a forward reference to keep the number of
header files down.

Thanks,
Ian

>
>  struct hist_browser_timer;
>  struct hist_entry;
> @@ -280,6 +281,7 @@ struct annotated_source {
>         size_t                  sizeof_sym_hist;
>         struct sym_hist         *histograms;
>         struct annotation_line  **offsets;
> +       struct hashmap          *samples;
>         int                     nr_histograms;
>         int                     nr_entries;
>         int                     nr_asm_entries;
> --
> 2.44.0.rc1.240.g4c46232300-goog
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
                   ` (3 preceding siblings ...)
  2024-02-28  0:52 ` [PATCH 4/4] perf annotate: Add comments in the data structures Namhyung Kim
@ 2024-02-28  1:20 ` Ian Rogers
  2024-03-04 13:58 ` Arnaldo Carvalho de Melo
  5 siblings, 0 replies; 12+ messages in thread
From: Ian Rogers @ 2024-02-28  1:20 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Adrian Hunter,
	Peter Zijlstra, Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Tue, Feb 27, 2024 at 4:52 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello,
>
> This is another series of memory optimization in perf annotate.
>
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level.  For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol.  This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
>
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol.  Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
>
>   16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> But actually it just needs 32 (header + sym_hist_entry) bytes.  Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places).  Also note that it needs separate
> histogram for each event.
>
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
>
> No functional change intended.
>
> Thanks,
> Namhyung
>
>
> Namhyung Kim (4):
>   perf annotate: Add a hashmap for symbol histogram
>   perf annotate: Calculate instruction overhead using hashmap
>   perf annotate: Remove sym_hist.addr[] array
>   perf annotate: Add comments in the data structures

Reviewed-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

>  tools/perf/ui/gtk/annotate.c |  14 ++++-
>  tools/perf/util/annotate.c   | 114 ++++++++++++++++++++++-------------
>  tools/perf/util/annotate.h   |  86 +++++++++++++++++++++++---
>  3 files changed, 158 insertions(+), 56 deletions(-)
>
> --
> 2.44.0.rc1.240.g4c46232300-goog
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] perf annotate: Add a hashmap for symbol histogram
  2024-02-28  1:19   ` Ian Rogers
@ 2024-02-28  5:21     ` Namhyung Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  5:21 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Adrian Hunter,
	Peter Zijlstra, Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Tue, Feb 27, 2024 at 5:20 PM Ian Rogers <irogers@google.com> wrote:
>
> On Tue, Feb 27, 2024 at 4:52 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Now symbol histogram uses an array to save per-offset sample counts.
> > But it wastes a lot of memory if the symbol has a few samples only.
> > Add a hashmap to save values only for actual samples.
> >
> > For now, it has duplicate histogram (one in the existing array and
> > another in the new hash map).  Once it can convert to use the hash
> > in all places, we can get rid of the array later.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
[SNIP]
> > diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
> > index 94435607c958..a2b0c8210740 100644
> > --- a/tools/perf/util/annotate.h
> > +++ b/tools/perf/util/annotate.h
> > @@ -12,6 +12,7 @@
> >  #include "symbol_conf.h"
> >  #include "mutex.h"
> >  #include "spark.h"
> > +#include "hashmap.h"
>
> nit: This could just be a forward reference to keep the number of
> header files down.

Sounds good.  Will fix in v2.

Thanks for your review!
Namhyung

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap
  2024-02-28  0:52 ` [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap Namhyung Kim
@ 2024-02-28  5:43   ` Namhyung Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-02-28  5:43 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Adrian Hunter,
	Peter Zijlstra, Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Tue, Feb 27, 2024 at 5:27 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Use annotated_source.samples hashmap instead of addr array in the
> struct sym_hist.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/ui/gtk/annotate.c | 14 +++++++++---
>  tools/perf/util/annotate.c   | 44 ++++++++++++++++++++++++------------
>  tools/perf/util/annotate.h   | 11 +++++++++
>  3 files changed, 52 insertions(+), 17 deletions(-)
>
> diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
> index 394861245fd3..93ce3d47e47e 100644
> --- a/tools/perf/ui/gtk/annotate.c
> +++ b/tools/perf/ui/gtk/annotate.c
> @@ -28,21 +28,29 @@ static const char *const col_names[] = {
>  static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
>                                  struct disasm_line *dl, int evidx)
>  {
> +       struct annotation *notes;
>         struct sym_hist *symhist;
> +       struct sym_hist_entry *entry;
>         double percent = 0.0;
>         const char *markup;
>         int ret = 0;
> +       u64 nr_samples = 0;
>
>         strcpy(buf, "");
>
>         if (dl->al.offset == (s64) -1)
>                 return 0;
>
> -       symhist = annotation__histogram(symbol__annotation(sym), evidx);
> -       if (!symbol_conf.event_group && !symhist->addr[dl->al.offset].nr_samples)
> +       notes = symbol__annotation(sym);
> +       symhist = annotation__histogram(notes, evidx);
> +       entry = annotated_source__hist_entry(notes->src, evidx, dl->al.offset);
> +       if (entry)
> +               nr_samples = entry->nr_samples;
> +
> +       if (!symbol_conf.event_group && nr_samples == 0)
>                 return 0;
>
> -       percent = 100.0 * symhist->addr[dl->al.offset].nr_samples / symhist->nr_samples;
> +       percent = 100.0 * nr_samples / symhist->nr_samples;
>
>         markup = perf_gtk__get_percent_color(percent);
>         if (markup)
> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> index 7a70e4d35c9b..e7859f756252 100644
> --- a/tools/perf/util/annotate.c
> +++ b/tools/perf/util/annotate.c
> @@ -2368,17 +2368,25 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
>         return err;
>  }
>
> -static void calc_percent(struct sym_hist *sym_hist,
> -                        struct hists *hists,
> +static void calc_percent(struct annotation *notes,
> +                        struct evsel *evsel,
>                          struct annotation_data *data,
>                          s64 offset, s64 end)
>  {
> +       struct hists *hists = evsel__hists(evsel);
> +       int evidx = evsel->core.idx;
> +       struct sym_hist *sym_hist = annotation__histogram(notes, evidx);
>         unsigned int hits = 0;
>         u64 period = 0;
>
>         while (offset < end) {
> -               hits   += sym_hist->addr[offset].nr_samples;
> -               period += sym_hist->addr[offset].period;
> +               struct sym_hist_entry *entry;
> +
> +               entry = annotated_source__hist_entry(notes->src, evidx, offset);
> +               if (entry) {
> +                       hits   += entry->nr_samples;
> +                       period += entry->period;
> +               }
>                 ++offset;
>         }
>
> @@ -2415,16 +2423,13 @@ static void annotation__calc_percent(struct annotation *notes,
>                 end  = next ? next->offset : len;
>
>                 for_each_group_evsel(evsel, leader) {
> -                       struct hists *hists = evsel__hists(evsel);
>                         struct annotation_data *data;
> -                       struct sym_hist *sym_hist;
>
>                         BUG_ON(i >= al->data_nr);
>
> -                       sym_hist = annotation__histogram(notes, evsel->core.idx);
>                         data = &al->data[i++];
>
> -                       calc_percent(sym_hist, hists, data, al->offset, end);
> +                       calc_percent(notes, evsel, data, al->offset, end);
>                 }
>         }
>  }
> @@ -2619,14 +2624,19 @@ static void print_summary(struct rb_root *root, const char *filename)
>
>  static void symbol__annotate_hits(struct symbol *sym, struct evsel *evsel)
>  {
> +       int evidx = evsel->core.idx;
>         struct annotation *notes = symbol__annotation(sym);
> -       struct sym_hist *h = annotation__histogram(notes, evsel->core.idx);
> +       struct sym_hist *h = annotation__histogram(notes, evidx);
>         u64 len = symbol__size(sym), offset;
>
> -       for (offset = 0; offset < len; ++offset)
> -               if (h->addr[offset].nr_samples != 0)
> +       for (offset = 0; offset < len; ++offset) {
> +               struct sym_hist_entry *entry;
> +
> +               entry = annotated_source__hist_entry(notes->src, evidx, offset);
> +               if (entry && entry->nr_samples != 0)
>                         printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
> -                              sym->start + offset, h->addr[offset].nr_samples);
> +                              sym->start + offset, entry->nr_samples);
> +       }
>         printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
>  }
>
> @@ -2855,8 +2865,14 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
>
>         h->nr_samples = 0;
>         for (offset = 0; offset < len; ++offset) {
> -               h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
> -               h->nr_samples += h->addr[offset].nr_samples;
> +               struct sym_hist_entry *entry;
> +
> +               entry = annotated_source__hist_entry(notes->src, evidx, offset);
> +               if (entry == NULL)
> +                       continue;
> +
> +               entry->nr_samples = entry->nr_samples * 7 / 8;
> +               h->nr_samples += entry->nr_samples;
>         }
>  }
>
> diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
> index a2b0c8210740..3362980a5d3d 100644
> --- a/tools/perf/util/annotate.h
> +++ b/tools/perf/util/annotate.h
> @@ -356,6 +356,17 @@ static inline struct sym_hist *annotation__histogram(struct annotation *notes, i
>         return annotated_source__histogram(notes->src, idx);
>  }
>
> +static inline struct sym_hist_entry *
> +annotated_source__hist_entry(struct annotated_source *src, int idx, u64 offset)
> +{
> +       struct sym_hist_entry *entry;
> +       long key = offset << 16 | idx;
> +
> +       if (!hashmap__find(src->samples, key, &entry))

Hmm.. then I've realized that it requires the header file anyway.
This code is needed by multiple places for stdio, tui, gtk output.

Thanks,
Namhyung


> +               return NULL;
> +       return entry;
> +}
> +
>  static inline struct annotation *symbol__annotation(struct symbol *sym)
>  {
>         return (void *)sym - symbol_conf.priv_size;
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram
  2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
                   ` (4 preceding siblings ...)
  2024-02-28  1:20 ` [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Ian Rogers
@ 2024-03-04 13:58 ` Arnaldo Carvalho de Melo
  2024-03-04 16:36   ` Namhyung Kim
  5 siblings, 1 reply; 12+ messages in thread
From: Arnaldo Carvalho de Melo @ 2024-03-04 13:58 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> This is another series of memory optimization in perf annotate.
 
> When perf annotate (or perf report/top with TUI) processes samples, it
> needs to save the sample period (overhead) at instruction level.  For
> now, it allocates an array to do that for the whole symbol when it
> hits any new symbol.  This comes with a lot of waste since samples can
> be very few and instructions span to multiple bytes.
 
> For example, when a sample hits symbol 'foo' that has size of 100 and
> that's the only sample falls into the symbol.  Then it needs to
> allocate a symbol histogram (sym_hist) and the its size would be
 
>   16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
 
> But actually it just needs 32 (header + sym_hist_entry) bytes.  Things
> get worse if the symbol size is bigger (and it doesn't have many
> samples in different places).  Also note that it needs separate
> histogram for each event.
 
> Let's split the sym_hist_entry and have it in a hash table so that it
> can allocate only necessary entries.
 
> No functional change intended.

I tried this before/after this series:

  $ time perf annotate --stdio2 -i perf.data.annotate

For:

  perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin

And found these odd cases:

  $ diff -u before after
  --- before	2024-02-28 15:38:25.086062812 -0300
  +++ after	2024-02-29 14:12:05.606652725 -0300
  @@ -2450826,7 +2450826,7 @@
                                ↓ je       1c62                  
                                → call     operator delete(void*)@plt
                                { return _M_dataplus._M_p; }
  -                       1c62:   mov      0x13c0(%rsp),%rdi     
  +  0.00   0.00 100.00   1c62:   mov      0x13c0(%rsp),%rdi     
                                if (_M_data() == _M_local_data())
                                  lea      0x13d0(%rsp),%rax     
                                  cmp      %rax,%rdi             
  @@ -2470648,7 +2470648,7 @@
                                  mov      %rbx,%rdi             
                                → call     operator delete(void*)@plt
                                using reference = T &;     
  -  0.00   0.00 100.00  11c65:   mov      0x8(%r12),%rax        
  +                      11c65:   mov      0x8(%r12),%rax        
                                size_t size() const { return Size; }
                                  mov      0x10(%r12),%ecx       
                                  mov      %rax,%rbp             
  $


This is a large function:

Samples: 574K of events 'anon group { cpu_core/cycles/u, cpu_core/instructions/u, cpu_core/cache-misses/u }', 4000 Hz, Event count (approx.): 614695751751, [percent: local period]$
clang::CompilerInvocation::ParseCodeGenArgs(clang::CodeGenOptions&, llvm::opt::ArgList&, clang::InputKind, clang::DiagnosticsEngine&, llvm::Triple const&, std::__cxx11::basic_string<char, std
Percent 

Probably when building the BPF skels in tools/perf/

- Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram
  2024-03-04 13:58 ` Arnaldo Carvalho de Melo
@ 2024-03-04 16:36   ` Namhyung Kim
  2024-03-04 22:53     ` Namhyung Kim
  0 siblings, 1 reply; 12+ messages in thread
From: Namhyung Kim @ 2024-03-04 16:36 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Andi Kleen

Hi Arnaldo,

On Mon, Mar 4, 2024 at 5:58 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
>
> On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> > This is another series of memory optimization in perf annotate.
>
> > When perf annotate (or perf report/top with TUI) processes samples, it
> > needs to save the sample period (overhead) at instruction level.  For
> > now, it allocates an array to do that for the whole symbol when it
> > hits any new symbol.  This comes with a lot of waste since samples can
> > be very few and instructions span to multiple bytes.
>
> > For example, when a sample hits symbol 'foo' that has size of 100 and
> > that's the only sample falls into the symbol.  Then it needs to
> > allocate a symbol histogram (sym_hist) and the its size would be
>
> >   16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
>
> > But actually it just needs 32 (header + sym_hist_entry) bytes.  Things
> > get worse if the symbol size is bigger (and it doesn't have many
> > samples in different places).  Also note that it needs separate
> > histogram for each event.
>
> > Let's split the sym_hist_entry and have it in a hash table so that it
> > can allocate only necessary entries.
>
> > No functional change intended.
>
> I tried this before/after this series:
>
>   $ time perf annotate --stdio2 -i perf.data.annotate
>
> For:
>
>   perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin
>
> And found these odd cases:
>
>   $ diff -u before after
>   --- before    2024-02-28 15:38:25.086062812 -0300
>   +++ after     2024-02-29 14:12:05.606652725 -0300
>   @@ -2450826,7 +2450826,7 @@
>                                 ↓ je       1c62
>                                 → call     operator delete(void*)@plt
>                                 { return _M_dataplus._M_p; }
>   -                       1c62:   mov      0x13c0(%rsp),%rdi
>   +  0.00   0.00 100.00   1c62:   mov      0x13c0(%rsp),%rdi
>                                 if (_M_data() == _M_local_data())
>                                   lea      0x13d0(%rsp),%rax
>                                   cmp      %rax,%rdi
>   @@ -2470648,7 +2470648,7 @@
>                                   mov      %rbx,%rdi
>                                 → call     operator delete(void*)@plt
>                                 using reference = T &;
>   -  0.00   0.00 100.00  11c65:   mov      0x8(%r12),%rax
>   +                      11c65:   mov      0x8(%r12),%rax
>                                 size_t size() const { return Size; }
>                                   mov      0x10(%r12),%ecx
>                                   mov      %rax,%rbp
>   $
>
>
> This is a large function:

Thanks for the test!  I think it missed the cast to 64-bit somewhere.
I'll check and send v2 soon.

Thanks,
Namhyung


>
> Samples: 574K of events 'anon group { cpu_core/cycles/u, cpu_core/instructions/u, cpu_core/cache-misses/u }', 4000 Hz, Event count (approx.): 614695751751, [percent: local period]$
> clang::CompilerInvocation::ParseCodeGenArgs(clang::CodeGenOptions&, llvm::opt::ArgList&, clang::InputKind, clang::DiagnosticsEngine&, llvm::Triple const&, std::__cxx11::basic_string<char, std
> Percent
>
> Probably when building the BPF skels in tools/perf/
>
> - Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram
  2024-03-04 16:36   ` Namhyung Kim
@ 2024-03-04 22:53     ` Namhyung Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Namhyung Kim @ 2024-03-04 22:53 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ian Rogers, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Andi Kleen

On Mon, Mar 4, 2024 at 8:36 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi Arnaldo,
>
> On Mon, Mar 4, 2024 at 5:58 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> >
> > On Tue, Feb 27, 2024 at 04:52:26PM -0800, Namhyung Kim wrote:
> > > This is another series of memory optimization in perf annotate.
> >
> > > When perf annotate (or perf report/top with TUI) processes samples, it
> > > needs to save the sample period (overhead) at instruction level.  For
> > > now, it allocates an array to do that for the whole symbol when it
> > > hits any new symbol.  This comes with a lot of waste since samples can
> > > be very few and instructions span to multiple bytes.
> >
> > > For example, when a sample hits symbol 'foo' that has size of 100 and
> > > that's the only sample falls into the symbol.  Then it needs to
> > > allocate a symbol histogram (sym_hist) and the its size would be
> >
> > >   16 (header) + 16 (sym_hist_entry) * 100 (symbol_size) = 1616
> >
> > > But actually it just needs 32 (header + sym_hist_entry) bytes.  Things
> > > get worse if the symbol size is bigger (and it doesn't have many
> > > samples in different places).  Also note that it needs separate
> > > histogram for each event.
> >
> > > Let's split the sym_hist_entry and have it in a hash table so that it
> > > can allocate only necessary entries.
> >
> > > No functional change intended.
> >
> > I tried this before/after this series:
> >
> >   $ time perf annotate --stdio2 -i perf.data.annotate
> >
> > For:
> >
> >   perf record -e '{cycles,instructions,cache-misses}' make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin
> >
> > And found these odd cases:
> >
> >   $ diff -u before after
> >   --- before    2024-02-28 15:38:25.086062812 -0300
> >   +++ after     2024-02-29 14:12:05.606652725 -0300
> >   @@ -2450826,7 +2450826,7 @@
> >                                 ↓ je       1c62
> >                                 → call     operator delete(void*)@plt
> >                                 { return _M_dataplus._M_p; }
> >   -                       1c62:   mov      0x13c0(%rsp),%rdi
> >   +  0.00   0.00 100.00   1c62:   mov      0x13c0(%rsp),%rdi
> >                                 if (_M_data() == _M_local_data())
> >                                   lea      0x13d0(%rsp),%rax
> >                                   cmp      %rax,%rdi
> >   @@ -2470648,7 +2470648,7 @@
> >                                   mov      %rbx,%rdi
> >                                 → call     operator delete(void*)@plt
> >                                 using reference = T &;
> >   -  0.00   0.00 100.00  11c65:   mov      0x8(%r12),%rax
> >   +                      11c65:   mov      0x8(%r12),%rax
> >                                 size_t size() const { return Size; }
> >                                   mov      0x10(%r12),%ecx
> >                                   mov      %rax,%rbp
> >   $
> >
> >
> > This is a large function:
>
> Thanks for the test!  I think it missed the cast to 64-bit somewhere.
> I'll check and send v2 soon.

Yep, the offset variable in __symbol__inc_addr_samples()
should be u64.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-03-04 22:54 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-28  0:52 [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Namhyung Kim
2024-02-28  0:52 ` [PATCH 1/4] perf annotate: Add a hashmap " Namhyung Kim
2024-02-28  1:19   ` Ian Rogers
2024-02-28  5:21     ` Namhyung Kim
2024-02-28  0:52 ` [PATCH 2/4] perf annotate: Calculate instruction overhead using hashmap Namhyung Kim
2024-02-28  5:43   ` Namhyung Kim
2024-02-28  0:52 ` [PATCH 3/4] perf annotate: Remove sym_hist.addr[] array Namhyung Kim
2024-02-28  0:52 ` [PATCH 4/4] perf annotate: Add comments in the data structures Namhyung Kim
2024-02-28  1:20 ` [PATCH 0/4] perf annotate: Improve memory usage for symbol histogram Ian Rogers
2024-03-04 13:58 ` Arnaldo Carvalho de Melo
2024-03-04 16:36   ` Namhyung Kim
2024-03-04 22:53     ` Namhyung Kim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.