linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] hist lookups
@ 2018-10-31  5:03 David Miller
  2018-10-31 12:43 ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-10-31  5:03 UTC (permalink / raw)
  To: linux-kernel; +Cc: acme


So when a cpu is overwhelmed processing samples, most of the time is
spent in the histogram code.

It seems we initialize a ~262 byte structure on the stack to do every
histogram entry lookup.

This is a side effect of how the sorting code is shared with the code
that does lookups and insertions into the histogram tree(s).

I tried to change this so that lookups use a smaller key, but it gets
ugly real fast.

I don't know when I'd be able to work more on this so I'm posting this
hoping maybe someone else can move it forward, or maybe even find a
better way to do this.

The histogram code is really the limiting factor in how well perf can
handle high sample rates.

diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index f96c005..f0265e4 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -81,6 +81,12 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 	return right->thread->tid - left->thread->tid;
 }
 
+static int64_t
+sort__thread_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return key->al->thread->tid - entry->thread->tid;
+}
+
 static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
 				       size_t size, unsigned int width)
 {
@@ -104,6 +110,7 @@ static int hist_entry__thread_filter(struct hist_entry *he, int type, const void
 struct sort_entry sort_thread = {
 	.se_header	= "    Pid:Command",
 	.se_cmp		= sort__thread_cmp,
+	.se_cmp_key	= sort__thread_cmp_key,
 	.se_snprintf	= hist_entry__thread_snprintf,
 	.se_filter	= hist_entry__thread_filter,
 	.se_width_idx	= HISTC_THREAD,
@@ -123,6 +130,13 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static int64_t
+sort__comm_cmp_key(struct hist_entry *entry,
+	       struct hist_entry_cmp_key *key)
+{
+	return strcmp(comm__str(key->comm), comm__str(entry->comm));
+}
+
+static int64_t
 sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 {
 	return strcmp(comm__str(right->comm), comm__str(left->comm));
@@ -143,6 +157,7 @@ static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_comm = {
 	.se_header	= "Command",
 	.se_cmp		= sort__comm_cmp,
+	.se_cmp_key	= sort__comm_cmp_key,
 	.se_collapse	= sort__comm_collapse,
 	.se_sort	= sort__comm_sort,
 	.se_snprintf	= hist_entry__comm_snprintf,
@@ -178,6 +193,12 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__dso_cmp(right->ms.map, left->ms.map);
 }
 
+static int64_t
+sort__dso_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return _sort__dso_cmp(key->al->map, entry->ms.map);
+}
+
 static int _hist_entry__dso_snprintf(struct map *map, char *bf,
 				     size_t size, unsigned int width)
 {
@@ -209,6 +230,7 @@ static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *a
 struct sort_entry sort_dso = {
 	.se_header	= "Shared Object",
 	.se_cmp		= sort__dso_cmp,
+	.se_cmp_key	= sort__dso_cmp_key,
 	.se_snprintf	= hist_entry__dso_snprintf,
 	.se_filter	= hist_entry__dso_filter,
 	.se_width_idx	= HISTC_DSO,
@@ -260,6 +282,25 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static int64_t
+sort__sym_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	int64_t ret;
+
+	if (!entry->ms.sym && !key->al->sym)
+		return _sort__addr_cmp(entry->ip, key->al->addr);
+
+	/*
+	 * comparing symbol address alone is not enough since it's a
+	 * relative address within a dso.
+	 */
+	ret = sort__dso_cmp_key(entry, key);
+	if (ret != 0)
+		return ret;
+
+	return _sort__sym_cmp(entry->ms.sym, key->al->sym);
+}
+
+static int64_t
 sort__sym_sort(struct hist_entry *left, struct hist_entry *right)
 {
 	if (!left->ms.sym || !right->ms.sym)
@@ -323,6 +364,7 @@ static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *a
 struct sort_entry sort_sym = {
 	.se_header	= "Symbol",
 	.se_cmp		= sort__sym_cmp,
+	.se_cmp_key	= sort__sym_cmp_key,
 	.se_sort	= sort__sym_sort,
 	.se_snprintf	= hist_entry__sym_snprintf,
 	.se_filter	= hist_entry__sym_filter,
@@ -347,6 +389,18 @@ sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(right->srcline, left->srcline);
 }
 
+static int64_t
+sort__srcline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->srcline)
+		entry->srcline = hist_entry__srcline(entry);
+	if (!key->al->srcline)
+		key->al->srcline =
+			map__srcline(key->al->map, key->al->addr, key->al->sym);
+
+	return strcmp(key->al->srcline, entry->srcline);
+}
+
 static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
@@ -359,6 +413,7 @@ static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_srcline = {
 	.se_header	= "Source:Line",
 	.se_cmp		= sort__srcline_cmp,
+	.se_cmp_key	= sort__srcline_cmp_key,
 	.se_snprintf	= hist_entry__srcline_snprintf,
 	.se_width_idx	= HISTC_SRCLINE,
 };
@@ -382,6 +437,18 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
 }
 
+static int64_t
+sort__srcline_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info->srcline_from)
+		entry->branch_info->srcline_from = addr_map_symbol__srcline(&entry->branch_info->from);
+
+	if (!key->bi->srcline_from)
+		key->bi->srcline_from = addr_map_symbol__srcline(&key->bi->from);
+
+	return strcmp(key->bi->srcline_from, entry->branch_info->srcline_from);
+}
+
 static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
@@ -391,6 +458,7 @@ static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_srcline_from = {
 	.se_header	= "From Source:Line",
 	.se_cmp		= sort__srcline_from_cmp,
+	.se_cmp_key	= sort__srcline_from_cmp_key,
 	.se_snprintf	= hist_entry__srcline_from_snprintf,
 	.se_width_idx	= HISTC_SRCLINE_FROM,
 };
@@ -409,6 +477,18 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
 }
 
+static int64_t
+sort__srcline_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info->srcline_to)
+		entry->branch_info->srcline_to = addr_map_symbol__srcline(&entry->branch_info->to);
+
+	if (!key->bi->srcline_to)
+		key->bi->srcline_to = addr_map_symbol__srcline(&key->bi->to);
+
+	return strcmp(key->bi->srcline_to, entry->branch_info->srcline_to);
+}
+
 static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
@@ -418,6 +498,7 @@ static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_srcline_to = {
 	.se_header	= "To Source:Line",
 	.se_cmp		= sort__srcline_to_cmp,
+	.se_cmp_key	= sort__srcline_to_cmp_key,
 	.se_snprintf	= hist_entry__srcline_to_snprintf,
 	.se_width_idx	= HISTC_SRCLINE_TO,
 };
@@ -426,16 +507,16 @@ struct sort_entry sort_srcline_to = {
 
 static char no_srcfile[1];
 
-static char *hist_entry__get_srcfile(struct hist_entry *e)
+static char *__hist_entry__get_srcfile(struct map *map, struct symbol *sym,
+				       u64 ip)
 {
 	char *sf, *p;
-	struct map *map = e->ms.map;
 
 	if (!map)
 		return no_srcfile;
 
-	sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
-			 e->ms.sym, false, true, true, e->ip);
+	sf = __get_srcline(map->dso, map__rip_2objdump(map, ip),
+			 sym, false, true, true, ip);
 	if (!strcmp(sf, SRCLINE_UNKNOWN))
 		return no_srcfile;
 	p = strchr(sf, ':');
@@ -447,6 +528,15 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
 	return no_srcfile;
 }
 
+static char *hist_entry__get_srcfile(struct hist_entry *e)
+{
+	return __hist_entry__get_srcfile(e->ms.map, e->ms.sym, e->ip);
+}
+
+static char *hist_entry_key__get_srcfile(struct hist_entry_cmp_key *key)
+{
+	return __hist_entry__get_srcfile(key->al->map, key->al->sym, key->al->addr);
+}
 static int64_t
 sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -458,6 +548,17 @@ sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(right->srcfile, left->srcfile);
 }
 
+static int64_t
+sort__srcfile_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->srcfile)
+		entry->srcfile = hist_entry__get_srcfile(entry);
+	if (!key->srcfile)
+		key->srcfile = hist_entry_key__get_srcfile(key);
+
+	return strcmp(key->srcfile, entry->srcfile);
+}
+
 static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
@@ -470,6 +571,7 @@ static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_srcfile = {
 	.se_header	= "Source File",
 	.se_cmp		= sort__srcfile_cmp,
+	.se_cmp_key	= sort__srcfile_cmp_key,
 	.se_snprintf	= hist_entry__srcfile_snprintf,
 	.se_width_idx	= HISTC_SRCFILE,
 };
@@ -488,6 +590,18 @@ sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(sym_r->name, sym_l->name);
 }
 
+static int64_t
+sort__parent_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct symbol *sym_l = entry->parent;
+	struct symbol *sym_r = key->sym_parent;
+
+	if (!sym_l || !sym_r)
+		return cmp_null(sym_l, sym_r);
+
+	return strcmp(sym_r->name, sym_l->name);
+}
+
 static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
 				       size_t size, unsigned int width)
 {
@@ -498,6 +612,7 @@ static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_parent = {
 	.se_header	= "Parent symbol",
 	.se_cmp		= sort__parent_cmp,
+	.se_cmp_key	= sort__parent_cmp_key,
 	.se_snprintf	= hist_entry__parent_snprintf,
 	.se_width_idx	= HISTC_PARENT,
 };
@@ -510,6 +625,12 @@ sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right)
 	return right->cpu - left->cpu;
 }
 
+static int64_t
+sort__cpu_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return key->al->cpu - entry->cpu;
+}
+
 static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -519,6 +640,7 @@ static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_cpu = {
 	.se_header      = "CPU",
 	.se_cmp	        = sort__cpu_cmp,
+	.se_cmp_key     = sort__cpu_cmp_key,
 	.se_snprintf    = hist_entry__cpu_snprintf,
 	.se_width_idx	= HISTC_CPU,
 };
@@ -548,6 +670,22 @@ sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
 				       left->cgroup_id.ino);
 }
 
+static int64_t
+sort__cgroup_id_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct namespaces *ns = thread__namespaces(key->al->thread);
+	int64_t ret;
+	u64 val;
+
+	val = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
+	ret = _sort__cgroup_dev_cmp(val, entry->cgroup_id.dev);
+	if (ret != 0)
+		return ret;
+
+	val = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
+	return _sort__cgroup_inode_cmp(val, entry->cgroup_id.ino);
+}
+
 static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
 					  char *bf, size_t size,
 					  unsigned int width __maybe_unused)
@@ -559,6 +697,7 @@ static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
 struct sort_entry sort_cgroup_id = {
 	.se_header      = "cgroup id (dev/inode)",
 	.se_cmp	        = sort__cgroup_id_cmp,
+	.se_cmp_key     = sort__cgroup_id_cmp_key,
 	.se_snprintf    = hist_entry__cgroup_id_snprintf,
 	.se_width_idx	= HISTC_CGROUP_ID,
 };
@@ -571,6 +710,12 @@ sort__socket_cmp(struct hist_entry *left, struct hist_entry *right)
 	return right->socket - left->socket;
 }
 
+static int64_t
+sort__socket_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return key->al->socket - entry->socket;
+}
+
 static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -590,6 +735,7 @@ static int hist_entry__socket_filter(struct hist_entry *he, int type, const void
 struct sort_entry sort_socket = {
 	.se_header      = "Socket",
 	.se_cmp	        = sort__socket_cmp,
+	.se_cmp_key     = sort__socket_cmp_key,
 	.se_snprintf    = hist_entry__socket_snprintf,
 	.se_filter      = hist_entry__socket_filter,
 	.se_width_idx	= HISTC_SOCKET,
@@ -597,20 +743,21 @@ struct sort_entry sort_socket = {
 
 /* --sort trace */
 
-static char *get_trace_output(struct hist_entry *he)
+static char *__get_trace_output(struct hists *hists, void *raw_data,
+				u32 raw_size)
 {
 	struct trace_seq seq;
 	struct perf_evsel *evsel;
 	struct tep_record rec = {
-		.data = he->raw_data,
-		.size = he->raw_size,
+		.data = raw_data,
+		.size = raw_size,
 	};
 
-	evsel = hists_to_evsel(he->hists);
+	evsel = hists_to_evsel(hists);
 
 	trace_seq_init(&seq);
 	if (symbol_conf.raw_trace) {
-		tep_print_fields(&seq, he->raw_data, he->raw_size,
+		tep_print_fields(&seq, raw_data, raw_size,
 				 evsel->tp_format);
 	} else {
 		tep_event_info(&seq, evsel->tp_format, &rec);
@@ -622,6 +769,16 @@ static char *get_trace_output(struct hist_entry *he)
 	return realloc(seq.buffer, seq.len + 1);
 }
 
+static char *get_trace_output(struct hist_entry *he)
+{
+	return __get_trace_output(he->hists, he->raw_data, he->raw_size);
+}
+
+static char *get_trace_output_key(struct hists *hists, struct hist_entry_cmp_key *key)
+{
+	return __get_trace_output(hists, key->sample->raw_data, key->sample->raw_size);
+}
+
 static int64_t
 sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -639,6 +796,23 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(right->trace_output, left->trace_output);
 }
 
+static int64_t
+sort__trace_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct perf_evsel *evsel;
+
+	evsel = hists_to_evsel(entry->hists);
+	if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+		return 0;
+
+	if (entry->trace_output == NULL)
+		entry->trace_output = get_trace_output(entry);
+	if (key->trace_output == NULL)
+		key->trace_output = get_trace_output_key(entry->hists, key);
+
+	return strcmp(key->trace_output, entry->trace_output);
+}
+
 static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -656,6 +830,7 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_trace = {
 	.se_header      = "Trace output",
 	.se_cmp	        = sort__trace_cmp,
+	.se_cmp_key     = sort__trace_cmp_key,
 	.se_snprintf    = hist_entry__trace_snprintf,
 	.se_width_idx	= HISTC_TRACE,
 };
@@ -672,6 +847,16 @@ sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
 			      right->branch_info->from.map);
 }
 
+static int64_t
+sort__dso_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	return _sort__dso_cmp(entry->branch_info->from.map,
+			      key->bi->from.map);
+}
+
 static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -704,6 +889,16 @@ sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
 			      right->branch_info->to.map);
 }
 
+static int64_t
+sort__dso_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	return _sort__dso_cmp(entry->branch_info->to.map,
+			      key->bi->to.map);
+}
+
 static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
 				       size_t size, unsigned int width)
 {
@@ -745,6 +940,24 @@ sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static int64_t
+sort__sym_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct addr_map_symbol *from_l;
+	struct addr_map_symbol *from_r;
+
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	from_l = &entry->branch_info->from;
+	from_r = &key->bi->from;
+
+	if (!from_l->sym && !from_r->sym)
+		return _sort__addr_cmp(from_l->addr, from_r->addr);
+
+	return _sort__sym_cmp(from_l->sym, from_r->sym);
+}
+
+static int64_t
 sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	struct addr_map_symbol *to_l, *to_r;
@@ -761,6 +974,23 @@ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__sym_cmp(to_l->sym, to_r->sym);
 }
 
+static int64_t
+sort__sym_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct addr_map_symbol *to_l, *to_r;
+
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	to_l = &entry->branch_info->to;
+	to_r = &key->bi->to;
+
+	if (!to_l->sym && !to_r->sym)
+		return _sort__addr_cmp(to_l->addr, to_r->addr);
+
+	return _sort__sym_cmp(to_l->sym, to_r->sym);
+}
+
 static int hist_entry__sym_from_snprintf(struct hist_entry *he, char *bf,
 					 size_t size, unsigned int width)
 {
@@ -814,6 +1044,7 @@ static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
 struct sort_entry sort_dso_from = {
 	.se_header	= "Source Shared Object",
 	.se_cmp		= sort__dso_from_cmp,
+	.se_cmp_key	= sort__dso_from_cmp_key,
 	.se_snprintf	= hist_entry__dso_from_snprintf,
 	.se_filter	= hist_entry__dso_from_filter,
 	.se_width_idx	= HISTC_DSO_FROM,
@@ -822,6 +1053,7 @@ struct sort_entry sort_dso_from = {
 struct sort_entry sort_dso_to = {
 	.se_header	= "Target Shared Object",
 	.se_cmp		= sort__dso_to_cmp,
+	.se_cmp_key	= sort__dso_to_cmp_key,
 	.se_snprintf	= hist_entry__dso_to_snprintf,
 	.se_filter	= hist_entry__dso_to_filter,
 	.se_width_idx	= HISTC_DSO_TO,
@@ -830,6 +1062,7 @@ struct sort_entry sort_dso_to = {
 struct sort_entry sort_sym_from = {
 	.se_header	= "Source Symbol",
 	.se_cmp		= sort__sym_from_cmp,
+	.se_cmp_key	= sort__sym_from_cmp_key,
 	.se_snprintf	= hist_entry__sym_from_snprintf,
 	.se_filter	= hist_entry__sym_from_filter,
 	.se_width_idx	= HISTC_SYMBOL_FROM,
@@ -838,6 +1071,7 @@ struct sort_entry sort_sym_from = {
 struct sort_entry sort_sym_to = {
 	.se_header	= "Target Symbol",
 	.se_cmp		= sort__sym_to_cmp,
+	.se_cmp_key	= sort__sym_to_cmp_key,
 	.se_snprintf	= hist_entry__sym_to_snprintf,
 	.se_filter	= hist_entry__sym_to_filter,
 	.se_width_idx	= HISTC_SYMBOL_TO,
@@ -856,6 +1090,19 @@ sort__mispredict_cmp(struct hist_entry *left, struct hist_entry *right)
 	return mp || p;
 }
 
+static int64_t
+sort__mispredict_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	unsigned char mp, p;
+
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	mp = entry->branch_info->flags.mispred != key->bi->flags.mispred;
+	p  = entry->branch_info->flags.predicted != key->bi->flags.predicted;
+	return mp || p;
+}
+
 static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width){
 	static const char *out = "N/A";
@@ -880,6 +1127,16 @@ sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
 		right->branch_info->flags.cycles;
 }
 
+static int64_t
+sort__cycles_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	return entry->branch_info->flags.cycles -
+		key->bi->flags.cycles;
+}
+
 static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -894,6 +1151,7 @@ static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_cycles = {
 	.se_header	= "Basic Block Cycles",
 	.se_cmp		= sort__cycles_cmp,
+	.se_cmp_key	= sort__cycles_cmp_key,
 	.se_snprintf	= hist_entry__cycles_snprintf,
 	.se_width_idx	= HISTC_CYCLES,
 };
@@ -912,6 +1170,19 @@ sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(r - l);
 }
 
+static int64_t
+sort__daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	uint64_t l = 0, r = 0;
+
+	if (entry->mem_info)
+		l = entry->mem_info->daddr.addr;
+	if (key->mem_info)
+		r = key->mem_info->daddr.addr;
+
+	return (int64_t)(r - l);
+}
+
 static int hist_entry__daddr_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -941,6 +1212,19 @@ sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(r - l);
 }
 
+static int64_t
+sort__iaddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	uint64_t l = 0, r = 0;
+
+	if (entry->mem_info)
+		l = entry->mem_info->iaddr.addr;
+	if (key->mem_info)
+		r = key->mem_info->iaddr.addr;
+
+	return (int64_t)(r - l);
+}
+
 static int hist_entry__iaddr_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -971,6 +1255,20 @@ sort__dso_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__dso_cmp(map_l, map_r);
 }
 
+static int64_t
+sort__dso_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct map *map_l = NULL;
+	struct map *map_r = NULL;
+
+	if (entry->mem_info)
+		map_l = entry->mem_info->daddr.map;
+	if (key->mem_info)
+		map_r = key->mem_info->daddr.map;
+
+	return _sort__dso_cmp(map_l, map_r);
+}
+
 static int hist_entry__dso_daddr_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1001,6 +1299,25 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
 }
 
+static int64_t
+sort__locked_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	union perf_mem_data_src data_src_l;
+	union perf_mem_data_src data_src_r;
+
+	if (entry->mem_info)
+		data_src_l = entry->mem_info->data_src;
+	else
+		data_src_l.mem_lock = PERF_MEM_LOCK_NA;
+
+	if (key->mem_info)
+		data_src_r = key->mem_info->data_src;
+	else
+		data_src_r.mem_lock = PERF_MEM_LOCK_NA;
+
+	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
+}
+
 static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1029,6 +1346,25 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
 }
 
+static int64_t
+sort__tlb_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	union perf_mem_data_src data_src_l;
+	union perf_mem_data_src data_src_r;
+
+	if (entry->mem_info)
+		data_src_l = entry->mem_info->data_src;
+	else
+		data_src_l.mem_dtlb = PERF_MEM_TLB_NA;
+
+	if (key->mem_info)
+		data_src_r = key->mem_info->data_src;
+	else
+		data_src_r.mem_dtlb = PERF_MEM_TLB_NA;
+
+	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
+}
+
 static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1057,6 +1393,25 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
 }
 
+static int64_t
+sort__lvl_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	union perf_mem_data_src data_src_l;
+	union perf_mem_data_src data_src_r;
+
+	if (entry->mem_info)
+		data_src_l = entry->mem_info->data_src;
+	else
+		data_src_l.mem_lvl = PERF_MEM_LVL_NA;
+
+	if (key->mem_info)
+		data_src_r = key->mem_info->data_src;
+	else
+		data_src_r.mem_lvl = PERF_MEM_LVL_NA;
+
+	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
+}
+
 static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1085,6 +1440,25 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
 }
 
+static int64_t
+sort__snoop_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	union perf_mem_data_src data_src_l;
+	union perf_mem_data_src data_src_r;
+
+	if (entry->mem_info)
+		data_src_l = entry->mem_info->data_src;
+	else
+		data_src_l.mem_snoop = PERF_MEM_SNOOP_NA;
+
+	if (key->mem_info)
+		data_src_r = key->mem_info->data_src;
+	else
+		data_src_r.mem_snoop = PERF_MEM_SNOOP_NA;
+
+	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
+}
+
 static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1158,6 +1532,70 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
 	return 0;
 }
 
+static int64_t
+sort__dcacheline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	u64 l, r;
+	struct map *l_map, *r_map;
+
+	if (!entry->mem_info)  return -1;
+	if (!key->mem_info) return 1;
+
+	/* group event types together */
+	if (entry->cpumode > key->al->cpumode) return -1;
+	if (entry->cpumode < key->al->cpumode) return 1;
+
+	l_map = entry->mem_info->daddr.map;
+	r_map = key->mem_info->daddr.map;
+
+	/* if both are NULL, jump to sort on al_addr instead */
+	if (!l_map && !r_map)
+		goto addr;
+
+	if (!l_map) return -1;
+	if (!r_map) return 1;
+
+	if (l_map->maj > r_map->maj) return -1;
+	if (l_map->maj < r_map->maj) return 1;
+
+	if (l_map->min > r_map->min) return -1;
+	if (l_map->min < r_map->min) return 1;
+
+	if (l_map->ino > r_map->ino) return -1;
+	if (l_map->ino < r_map->ino) return 1;
+
+	if (l_map->ino_generation > r_map->ino_generation) return -1;
+	if (l_map->ino_generation < r_map->ino_generation) return 1;
+
+	/*
+	 * Addresses with no major/minor numbers are assumed to be
+	 * anonymous in userspace.  Sort those on pid then address.
+	 *
+	 * The kernel and non-zero major/minor mapped areas are
+	 * assumed to be unity mapped.  Sort those on address.
+	 */
+
+	if ((entry->cpumode != PERF_RECORD_MISC_KERNEL) &&
+	    (!(l_map->flags & MAP_SHARED)) &&
+	    !l_map->maj && !l_map->min && !l_map->ino &&
+	    !l_map->ino_generation) {
+		/* userspace anonymous */
+
+		if (entry->thread->pid_ > key->al->thread->pid_) return -1;
+		if (entry->thread->pid_ < key->al->thread->pid_) return 1;
+	}
+
+addr:
+	/* al_addr does all the right addr - start + offset calculations */
+	l = cl_address(entry->mem_info->daddr.al_addr);
+	r = cl_address(key->mem_info->daddr.al_addr);
+
+	if (l > r) return -1;
+	if (l < r) return 1;
+
+	return 0;
+}
+
 static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
 					  size_t size, unsigned int width)
 {
@@ -1189,6 +1627,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_mispredict = {
 	.se_header	= "Branch Mispredicted",
 	.se_cmp		= sort__mispredict_cmp,
+	.se_cmp_key	= sort__mispredict_cmp_key,
 	.se_snprintf	= hist_entry__mispredict_snprintf,
 	.se_width_idx	= HISTC_MISPREDICT,
 };
@@ -1198,12 +1637,24 @@ static u64 he_weight(struct hist_entry *he)
 	return he->stat.nr_events ? he->stat.weight / he->stat.nr_events : 0;
 }
 
+static u64 key_weight(struct hist_entry_cmp_key *key)
+{
+	return key->sample->weight;
+}
+
 static int64_t
 sort__local_weight_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	return he_weight(left) - he_weight(right);
 }
 
+static int64_t
+sort__local_weight_cmp_key(struct hist_entry *entry,
+			   struct hist_entry_cmp_key *key)
+{
+	return he_weight(entry) - key_weight(key);
+}
+
 static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1213,6 +1664,7 @@ static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_local_weight = {
 	.se_header	= "Local Weight",
 	.se_cmp		= sort__local_weight_cmp,
+	.se_cmp_key	= sort__local_weight_cmp_key,
 	.se_snprintf	= hist_entry__local_weight_snprintf,
 	.se_width_idx	= HISTC_LOCAL_WEIGHT,
 };
@@ -1223,6 +1675,13 @@ sort__global_weight_cmp(struct hist_entry *left, struct hist_entry *right)
 	return left->stat.weight - right->stat.weight;
 }
 
+static int64_t
+sort__global_weight_cmp_key(struct hist_entry *entry,
+			    struct hist_entry_cmp_key *key)
+{
+	return entry->stat.weight - key->sample->weight;
+}
+
 static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
 					      size_t size, unsigned int width)
 {
@@ -1232,6 +1691,7 @@ static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_global_weight = {
 	.se_header	= "Weight",
 	.se_cmp		= sort__global_weight_cmp,
+	.se_cmp_key	= sort__global_weight_cmp_key,
 	.se_snprintf	= hist_entry__global_weight_snprintf,
 	.se_width_idx	= HISTC_GLOBAL_WEIGHT,
 };
@@ -1239,6 +1699,7 @@ struct sort_entry sort_global_weight = {
 struct sort_entry sort_mem_daddr_sym = {
 	.se_header	= "Data Symbol",
 	.se_cmp		= sort__daddr_cmp,
+	.se_cmp_key	= sort__daddr_cmp_key,
 	.se_snprintf	= hist_entry__daddr_snprintf,
 	.se_width_idx	= HISTC_MEM_DADDR_SYMBOL,
 };
@@ -1246,6 +1707,7 @@ struct sort_entry sort_mem_daddr_sym = {
 struct sort_entry sort_mem_iaddr_sym = {
 	.se_header	= "Code Symbol",
 	.se_cmp		= sort__iaddr_cmp,
+	.se_cmp_key	= sort__iaddr_cmp_key,
 	.se_snprintf	= hist_entry__iaddr_snprintf,
 	.se_width_idx	= HISTC_MEM_IADDR_SYMBOL,
 };
@@ -1253,6 +1715,7 @@ struct sort_entry sort_mem_iaddr_sym = {
 struct sort_entry sort_mem_daddr_dso = {
 	.se_header	= "Data Object",
 	.se_cmp		= sort__dso_daddr_cmp,
+	.se_cmp_key	= sort__dso_daddr_cmp_key,
 	.se_snprintf	= hist_entry__dso_daddr_snprintf,
 	.se_width_idx	= HISTC_MEM_DADDR_DSO,
 };
@@ -1260,6 +1723,7 @@ struct sort_entry sort_mem_daddr_dso = {
 struct sort_entry sort_mem_locked = {
 	.se_header	= "Locked",
 	.se_cmp		= sort__locked_cmp,
+	.se_cmp_key	= sort__locked_cmp_key,
 	.se_snprintf	= hist_entry__locked_snprintf,
 	.se_width_idx	= HISTC_MEM_LOCKED,
 };
@@ -1267,6 +1731,7 @@ struct sort_entry sort_mem_locked = {
 struct sort_entry sort_mem_tlb = {
 	.se_header	= "TLB access",
 	.se_cmp		= sort__tlb_cmp,
+	.se_cmp_key	= sort__tlb_cmp_key,
 	.se_snprintf	= hist_entry__tlb_snprintf,
 	.se_width_idx	= HISTC_MEM_TLB,
 };
@@ -1274,6 +1739,7 @@ struct sort_entry sort_mem_tlb = {
 struct sort_entry sort_mem_lvl = {
 	.se_header	= "Memory access",
 	.se_cmp		= sort__lvl_cmp,
+	.se_cmp_key	= sort__lvl_cmp_key,
 	.se_snprintf	= hist_entry__lvl_snprintf,
 	.se_width_idx	= HISTC_MEM_LVL,
 };
@@ -1281,6 +1747,7 @@ struct sort_entry sort_mem_lvl = {
 struct sort_entry sort_mem_snoop = {
 	.se_header	= "Snoop",
 	.se_cmp		= sort__snoop_cmp,
+	.se_cmp_key	= sort__snoop_cmp_key,
 	.se_snprintf	= hist_entry__snoop_snprintf,
 	.se_width_idx	= HISTC_MEM_SNOOP,
 };
@@ -1288,6 +1755,7 @@ struct sort_entry sort_mem_snoop = {
 struct sort_entry sort_mem_dcacheline = {
 	.se_header	= "Data Cacheline",
 	.se_cmp		= sort__dcacheline_cmp,
+	.se_cmp_key	= sort__dcacheline_cmp_key,
 	.se_snprintf	= hist_entry__dcacheline_snprintf,
 	.se_width_idx	= HISTC_MEM_DCACHELINE,
 };
@@ -1305,6 +1773,19 @@ sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(r - l);
 }
 
+static int64_t
+sort__phys_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	uint64_t l = 0, r = 0;
+
+	if (entry->mem_info)
+		l = entry->mem_info->daddr.phys_addr;
+	if (key->mem_info)
+		r = key->mem_info->daddr.phys_addr;
+
+	return (int64_t)(r - l);
+}
+
 static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
 					   size_t size, unsigned int width)
 {
@@ -1329,6 +1810,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_mem_phys_daddr = {
 	.se_header	= "Data Physical Address",
 	.se_cmp		= sort__phys_daddr_cmp,
+	.se_cmp_key	= sort__phys_daddr_cmp_key,
 	.se_snprintf	= hist_entry__phys_daddr_snprintf,
 	.se_width_idx	= HISTC_MEM_PHYS_DADDR,
 };
@@ -1343,6 +1825,16 @@ sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
 		right->branch_info->flags.abort;
 }
 
+static int64_t
+sort__abort_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	return entry->branch_info->flags.abort !=
+		key->bi->flags.abort;
+}
+
 static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1361,6 +1853,7 @@ static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_abort = {
 	.se_header	= "Transaction abort",
 	.se_cmp		= sort__abort_cmp,
+	.se_cmp_key	= sort__abort_cmp_key,
 	.se_snprintf	= hist_entry__abort_snprintf,
 	.se_width_idx	= HISTC_ABORT,
 };
@@ -1375,6 +1868,16 @@ sort__in_tx_cmp(struct hist_entry *left, struct hist_entry *right)
 		right->branch_info->flags.in_tx;
 }
 
+static int64_t
+sort__in_tx_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	if (!entry->branch_info || !key->bi)
+		return cmp_null(entry->branch_info, key->bi);
+
+	return entry->branch_info->flags.in_tx !=
+		key->bi->flags.in_tx;
+}
+
 static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
@@ -1393,6 +1896,7 @@ static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_in_tx = {
 	.se_header	= "Branch in transaction",
 	.se_cmp		= sort__in_tx_cmp,
+	.se_cmp_key	= sort__in_tx_cmp_key,
 	.se_snprintf	= hist_entry__in_tx_snprintf,
 	.se_width_idx	= HISTC_IN_TX,
 };
@@ -1403,6 +1907,12 @@ sort__transaction_cmp(struct hist_entry *left, struct hist_entry *right)
 	return left->transaction - right->transaction;
 }
 
+static int64_t
+sort__transaction_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return entry->transaction - key->sample->transaction;
+}
+
 static inline char *add_str(char *p, const char *str)
 {
 	strcpy(p, str);
@@ -1465,6 +1975,7 @@ static int hist_entry__transaction_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_transaction = {
 	.se_header	= "Transaction                ",
 	.se_cmp		= sort__transaction_cmp,
+	.se_cmp_key	= sort__transaction_cmp_key,
 	.se_snprintf	= hist_entry__transaction_snprintf,
 	.se_width_idx	= HISTC_TRANSACTION,
 };
@@ -1486,6 +1997,12 @@ sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__sym_size_cmp(right->ms.sym, left->ms.sym);
 }
 
+static int64_t
+sort__sym_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return _sort__sym_size_cmp(key->al->sym, entry->ms.sym);
+}
+
 static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf,
 					  size_t bf_size, unsigned int width)
 {
@@ -1504,6 +2021,7 @@ static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_sym_size = {
 	.se_header	= "Symbol size",
 	.se_cmp		= sort__sym_size_cmp,
+	.se_cmp_key	= sort__sym_size_cmp_key,
 	.se_snprintf	= hist_entry__sym_size_snprintf,
 	.se_width_idx	= HISTC_SYM_SIZE,
 };
@@ -1525,6 +2043,12 @@ sort__dso_size_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__dso_size_cmp(right->ms.map, left->ms.map);
 }
 
+static int64_t
+sort__dso_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	return _sort__dso_size_cmp(key->al->map, entry->ms.map);
+}
+
 static int _hist_entry__dso_size_snprintf(struct map *map, char *bf,
 					  size_t bf_size, unsigned int width)
 {
@@ -1544,6 +2068,7 @@ static int hist_entry__dso_size_snprintf(struct hist_entry *he, char *bf,
 struct sort_entry sort_dso_size = {
 	.se_header	= "DSO size",
 	.se_cmp		= sort__dso_size_cmp,
+	.se_cmp_key	= sort__dso_size_cmp_key,
 	.se_snprintf	= hist_entry__dso_size_snprintf,
 	.se_width_idx	= HISTC_DSO_SIZE,
 };
@@ -1693,12 +2218,13 @@ static int __sort__hpp_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 }
 
 static int64_t __sort__hpp_cmp(struct perf_hpp_fmt *fmt,
-			       struct hist_entry *a, struct hist_entry *b)
+			       struct hist_entry *entry,
+			       struct hist_entry_cmp_key *key)
 {
 	struct hpp_sort_entry *hse;
 
 	hse = container_of(fmt, struct hpp_sort_entry, hpp);
-	return hse->se->se_cmp(a, b);
+	return hse->se->se_cmp_key(entry, key);
 }
 
 static int64_t __sort__hpp_collapse(struct perf_hpp_fmt *fmt,
@@ -2089,9 +2615,37 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
 	return memcmp(a->raw_data + offset, b->raw_data + offset, size);
 }
 
+static int64_t __sort__hde_cmp_key(struct perf_hpp_fmt *fmt,
+				   struct hist_entry *a,
+				   struct hist_entry_cmp_key *key)
+{
+	struct hpp_dynamic_entry *hde;
+	struct tep_format_field *field;
+	unsigned offset, size;
+
+	hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+	field = hde->field;
+	if (field->flags & TEP_FIELD_IS_DYNAMIC) {
+		unsigned long long dyn;
+
+		tep_read_number_field(field, a->raw_data, &dyn);
+		offset = dyn & 0xffff;
+		size = (dyn >> 16) & 0xffff;
+
+		/* record max width for output */
+		if (size > hde->dynamic_len)
+			hde->dynamic_len = size;
+	} else {
+		offset = field->offset;
+		size = field->size;
+	}
+
+	return memcmp(a->raw_data + offset, key->sample->raw_data + offset, size);
+}
+
 bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
 {
-	return fmt->cmp == __sort__hde_cmp;
+	return fmt->cmp == __sort__hde_cmp_key;
 }
 
 static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
@@ -2138,7 +2692,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct tep_format_field *field,
 	hde->hpp.entry  = __sort__hde_entry;
 	hde->hpp.color  = NULL;
 
-	hde->hpp.cmp = __sort__hde_cmp;
+	hde->hpp.cmp = __sort__hde_cmp_key;
 	hde->hpp.collapse = __sort__hde_cmp;
 	hde->hpp.sort = __sort__hde_cmp;
 	hde->hpp.equal = __sort__hde_equal;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index a97cf8e..da85224 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -264,6 +264,7 @@ struct sort_entry {
 	const char *se_header;
 
 	int64_t (*se_cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*se_cmp_key)(struct hist_entry *, struct hist_entry_cmp_key *);
 	int64_t (*se_collapse)(struct hist_entry *, struct hist_entry *);
 	int64_t	(*se_sort)(struct hist_entry *, struct hist_entry *);
 	int	(*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3badd7f..78df16b 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -150,7 +150,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
 struct perf_hpp;
 struct perf_hpp_fmt;
 
-int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
 int hist_entry__transaction_len(void);
 int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
@@ -238,6 +237,18 @@ struct perf_hpp {
 	void *ptr;
 };
 
+struct hist_entry_cmp_key {
+	struct addr_location *al;
+	struct comm *comm;
+	struct branch_info *bi;
+	struct symbol *sym_parent;
+	struct perf_sample *sample;
+	struct mem_info *mem_info;
+	char *srcfile;
+	char *trace_output;
+};
+
+struct comm;
 struct perf_hpp_fmt {
 	const char *name;
 	int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
@@ -249,7 +260,8 @@ struct perf_hpp_fmt {
 	int (*entry)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 		     struct hist_entry *he);
 	int64_t (*cmp)(struct perf_hpp_fmt *fmt,
-		       struct hist_entry *a, struct hist_entry *b);
+		       struct hist_entry *entry,
+		       struct hist_entry_cmp_key *key);
 	int64_t (*collapse)(struct perf_hpp_fmt *fmt,
 			    struct hist_entry *a, struct hist_entry *b);
 	int64_t (*sort)(struct perf_hpp_fmt *fmt,
@@ -525,4 +537,8 @@ static inline int hists__scnprintf_title(struct hists *hists, char *bf, size_t s
 	return __hists__scnprintf_title(hists, bf, size, true);
 }
 
+extern unsigned long hist_lookups;
+extern unsigned long hist_hits;
+extern unsigned long hist_misses;
+
 #endif	/* __PERF_HIST_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 828cb97..a4deb5d 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -364,16 +364,49 @@ void hists__delete_entries(struct hists *hists)
 	}
 }
 
+static u8 symbol__parent_filter(const struct symbol *parent)
+{
+	if (symbol_conf.exclude_other && parent == NULL)
+		return 1 << HIST_FILTER__PARENT;
+	return 0;
+}
+
 /*
  * histogram, sorted on item, collects periods
  */
 
 static int hist_entry__init(struct hist_entry *he,
-			    struct hist_entry *template,
+			    struct hist_entry_cmp_key *key,
+			    struct hists *hists,
 			    bool sample_self,
 			    size_t callchain_size)
 {
-	*he = *template;
+	struct namespaces *ns = thread__namespaces(key->al->thread);
+
+	he->thread = key->al->thread;
+	he->comm = thread__comm(he->thread);
+	he->cgroup_id.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
+	he->cgroup_id.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
+	he->ms.map = key->al->map;
+	he->ms.sym = key->al->sym;
+	he->srcline = key->al->srcline ? strdup(key->al->srcline) : NULL;
+	he->socket	 = key->al->socket;
+	he->cpu	 = key->al->cpu;
+	he->cpumode = key->al->cpumode;
+	he->ip	 = key->al->addr;
+	he->level	 = key->al->level;
+	he->stat.nr_events = 1;
+	he->stat.period = key->sample->period;
+	he->stat.weight = key->sample->weight;
+	he->parent = key->sym_parent;
+	he->filtered = symbol__parent_filter(key->sym_parent) | key->al->filtered;
+	he->hists = hists;
+	he->branch_info = key->bi;
+	he->mem_info = key->mem_info;
+	he->transaction = key->sample->transaction;
+	he->raw_data = key->sample->raw_data;
+	he->raw_size = key->sample->raw_size;
+
 	he->callchain_size = callchain_size;
 
 	if (symbol_conf.cumulate_callchain) {
@@ -400,7 +433,7 @@ static int hist_entry__init(struct hist_entry *he,
 			return -ENOMEM;
 		}
 
-		memcpy(he->branch_info, template->branch_info,
+		memcpy(he->branch_info, key->bi,
 		       sizeof(*he->branch_info));
 
 		map__get(he->branch_info->from.map);
@@ -459,23 +492,25 @@ static struct hist_entry_ops default_ops = {
 	.free	= hist_entry__free,
 };
 
-static struct hist_entry *hist_entry__new(struct hist_entry *template,
+static struct hist_entry *hist_entry__new(struct hist_entry_cmp_key *key,
+					  struct hists *hists,
+					  struct hist_entry_ops *ops,
 					  bool sample_self)
 {
-	struct hist_entry_ops *ops = template->ops;
 	size_t callchain_size = 0;
 	struct hist_entry *he;
 	int err = 0;
 
 	if (!ops)
-		ops = template->ops = &default_ops;
+		ops = &default_ops;
 
 	if (symbol_conf.use_callchain)
 		callchain_size = sizeof(struct callchain_root);
 
 	he = ops->new(callchain_size);
 	if (he) {
-		err = hist_entry__init(he, template, sample_self, callchain_size);
+		he->ops = ops;
+		err = hist_entry__init(he, key, hists, sample_self, callchain_size);
 		if (err) {
 			ops->free(he);
 			he = NULL;
@@ -485,13 +520,6 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
 	return he;
 }
 
-static u8 symbol__parent_filter(const struct symbol *parent)
-{
-	if (symbol_conf.exclude_other && parent == NULL)
-		return 1 << HIST_FILTER__PARENT;
-	return 0;
-}
-
 static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
 {
 	if (!hist_entry__has_callchains(he) || !symbol_conf.use_callchain)
@@ -502,17 +530,43 @@ static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
 		he->hists->callchain_non_filtered_period += period;
 }
 
+static int64_t
+hist_entry__cmp(struct hist_entry *entry, struct hist_entry_cmp_key *key)
+{
+	struct hists *hists = entry->hists;
+	struct perf_hpp_fmt *fmt;
+	int64_t cmp = 0;
+
+	hists__for_each_sort_list(hists, fmt) {
+		if (perf_hpp__is_dynamic_entry(fmt) &&
+		    !perf_hpp__defined_dynamic_entry(fmt, hists))
+			continue;
+
+		cmp = fmt->cmp(fmt, entry, key);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+unsigned long hist_lookups;
+unsigned long hist_hits;
+unsigned long hist_misses;
+
 static struct hist_entry *hists__findnew_entry(struct hists *hists,
-					       struct hist_entry *entry,
-					       struct addr_location *al,
+					       struct hist_entry_cmp_key *key,
+					       struct hist_entry_ops *ops,
 					       bool sample_self)
 {
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct hist_entry *he;
 	int64_t cmp;
-	u64 period = entry->stat.period;
-	u64 weight = entry->stat.weight;
+	u64 period = key->sample->period;
+	u64 weight = key->sample->weight;
+
+	hist_lookups++;
 
 	p = &hists->entries_in->rb_node;
 
@@ -526,7 +580,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 		 * function when searching an entry regardless which sort
 		 * keys were used.
 		 */
-		cmp = hist_entry__cmp(he, entry);
+		cmp = hist_entry__cmp(he, key);
 
 		if (!cmp) {
 			if (sample_self) {
@@ -540,7 +594,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 			 * This mem info was allocated from sample__resolve_mem
 			 * and will not be used anymore.
 			 */
-			mem_info__zput(entry->mem_info);
+			mem_info__zput(key->mem_info);
 
 			/* If the map of an existing hist_entry has
 			 * become out-of-date due to an exec() or
@@ -548,10 +602,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 			 * mis-adjust symbol addresses when computing
 			 * the history counter to increment.
 			 */
-			if (he->ms.map != entry->ms.map) {
+			if (he->ms.map != key->al->map) {
 				map__put(he->ms.map);
-				he->ms.map = map__get(entry->ms.map);
+				he->ms.map = map__get(key->al->map);
 			}
+			hist_hits++;
 			goto out;
 		}
 
@@ -561,7 +616,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 			p = &(*p)->rb_right;
 	}
 
-	he = hist_entry__new(entry, sample_self);
+	hist_misses++;
+	he = hist_entry__new(key, hists, ops, sample_self);
 	if (!he)
 		return NULL;
 
@@ -573,9 +629,9 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 	rb_insert_color(&he->rb_node_in, hists->entries_in);
 out:
 	if (sample_self)
-		he_stat__add_cpumode_period(&he->stat, al->cpumode, period);
+		he_stat__add_cpumode_period(&he->stat, key->al->cpumode, period);
 	if (symbol_conf.cumulate_callchain)
-		he_stat__add_cpumode_period(he->stat_acc, al->cpumode, period);
+		he_stat__add_cpumode_period(he->stat_acc, key->al->cpumode, period);
 	return he;
 }
 
@@ -589,39 +645,19 @@ __hists__add_entry(struct hists *hists,
 		   bool sample_self,
 		   struct hist_entry_ops *ops)
 {
-	struct namespaces *ns = thread__namespaces(al->thread);
-	struct hist_entry entry = {
-		.thread	= al->thread,
-		.comm = thread__comm(al->thread),
-		.cgroup_id = {
-			.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
-			.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
-		},
-		.ms = {
-			.map	= al->map,
-			.sym	= al->sym,
-		},
-		.srcline = al->srcline ? strdup(al->srcline) : NULL,
-		.socket	 = al->socket,
-		.cpu	 = al->cpu,
-		.cpumode = al->cpumode,
-		.ip	 = al->addr,
-		.level	 = al->level,
-		.stat = {
-			.nr_events = 1,
-			.period	= sample->period,
-			.weight = sample->weight,
-		},
-		.parent = sym_parent,
-		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
-		.hists	= hists,
-		.branch_info = bi,
-		.mem_info = mi,
-		.transaction = sample->transaction,
-		.raw_data = sample->raw_data,
-		.raw_size = sample->raw_size,
-		.ops = ops,
-	}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
+	struct hist_entry_cmp_key key;
+	struct hist_entry *he;
+
+	key.al = al;
+	key.comm = thread__comm(al->thread);
+	key.bi = bi;
+	key.sym_parent = sym_parent;
+	key.sample = sample;
+	key.mem_info = mi;
+	key.srcfile = NULL;
+	key.trace_output = NULL;
+
+	he = hists__findnew_entry(hists, &key, ops, sample_self);
 
 	if (!hists->has_callchains && he && he->callchain_size != 0)
 		hists->has_callchains = true;
@@ -947,7 +983,9 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	struct perf_evsel *evsel = iter->evsel;
 	struct perf_sample *sample = iter->sample;
 	struct hist_entry **he_cache = iter->priv;
+	struct hist_entry_cmp_key key;
 	struct hist_entry *he;
+#if 0
 	struct hist_entry he_tmp = {
 		.hists = evsel__hists(evsel),
 		.cpu = al->cpu,
@@ -963,6 +1001,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 		.raw_data = sample->raw_data,
 		.raw_size = sample->raw_size,
 	};
+#endif
 	int i;
 	struct callchain_cursor cursor;
 
@@ -974,8 +1013,16 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	 * Check if there's duplicate entries in the callchain.
 	 * It's possible that it has cycles or recursive calls.
 	 */
+	key.al = al;
+	key.comm = thread__comm(al->thread);
+	key.bi = NULL;
+	key.sym_parent = iter->parent;
+	key.sample = sample;
+	key.mem_info = NULL;
+	key.srcfile = NULL;
+	key.trace_output = NULL;
 	for (i = 0; i < iter->curr; i++) {
-		if (hist_entry__cmp(he_cache[i], &he_tmp) == 0) {
+		if (hist_entry__cmp(he_cache[i], &key) == 0) {
 			/* to avoid calling callback function */
 			iter->he = NULL;
 			return 0;
@@ -1088,26 +1135,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
 }
 
 int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	struct hists *hists = left->hists;
-	struct perf_hpp_fmt *fmt;
-	int64_t cmp = 0;
-
-	hists__for_each_sort_list(hists, fmt) {
-		if (perf_hpp__is_dynamic_entry(fmt) &&
-		    !perf_hpp__defined_dynamic_entry(fmt, hists))
-			continue;
-
-		cmp = fmt->cmp(fmt, left, right);
-		if (cmp)
-			break;
-	}
-
-	return cmp;
-}
-
-int64_t
 hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 {
 	struct hists *hists = left->hists;
@@ -1312,7 +1339,11 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
 			p = &parent->rb_right;
 	}
 
-	new = hist_entry__new(he, true);
+#if 1
+	new = NULL;
+#else
+	new = hist_entry__new(he, true); /* XXX fix XXX */
+#endif
 	if (new == NULL)
 		return NULL;
 
@@ -2168,7 +2199,11 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
 			p = &(*p)->rb_right;
 	}
 
-	he = hist_entry__new(pair, true);
+#if 1
+	he = NULL;
+#else
+	he = hist_entry__new(pair, true); /* XXX fix XXX */
+#endif
 	if (he) {
 		memset(&he->stat, 0, sizeof(he->stat));
 		he->hists = hists;
@@ -2213,7 +2248,11 @@ static struct hist_entry *add_dummy_hierarchy_entry(struct hists *hists,
 			p = &parent->rb_right;
 	}
 
-	he = hist_entry__new(pair, true);
+#if 1
+	he = NULL;
+#else
+	he = hist_entry__new(pair, true); /* XXX fix XXX */
+#endif
 	if (he) {
 		rb_link_node(&he->rb_node_in, parent, p);
 		rb_insert_color(&he->rb_node_in, root);
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index fe3dfaa..a3d66e1 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -372,8 +372,15 @@ HPP_RAW_FNS(samples, nr_events)
 HPP_RAW_FNS(period, period)
 
 static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
-			    struct hist_entry *a __maybe_unused,
-			    struct hist_entry *b __maybe_unused)
+			    struct hist_entry *entry __maybe_unused,
+			    struct hist_entry_cmp_key *key __maybe_unused)
+{
+	return 0;
+}
+
+static int64_t hpp__nop_collapse(struct perf_hpp_fmt *fmt __maybe_unused,
+				 struct hist_entry *a __maybe_unused,
+				 struct hist_entry *b __maybe_unused)
 {
 	return 0;
 }
@@ -399,7 +406,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
 		.color	= hpp__color_ ## _fn,		\
 		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
-		.collapse = hpp__nop_cmp,		\
+		.collapse = hpp__nop_collapse,		\
 		.sort	= hpp__sort_ ## _fn,		\
 		.idx	= PERF_HPP__ ## _idx,		\
 		.equal	= hpp__equal,			\
@@ -413,7 +420,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
 		.color	= hpp__color_ ## _fn,		\
 		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
-		.collapse = hpp__nop_cmp,		\
+		.collapse = hpp__nop_collapse,		\
 		.sort	= hpp__sort_ ## _fn,		\
 		.idx	= PERF_HPP__ ## _idx,		\
 		.equal	= hpp__equal,			\
@@ -426,7 +433,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
 		.width	= hpp__width_fn,		\
 		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
-		.collapse = hpp__nop_cmp,		\
+		.collapse = hpp__nop_collapse,		\
 		.sort	= hpp__sort_ ## _fn,		\
 		.idx	= PERF_HPP__ ## _idx,		\
 		.equal	= hpp__equal,			\
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index f3aa9d0..190f5eb 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -1717,12 +1717,13 @@ static int c2c_se_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 }
 
 static int64_t c2c_se_cmp(struct perf_hpp_fmt *fmt,
-			  struct hist_entry *a, struct hist_entry *b)
+			  struct hist_entry *entry,
+			  struct hist_entry_cmp_key *key)
 {
 	struct c2c_fmt *c2c_fmt = container_of(fmt, struct c2c_fmt, fmt);
 	struct c2c_dimension *dim = c2c_fmt->dim;
 
-	return dim->se->se_cmp(a, b);
+	return dim->se->se_cmp_key(entry, key);
 }
 
 static int64_t c2c_se_collapse(struct perf_hpp_fmt *fmt,
@@ -1755,8 +1756,13 @@ static struct c2c_fmt *get_format(const char *name)
 	INIT_LIST_HEAD(&fmt->list);
 	INIT_LIST_HEAD(&fmt->sort_list);
 
+#if 1
+	fmt->cmp	= c2c_se_cmp;
+	fmt->sort	= dim->cmp;
+#else
 	fmt->cmp	= dim->se ? c2c_se_cmp   : dim->cmp;
 	fmt->sort	= dim->se ? c2c_se_cmp   : dim->cmp;
+#endif
 	fmt->color	= dim->se ? NULL	 : dim->color;
 	fmt->entry	= dim->se ? c2c_se_entry : dim->entry;
 	fmt->header	= c2c_header;
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 39db2ee..2684efa 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -604,8 +604,16 @@ hist_entry__cmp_compute_idx(struct hist_entry *left, struct hist_entry *right,
 
 static int64_t
 hist_entry__cmp_nop(struct perf_hpp_fmt *fmt __maybe_unused,
-		    struct hist_entry *left __maybe_unused,
-		    struct hist_entry *right __maybe_unused)
+		    struct hist_entry *entry __maybe_unused,
+		    struct hist_entry_cmp_key *key __maybe_unused)
+{
+	return 0;
+}
+
+static int64_t
+hist_entry__collapse_nop(struct perf_hpp_fmt *fmt __maybe_unused,
+			 struct hist_entry *a __maybe_unused,
+			 struct hist_entry *b __maybe_unused)
 {
 	return 0;
 }
@@ -1141,7 +1149,7 @@ static void data__hpp_register(struct data__file *d, int idx)
 	fmt->width  = hpp__width;
 	fmt->entry  = hpp__entry_global;
 	fmt->cmp    = hist_entry__cmp_nop;
-	fmt->collapse = hist_entry__cmp_nop;
+	fmt->collapse = hist_entry__collapse_nop;
 
 	/* TODO more colors */
 	switch (idx) {
@@ -1166,7 +1174,7 @@ static void data__hpp_register(struct data__file *d, int idx)
 		fmt->sort  = hist_entry__cmp_delta_abs;
 		break;
 	default:
-		fmt->sort  = hist_entry__cmp_nop;
+		fmt->sort  = hist_entry__collapse_nop;
 		break;
 	}
 
@@ -1230,7 +1238,7 @@ static int ui_init(void)
 	}
 
 	fmt->cmp      = hist_entry__cmp_nop;
-	fmt->collapse = hist_entry__cmp_nop;
+	fmt->collapse = hist_entry__collapse_nop;
 
 	switch (compute) {
 	case COMPUTE_DELTA:

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-10-31  5:03 [PATCH RFC] hist lookups David Miller
@ 2018-10-31 12:43 ` Arnaldo Carvalho de Melo
  2018-10-31 15:39   ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: Arnaldo Carvalho de Melo @ 2018-10-31 12:43 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel, Namhyung Kim, Jiri Olsa

Em Tue, Oct 30, 2018 at 10:03:28PM -0700, David Miller escreveu:
> 
> So when a cpu is overpowered processing samples, most of the time is
> spent in the histogram code.
> 
> It seems we initialize a ~262 byte structure on the stack to do every
> histogram entry lookup.
> 
> This is a side effect of how the sorting code is shared with the code
> that does lookups and insertions into the histogram tree(s).
> 
> I tried to change this so that lookups use a smaller key, but it gets
> ugly real fast.
> 
> I don't know when I'd be able to work more on this so I'm posting this
> hoping maybe someone else can move it forward, or maybe even find a
> better way to do this.

Added Namhyung to the CC list; he is on vacation right now but said
he would look into the recently raised issues when he gets back.

Thanks a lot for all the work you did in this sprint, really
appreciated, I'm processing the fixes for the fallback to kallsyms and
the other patch you submitted, will do tests and push to Ingo and
revisit this after I get ready for Vancouver ;-)

- Arnaldo
 
> The histogram code is really the limiting factor in how well perf can
> handle high sample rates.
> 
> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> index f96c005..f0265e4 100644
> --- a/tools/perf/util/sort.c
> +++ b/tools/perf/util/sort.c
> @@ -81,6 +81,12 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return right->thread->tid - left->thread->tid;
>  }
>  
> +static int64_t
> +sort__thread_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return key->al->thread->tid - entry->thread->tid;
> +}
> +
>  static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
>  				       size_t size, unsigned int width)
>  {
> @@ -104,6 +110,7 @@ static int hist_entry__thread_filter(struct hist_entry *he, int type, const void
>  struct sort_entry sort_thread = {
>  	.se_header	= "    Pid:Command",
>  	.se_cmp		= sort__thread_cmp,
> +	.se_cmp_key	= sort__thread_cmp_key,
>  	.se_snprintf	= hist_entry__thread_snprintf,
>  	.se_filter	= hist_entry__thread_filter,
>  	.se_width_idx	= HISTC_THREAD,
> @@ -123,6 +130,13 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
>  }
>  
>  static int64_t
> +sort__comm_cmp_key(struct hist_entry *entry,
> +	       struct hist_entry_cmp_key *key)
> +{
> +	return strcmp(comm__str(key->comm), comm__str(entry->comm));
> +}
> +
> +static int64_t
>  sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
>  {
>  	return strcmp(comm__str(right->comm), comm__str(left->comm));
> @@ -143,6 +157,7 @@ static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_comm = {
>  	.se_header	= "Command",
>  	.se_cmp		= sort__comm_cmp,
> +	.se_cmp_key	= sort__comm_cmp_key,
>  	.se_collapse	= sort__comm_collapse,
>  	.se_sort	= sort__comm_sort,
>  	.se_snprintf	= hist_entry__comm_snprintf,
> @@ -178,6 +193,12 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return _sort__dso_cmp(right->ms.map, left->ms.map);
>  }
>  
> +static int64_t
> +sort__dso_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return _sort__dso_cmp(key->al->map, entry->ms.map);
> +}
> +
>  static int _hist_entry__dso_snprintf(struct map *map, char *bf,
>  				     size_t size, unsigned int width)
>  {
> @@ -209,6 +230,7 @@ static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *a
>  struct sort_entry sort_dso = {
>  	.se_header	= "Shared Object",
>  	.se_cmp		= sort__dso_cmp,
> +	.se_cmp_key	= sort__dso_cmp_key,
>  	.se_snprintf	= hist_entry__dso_snprintf,
>  	.se_filter	= hist_entry__dso_filter,
>  	.se_width_idx	= HISTC_DSO,
> @@ -260,6 +282,25 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
>  }
>  
>  static int64_t
> +sort__sym_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	int64_t ret;
> +
> +	if (!entry->ms.sym && !key->al->sym)
> +		return _sort__addr_cmp(entry->ip, key->al->addr);
> +
> +	/*
> +	 * comparing symbol address alone is not enough since it's a
> +	 * relative address within a dso.
> +	 */
> +	ret = sort__dso_cmp_key(entry, key);
> +	if (ret != 0)
> +		return ret;
> +
> +	return _sort__sym_cmp(entry->ms.sym, key->al->sym);
> +}
> +
> +static int64_t
>  sort__sym_sort(struct hist_entry *left, struct hist_entry *right)
>  {
>  	if (!left->ms.sym || !right->ms.sym)
> @@ -323,6 +364,7 @@ static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *a
>  struct sort_entry sort_sym = {
>  	.se_header	= "Symbol",
>  	.se_cmp		= sort__sym_cmp,
> +	.se_cmp_key	= sort__sym_cmp_key,
>  	.se_sort	= sort__sym_sort,
>  	.se_snprintf	= hist_entry__sym_snprintf,
>  	.se_filter	= hist_entry__sym_filter,
> @@ -347,6 +389,18 @@ sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(right->srcline, left->srcline);
>  }
>  
> +static int64_t
> +sort__srcline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->srcline)
> +		entry->srcline = hist_entry__srcline(entry);
> +	if (!key->al->srcline)
> +		key->al->srcline =
> +			map__srcline(key->al->map, key->al->addr, key->al->sym);
> +
> +	return strcmp(key->al->srcline, entry->srcline);
> +}
> +
>  static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
> @@ -359,6 +413,7 @@ static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_srcline = {
>  	.se_header	= "Source:Line",
>  	.se_cmp		= sort__srcline_cmp,
> +	.se_cmp_key	= sort__srcline_cmp_key,
>  	.se_snprintf	= hist_entry__srcline_snprintf,
>  	.se_width_idx	= HISTC_SRCLINE,
>  };
> @@ -382,6 +437,18 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
>  }
>  
> +static int64_t
> +sort__srcline_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info->srcline_from)
> +		entry->branch_info->srcline_from = addr_map_symbol__srcline(&entry->branch_info->from);
> +
> +	if (!key->bi->srcline_from)
> +		key->bi->srcline_from = addr_map_symbol__srcline(&key->bi->from);
> +
> +	return strcmp(key->bi->srcline_from, entry->branch_info->srcline_from);
> +}
> +
>  static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
> @@ -391,6 +458,7 @@ static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_srcline_from = {
>  	.se_header	= "From Source:Line",
>  	.se_cmp		= sort__srcline_from_cmp,
> +	.se_cmp_key	= sort__srcline_from_cmp_key,
>  	.se_snprintf	= hist_entry__srcline_from_snprintf,
>  	.se_width_idx	= HISTC_SRCLINE_FROM,
>  };
> @@ -409,6 +477,18 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
>  }
>  
> +static int64_t
> +sort__srcline_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info->srcline_to)
> +		entry->branch_info->srcline_to = addr_map_symbol__srcline(&entry->branch_info->to);
> +
> +	if (!key->bi->srcline_to)
> +		key->bi->srcline_to = addr_map_symbol__srcline(&key->bi->to);
> +
> +	return strcmp(key->bi->srcline_to, entry->branch_info->srcline_to);
> +}
> +
>  static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
> @@ -418,6 +498,7 @@ static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_srcline_to = {
>  	.se_header	= "To Source:Line",
>  	.se_cmp		= sort__srcline_to_cmp,
> +	.se_cmp_key	= sort__srcline_to_cmp_key,
>  	.se_snprintf	= hist_entry__srcline_to_snprintf,
>  	.se_width_idx	= HISTC_SRCLINE_TO,
>  };
> @@ -426,16 +507,16 @@ struct sort_entry sort_srcline_to = {
>  
>  static char no_srcfile[1];
>  
> -static char *hist_entry__get_srcfile(struct hist_entry *e)
> +static char *__hist_entry__get_srcfile(struct map *map, struct symbol *sym,
> +				       u64 ip)
>  {
>  	char *sf, *p;
> -	struct map *map = e->ms.map;
>  
>  	if (!map)
>  		return no_srcfile;
>  
> -	sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
> -			 e->ms.sym, false, true, true, e->ip);
> +	sf = __get_srcline(map->dso, map__rip_2objdump(map, ip),
> +			 sym, false, true, true, ip);
>  	if (!strcmp(sf, SRCLINE_UNKNOWN))
>  		return no_srcfile;
>  	p = strchr(sf, ':');
> @@ -447,6 +528,16 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
>  	return no_srcfile;
>  }
>  
> +static char *hist_entry__get_srcfile(struct hist_entry *e)
> +{
> +	return __hist_entry__get_srcfile(e->ms.map, e->ms.sym, e->ip);
> +}
> +
> +static char *hist_entry_key__get_srcfile(struct hist_entry_cmp_key *key)
> +{
> +	return __hist_entry__get_srcfile(key->al->map, key->al->sym, key->al->addr);
> +}
> +
>  static int64_t
>  sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
>  {
> @@ -458,6 +548,17 @@ sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(right->srcfile, left->srcfile);
>  }
>  
> +static int64_t
> +sort__srcfile_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->srcfile)
> +		entry->srcfile = hist_entry__get_srcfile(entry);
> +	if (!key->srcfile)
> +		key->srcfile = hist_entry_key__get_srcfile(key);
> +
> +	return strcmp(key->srcfile, entry->srcfile);
> +}
> +
>  static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
>  					size_t size, unsigned int width)
>  {
> @@ -470,6 +571,7 @@ static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_srcfile = {
>  	.se_header	= "Source File",
>  	.se_cmp		= sort__srcfile_cmp,
> +	.se_cmp_key	= sort__srcfile_cmp_key,
>  	.se_snprintf	= hist_entry__srcfile_snprintf,
>  	.se_width_idx	= HISTC_SRCFILE,
>  };
> @@ -488,6 +590,18 @@ sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(sym_r->name, sym_l->name);
>  }
>  
> +static int64_t
> +sort__parent_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct symbol *sym_l = entry->parent;
> +	struct symbol *sym_r = key->sym_parent;
> +
> +	if (!sym_l || !sym_r)
> +		return cmp_null(sym_l, sym_r);
> +
> +	return strcmp(sym_r->name, sym_l->name);
> +}
> +
>  static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
>  				       size_t size, unsigned int width)
>  {
> @@ -498,6 +612,7 @@ static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_parent = {
>  	.se_header	= "Parent symbol",
>  	.se_cmp		= sort__parent_cmp,
> +	.se_cmp_key	= sort__parent_cmp_key,
>  	.se_snprintf	= hist_entry__parent_snprintf,
>  	.se_width_idx	= HISTC_PARENT,
>  };
> @@ -510,6 +625,12 @@ sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return right->cpu - left->cpu;
>  }
>  
> +static int64_t
> +sort__cpu_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return key->al->cpu - entry->cpu;
> +}
> +
>  static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -519,6 +640,7 @@ static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_cpu = {
>  	.se_header      = "CPU",
>  	.se_cmp	        = sort__cpu_cmp,
> +	.se_cmp_key     = sort__cpu_cmp_key,
>  	.se_snprintf    = hist_entry__cpu_snprintf,
>  	.se_width_idx	= HISTC_CPU,
>  };
> @@ -548,6 +670,22 @@ sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
>  				       left->cgroup_id.ino);
>  }
>  
> +static int64_t
> +sort__cgroup_id_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct namespaces *ns = thread__namespaces(key->al->thread);
> +	int64_t ret;
> +	u64 val;
> +
> +	val = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
> +	ret = _sort__cgroup_dev_cmp(val, entry->cgroup_id.dev);
> +	if (ret != 0)
> +		return ret;
> +
> +	val = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
> +	return _sort__cgroup_inode_cmp(val, entry->cgroup_id.ino);
> +}
> +
>  static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
>  					  char *bf, size_t size,
>  					  unsigned int width __maybe_unused)
> @@ -559,6 +697,7 @@ static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
>  struct sort_entry sort_cgroup_id = {
>  	.se_header      = "cgroup id (dev/inode)",
>  	.se_cmp	        = sort__cgroup_id_cmp,
> +	.se_cmp_key     = sort__cgroup_id_cmp_key,
>  	.se_snprintf    = hist_entry__cgroup_id_snprintf,
>  	.se_width_idx	= HISTC_CGROUP_ID,
>  };
> @@ -571,6 +710,12 @@ sort__socket_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return right->socket - left->socket;
>  }
>  
> +static int64_t
> +sort__socket_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return key->al->socket - entry->socket;
> +}
> +
>  static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -590,6 +735,7 @@ static int hist_entry__socket_filter(struct hist_entry *he, int type, const void
>  struct sort_entry sort_socket = {
>  	.se_header      = "Socket",
>  	.se_cmp	        = sort__socket_cmp,
> +	.se_cmp_key     = sort__socket_cmp_key,
>  	.se_snprintf    = hist_entry__socket_snprintf,
>  	.se_filter      = hist_entry__socket_filter,
>  	.se_width_idx	= HISTC_SOCKET,
> @@ -597,20 +743,21 @@ struct sort_entry sort_socket = {
>  
>  /* --sort trace */
>  
> -static char *get_trace_output(struct hist_entry *he)
> +static char *__get_trace_output(struct hists *hists, void *raw_data,
> +				u32 raw_size)
>  {
>  	struct trace_seq seq;
>  	struct perf_evsel *evsel;
>  	struct tep_record rec = {
> -		.data = he->raw_data,
> -		.size = he->raw_size,
> +		.data = raw_data,
> +		.size = raw_size,
>  	};
>  
> -	evsel = hists_to_evsel(he->hists);
> +	evsel = hists_to_evsel(hists);
>  
>  	trace_seq_init(&seq);
>  	if (symbol_conf.raw_trace) {
> -		tep_print_fields(&seq, he->raw_data, he->raw_size,
> +		tep_print_fields(&seq, raw_data, raw_size,
>  				 evsel->tp_format);
>  	} else {
>  		tep_event_info(&seq, evsel->tp_format, &rec);
> @@ -622,6 +769,16 @@ static char *get_trace_output(struct hist_entry *he)
>  	return realloc(seq.buffer, seq.len + 1);
>  }
>  
> +static char *get_trace_output(struct hist_entry *he)
> +{
> +	return __get_trace_output(he->hists, he->raw_data, he->raw_size);
> +}
> +
> +static char *get_trace_output_key(struct hists *hists, struct hist_entry_cmp_key *key)
> +{
> +	return __get_trace_output(hists, key->sample->raw_data, key->sample->raw_size);
> +}
> +
>  static int64_t
>  sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
>  {
> @@ -639,6 +796,23 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return strcmp(right->trace_output, left->trace_output);
>  }
>  
> +static int64_t
> +sort__trace_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct perf_evsel *evsel;
> +
> +	evsel = hists_to_evsel(entry->hists);
> +	if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> +		return 0;
> +
> +	if (entry->trace_output == NULL)
> +		entry->trace_output = get_trace_output(entry);
> +	if (key->trace_output == NULL)
> +		key->trace_output = get_trace_output_key(entry->hists, key);
> +
> +	return strcmp(key->trace_output, entry->trace_output);
> +}
> +
>  static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -656,6 +830,7 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_trace = {
>  	.se_header      = "Trace output",
>  	.se_cmp	        = sort__trace_cmp,
> +	.se_cmp_key     = sort__trace_cmp_key,
>  	.se_snprintf    = hist_entry__trace_snprintf,
>  	.se_width_idx	= HISTC_TRACE,
>  };
> @@ -672,6 +847,16 @@ sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
>  			      right->branch_info->from.map);
>  }
>  
> +static int64_t
> +sort__dso_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	return _sort__dso_cmp(entry->branch_info->from.map,
> +			      key->bi->from.map);
> +}
> +
>  static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -704,6 +889,16 @@ sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
>  			      right->branch_info->to.map);
>  }
>  
> +static int64_t
> +sort__dso_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	return _sort__dso_cmp(entry->branch_info->to.map,
> +			      key->bi->to.map);
> +}
> +
>  static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
>  				       size_t size, unsigned int width)
>  {
> @@ -745,6 +940,24 @@ sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
>  }
>  
>  static int64_t
> +sort__sym_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct addr_map_symbol *from_l, *from_r;
> +
> +	/* Check branch_info before taking addresses into it. */
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	from_l = &entry->branch_info->from;
> +	from_r = &key->bi->from;
> +
> +	if (!from_l->sym && !from_r->sym)
> +		return _sort__addr_cmp(from_l->addr, from_r->addr);
> +
> +	return _sort__sym_cmp(from_l->sym, from_r->sym);
> +}
> +
> +static int64_t
>  sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
>  {
>  	struct addr_map_symbol *to_l, *to_r;
> @@ -761,6 +974,23 @@ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return _sort__sym_cmp(to_l->sym, to_r->sym);
>  }
>  
> +static int64_t
> +sort__sym_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct addr_map_symbol *to_l, *to_r;
> +
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	to_l = &entry->branch_info->to;
> +	to_r = &key->bi->to;
> +
> +	if (!to_l->sym && !to_r->sym)
> +		return _sort__addr_cmp(to_l->addr, to_r->addr);
> +
> +	return _sort__sym_cmp(to_l->sym, to_r->sym);
> +}
> +
>  static int hist_entry__sym_from_snprintf(struct hist_entry *he, char *bf,
>  					 size_t size, unsigned int width)
>  {
> @@ -814,6 +1044,7 @@ static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
>  struct sort_entry sort_dso_from = {
>  	.se_header	= "Source Shared Object",
>  	.se_cmp		= sort__dso_from_cmp,
> +	.se_cmp_key	= sort__dso_from_cmp_key,
>  	.se_snprintf	= hist_entry__dso_from_snprintf,
>  	.se_filter	= hist_entry__dso_from_filter,
>  	.se_width_idx	= HISTC_DSO_FROM,
> @@ -822,6 +1053,7 @@ struct sort_entry sort_dso_from = {
>  struct sort_entry sort_dso_to = {
>  	.se_header	= "Target Shared Object",
>  	.se_cmp		= sort__dso_to_cmp,
> +	.se_cmp_key	= sort__dso_to_cmp_key,
>  	.se_snprintf	= hist_entry__dso_to_snprintf,
>  	.se_filter	= hist_entry__dso_to_filter,
>  	.se_width_idx	= HISTC_DSO_TO,
> @@ -830,6 +1062,7 @@ struct sort_entry sort_dso_to = {
>  struct sort_entry sort_sym_from = {
>  	.se_header	= "Source Symbol",
>  	.se_cmp		= sort__sym_from_cmp,
> +	.se_cmp_key	= sort__sym_from_cmp_key,
>  	.se_snprintf	= hist_entry__sym_from_snprintf,
>  	.se_filter	= hist_entry__sym_from_filter,
>  	.se_width_idx	= HISTC_SYMBOL_FROM,
> @@ -838,6 +1071,7 @@ struct sort_entry sort_sym_from = {
>  struct sort_entry sort_sym_to = {
>  	.se_header	= "Target Symbol",
>  	.se_cmp		= sort__sym_to_cmp,
> +	.se_cmp_key	= sort__sym_to_cmp_key,
>  	.se_snprintf	= hist_entry__sym_to_snprintf,
>  	.se_filter	= hist_entry__sym_to_filter,
>  	.se_width_idx	= HISTC_SYMBOL_TO,
> @@ -856,6 +1090,19 @@ sort__mispredict_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return mp || p;
>  }
>  
> +static int64_t
> +sort__mispredict_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	unsigned char mp, p;
> +
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	mp = entry->branch_info->flags.mispred != key->bi->flags.mispred;
> +	p  = entry->branch_info->flags.predicted != key->bi->flags.predicted;
> +	return mp || p;
> +}
> +
>  static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width){
>  	static const char *out = "N/A";
> @@ -880,6 +1127,16 @@ sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
>  		right->branch_info->flags.cycles;
>  }
>  
> +static int64_t
> +sort__cycles_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	return entry->branch_info->flags.cycles -
> +		key->bi->flags.cycles;
> +}
> +
>  static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -894,6 +1151,7 @@ static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_cycles = {
>  	.se_header	= "Basic Block Cycles",
>  	.se_cmp		= sort__cycles_cmp,
> +	.se_cmp_key	= sort__cycles_cmp_key,
>  	.se_snprintf	= hist_entry__cycles_snprintf,
>  	.se_width_idx	= HISTC_CYCLES,
>  };
> @@ -912,6 +1170,19 @@ sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(r - l);
>  }
>  
> +static int64_t
> +sort__daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	uint64_t l = 0, r = 0;
> +
> +	if (entry->mem_info)
> +		l = entry->mem_info->daddr.addr;
> +	if (key->mem_info)
> +		r = key->mem_info->daddr.addr;
> +
> +	return (int64_t)(r - l);
> +}
> +
>  static int hist_entry__daddr_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -941,6 +1212,19 @@ sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(r - l);
>  }
>  
> +static int64_t
> +sort__iaddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	uint64_t l = 0, r = 0;
> +
> +	if (entry->mem_info)
> +		l = entry->mem_info->iaddr.addr;
> +	if (key->mem_info)
> +		r = key->mem_info->iaddr.addr;
> +
> +	return (int64_t)(r - l);
> +}
> +
>  static int hist_entry__iaddr_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -971,6 +1255,20 @@ sort__dso_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return _sort__dso_cmp(map_l, map_r);
>  }
>  
> +static int64_t
> +sort__dso_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct map *map_l = NULL;
> +	struct map *map_r = NULL;
> +
> +	if (entry->mem_info)
> +		map_l = entry->mem_info->daddr.map;
> +	if (key->mem_info)
> +		map_r = key->mem_info->daddr.map;
> +
> +	return _sort__dso_cmp(map_l, map_r);
> +}
> +
>  static int hist_entry__dso_daddr_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1001,6 +1299,25 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
>  }
>  
> +static int64_t
> +sort__locked_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	union perf_mem_data_src data_src_l;
> +	union perf_mem_data_src data_src_r;
> +
> +	if (entry->mem_info)
> +		data_src_l = entry->mem_info->data_src;
> +	else
> +		data_src_l.mem_lock = PERF_MEM_LOCK_NA;
> +
> +	if (key->mem_info)
> +		data_src_r = key->mem_info->data_src;
> +	else
> +		data_src_r.mem_lock = PERF_MEM_LOCK_NA;
> +
> +	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
> +}
> +
>  static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1029,6 +1346,25 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
>  }
>  
> +static int64_t
> +sort__tlb_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	union perf_mem_data_src data_src_l;
> +	union perf_mem_data_src data_src_r;
> +
> +	if (entry->mem_info)
> +		data_src_l = entry->mem_info->data_src;
> +	else
> +		data_src_l.mem_dtlb = PERF_MEM_TLB_NA;
> +
> +	if (key->mem_info)
> +		data_src_r = key->mem_info->data_src;
> +	else
> +		data_src_r.mem_dtlb = PERF_MEM_TLB_NA;
> +
> +	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
> +}
> +
>  static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1057,6 +1393,25 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
>  }
>  
> +static int64_t
> +sort__lvl_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	union perf_mem_data_src data_src_l;
> +	union perf_mem_data_src data_src_r;
> +
> +	if (entry->mem_info)
> +		data_src_l = entry->mem_info->data_src;
> +	else
> +		data_src_l.mem_lvl = PERF_MEM_LVL_NA;
> +
> +	if (key->mem_info)
> +		data_src_r = key->mem_info->data_src;
> +	else
> +		data_src_r.mem_lvl = PERF_MEM_LVL_NA;
> +
> +	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
> +}
> +
>  static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1085,6 +1440,25 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
>  }
>  
> +static int64_t
> +sort__snoop_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	union perf_mem_data_src data_src_l;
> +	union perf_mem_data_src data_src_r;
> +
> +	if (entry->mem_info)
> +		data_src_l = entry->mem_info->data_src;
> +	else
> +		data_src_l.mem_snoop = PERF_MEM_SNOOP_NA;
> +
> +	if (key->mem_info)
> +		data_src_r = key->mem_info->data_src;
> +	else
> +		data_src_r.mem_snoop = PERF_MEM_SNOOP_NA;
> +
> +	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
> +}
> +
>  static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1158,6 +1532,70 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return 0;
>  }
>  
> +static int64_t
> +sort__dcacheline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	u64 l, r;
> +	struct map *l_map, *r_map;
> +
> +	if (!entry->mem_info)  return -1;
> +	if (!key->mem_info) return 1;
> +
> +	/* group event types together */
> +	if (entry->cpumode > key->al->cpumode) return -1;
> +	if (entry->cpumode < key->al->cpumode) return 1;
> +
> +	l_map = entry->mem_info->daddr.map;
> +	r_map = key->mem_info->daddr.map;
> +
> +	/* if both are NULL, jump to sort on al_addr instead */
> +	if (!l_map && !r_map)
> +		goto addr;
> +
> +	if (!l_map) return -1;
> +	if (!r_map) return 1;
> +
> +	if (l_map->maj > r_map->maj) return -1;
> +	if (l_map->maj < r_map->maj) return 1;
> +
> +	if (l_map->min > r_map->min) return -1;
> +	if (l_map->min < r_map->min) return 1;
> +
> +	if (l_map->ino > r_map->ino) return -1;
> +	if (l_map->ino < r_map->ino) return 1;
> +
> +	if (l_map->ino_generation > r_map->ino_generation) return -1;
> +	if (l_map->ino_generation < r_map->ino_generation) return 1;
> +
> +	/*
> +	 * Addresses with no major/minor numbers are assumed to be
> +	 * anonymous in userspace.  Sort those on pid then address.
> +	 *
> +	 * The kernel and non-zero major/minor mapped areas are
> +	 * assumed to be unity mapped.  Sort those on address.
> +	 */
> +
> +	if ((entry->cpumode != PERF_RECORD_MISC_KERNEL) &&
> +	    (!(l_map->flags & MAP_SHARED)) &&
> +	    !l_map->maj && !l_map->min && !l_map->ino &&
> +	    !l_map->ino_generation) {
> +		/* userspace anonymous */
> +
> +		if (entry->thread->pid_ > key->al->thread->pid_) return -1;
> +		if (entry->thread->pid_ < key->al->thread->pid_) return 1;
> +	}
> +
> +addr:
> +	/* al_addr does all the right addr - start + offset calculations */
> +	l = cl_address(entry->mem_info->daddr.al_addr);
> +	r = cl_address(key->mem_info->daddr.al_addr);
> +
> +	if (l > r) return -1;
> +	if (l < r) return 1;
> +
> +	return 0;
> +}
> +
>  static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
>  					  size_t size, unsigned int width)
>  {
> @@ -1189,6 +1627,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_mispredict = {
>  	.se_header	= "Branch Mispredicted",
>  	.se_cmp		= sort__mispredict_cmp,
> +	.se_cmp_key	= sort__mispredict_cmp_key,
>  	.se_snprintf	= hist_entry__mispredict_snprintf,
>  	.se_width_idx	= HISTC_MISPREDICT,
>  };
> @@ -1198,12 +1637,24 @@ static u64 he_weight(struct hist_entry *he)
>  	return he->stat.nr_events ? he->stat.weight / he->stat.nr_events : 0;
>  }
>  
> +static u64 key_weight(struct hist_entry_cmp_key *key)
> +{
> +	return key->sample->weight;
> +}
> +
>  static int64_t
>  sort__local_weight_cmp(struct hist_entry *left, struct hist_entry *right)
>  {
>  	return he_weight(left) - he_weight(right);
>  }
>  
> +static int64_t
> +sort__local_weight_cmp_key(struct hist_entry *entry,
> +			   struct hist_entry_cmp_key *key)
> +{
> +	return he_weight(entry) - key_weight(key);
> +}
> +
>  static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1213,6 +1664,7 @@ static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_local_weight = {
>  	.se_header	= "Local Weight",
>  	.se_cmp		= sort__local_weight_cmp,
> +	.se_cmp_key	= sort__local_weight_cmp_key,
>  	.se_snprintf	= hist_entry__local_weight_snprintf,
>  	.se_width_idx	= HISTC_LOCAL_WEIGHT,
>  };
> @@ -1223,6 +1675,13 @@ sort__global_weight_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return left->stat.weight - right->stat.weight;
>  }
>  
> +static int64_t
> +sort__global_weight_cmp_key(struct hist_entry *entry,
> +			    struct hist_entry_cmp_key *key)
> +{
> +	return entry->stat.weight - key->sample->weight;
> +}
> +
>  static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
>  					      size_t size, unsigned int width)
>  {
> @@ -1232,6 +1691,7 @@ static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_global_weight = {
>  	.se_header	= "Weight",
>  	.se_cmp		= sort__global_weight_cmp,
> +	.se_cmp_key	= sort__global_weight_cmp_key,
>  	.se_snprintf	= hist_entry__global_weight_snprintf,
>  	.se_width_idx	= HISTC_GLOBAL_WEIGHT,
>  };
> @@ -1239,6 +1699,7 @@ struct sort_entry sort_global_weight = {
>  struct sort_entry sort_mem_daddr_sym = {
>  	.se_header	= "Data Symbol",
>  	.se_cmp		= sort__daddr_cmp,
> +	.se_cmp_key	= sort__daddr_cmp_key,
>  	.se_snprintf	= hist_entry__daddr_snprintf,
>  	.se_width_idx	= HISTC_MEM_DADDR_SYMBOL,
>  };
> @@ -1246,6 +1707,7 @@ struct sort_entry sort_mem_daddr_sym = {
>  struct sort_entry sort_mem_iaddr_sym = {
>  	.se_header	= "Code Symbol",
>  	.se_cmp		= sort__iaddr_cmp,
> +	.se_cmp_key	= sort__iaddr_cmp_key,
>  	.se_snprintf	= hist_entry__iaddr_snprintf,
>  	.se_width_idx	= HISTC_MEM_IADDR_SYMBOL,
>  };
> @@ -1253,6 +1715,7 @@ struct sort_entry sort_mem_iaddr_sym = {
>  struct sort_entry sort_mem_daddr_dso = {
>  	.se_header	= "Data Object",
>  	.se_cmp		= sort__dso_daddr_cmp,
> +	.se_cmp_key	= sort__dso_daddr_cmp_key,
>  	.se_snprintf	= hist_entry__dso_daddr_snprintf,
>  	.se_width_idx	= HISTC_MEM_DADDR_DSO,
>  };
> @@ -1260,6 +1723,7 @@ struct sort_entry sort_mem_daddr_dso = {
>  struct sort_entry sort_mem_locked = {
>  	.se_header	= "Locked",
>  	.se_cmp		= sort__locked_cmp,
> +	.se_cmp_key	= sort__locked_cmp_key,
>  	.se_snprintf	= hist_entry__locked_snprintf,
>  	.se_width_idx	= HISTC_MEM_LOCKED,
>  };
> @@ -1267,6 +1731,7 @@ struct sort_entry sort_mem_locked = {
>  struct sort_entry sort_mem_tlb = {
>  	.se_header	= "TLB access",
>  	.se_cmp		= sort__tlb_cmp,
> +	.se_cmp_key	= sort__tlb_cmp_key,
>  	.se_snprintf	= hist_entry__tlb_snprintf,
>  	.se_width_idx	= HISTC_MEM_TLB,
>  };
> @@ -1274,6 +1739,7 @@ struct sort_entry sort_mem_tlb = {
>  struct sort_entry sort_mem_lvl = {
>  	.se_header	= "Memory access",
>  	.se_cmp		= sort__lvl_cmp,
> +	.se_cmp_key	= sort__lvl_cmp_key,
>  	.se_snprintf	= hist_entry__lvl_snprintf,
>  	.se_width_idx	= HISTC_MEM_LVL,
>  };
> @@ -1281,6 +1747,7 @@ struct sort_entry sort_mem_lvl = {
>  struct sort_entry sort_mem_snoop = {
>  	.se_header	= "Snoop",
>  	.se_cmp		= sort__snoop_cmp,
> +	.se_cmp_key	= sort__snoop_cmp_key,
>  	.se_snprintf	= hist_entry__snoop_snprintf,
>  	.se_width_idx	= HISTC_MEM_SNOOP,
>  };
> @@ -1288,6 +1755,7 @@ struct sort_entry sort_mem_snoop = {
>  struct sort_entry sort_mem_dcacheline = {
>  	.se_header	= "Data Cacheline",
>  	.se_cmp		= sort__dcacheline_cmp,
> +	.se_cmp_key	= sort__dcacheline_cmp_key,
>  	.se_snprintf	= hist_entry__dcacheline_snprintf,
>  	.se_width_idx	= HISTC_MEM_DCACHELINE,
>  };
> @@ -1305,6 +1773,19 @@ sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return (int64_t)(r - l);
>  }
>  
> +static int64_t
> +sort__phys_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	uint64_t l = 0, r = 0;
> +
> +	if (entry->mem_info)
> +		l = entry->mem_info->daddr.phys_addr;
> +	if (key->mem_info)
> +		r = key->mem_info->daddr.phys_addr;
> +
> +	return (int64_t)(r - l);
> +}
> +
>  static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
>  					   size_t size, unsigned int width)
>  {
> @@ -1329,6 +1810,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_mem_phys_daddr = {
>  	.se_header	= "Data Physical Address",
>  	.se_cmp		= sort__phys_daddr_cmp,
> +	.se_cmp_key	= sort__phys_daddr_cmp_key,
>  	.se_snprintf	= hist_entry__phys_daddr_snprintf,
>  	.se_width_idx	= HISTC_MEM_PHYS_DADDR,
>  };
> @@ -1343,6 +1825,16 @@ sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
>  		right->branch_info->flags.abort;
>  }
>  
> +static int64_t
> +sort__abort_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	return entry->branch_info->flags.abort !=
> +		key->bi->flags.abort;
> +}
> +
>  static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1361,6 +1853,7 @@ static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_abort = {
>  	.se_header	= "Transaction abort",
>  	.se_cmp		= sort__abort_cmp,
> +	.se_cmp_key	= sort__abort_cmp_key,
>  	.se_snprintf	= hist_entry__abort_snprintf,
>  	.se_width_idx	= HISTC_ABORT,
>  };
> @@ -1375,6 +1868,16 @@ sort__in_tx_cmp(struct hist_entry *left, struct hist_entry *right)
>  		right->branch_info->flags.in_tx;
>  }
>  
> +static int64_t
> +sort__in_tx_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	if (!entry->branch_info || !key->bi)
> +		return cmp_null(entry->branch_info, key->bi);
> +
> +	return entry->branch_info->flags.in_tx !=
> +		key->bi->flags.in_tx;
> +}
> +
>  static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
>  				    size_t size, unsigned int width)
>  {
> @@ -1393,6 +1896,7 @@ static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_in_tx = {
>  	.se_header	= "Branch in transaction",
>  	.se_cmp		= sort__in_tx_cmp,
> +	.se_cmp_key	= sort__in_tx_cmp_key,
>  	.se_snprintf	= hist_entry__in_tx_snprintf,
>  	.se_width_idx	= HISTC_IN_TX,
>  };
> @@ -1403,6 +1907,12 @@ sort__transaction_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return left->transaction - right->transaction;
>  }
>  
> +static int64_t
> +sort__transaction_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return entry->transaction - key->sample->transaction;
> +}
> +
>  static inline char *add_str(char *p, const char *str)
>  {
>  	strcpy(p, str);
> @@ -1465,6 +1975,7 @@ static int hist_entry__transaction_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_transaction = {
>  	.se_header	= "Transaction                ",
>  	.se_cmp		= sort__transaction_cmp,
> +	.se_cmp_key	= sort__transaction_cmp_key,
>  	.se_snprintf	= hist_entry__transaction_snprintf,
>  	.se_width_idx	= HISTC_TRANSACTION,
>  };
> @@ -1486,6 +1997,12 @@ sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return _sort__sym_size_cmp(right->ms.sym, left->ms.sym);
>  }
>  
> +static int64_t
> +sort__sym_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return _sort__sym_size_cmp(key->al->sym, entry->ms.sym);
> +}
> +
>  static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf,
>  					  size_t bf_size, unsigned int width)
>  {
> @@ -1504,6 +2021,7 @@ static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_sym_size = {
>  	.se_header	= "Symbol size",
>  	.se_cmp		= sort__sym_size_cmp,
> +	.se_cmp_key	= sort__sym_size_cmp_key,
>  	.se_snprintf	= hist_entry__sym_size_snprintf,
>  	.se_width_idx	= HISTC_SYM_SIZE,
>  };
> @@ -1525,6 +2043,12 @@ sort__dso_size_cmp(struct hist_entry *left, struct hist_entry *right)
>  	return _sort__dso_size_cmp(right->ms.map, left->ms.map);
>  }
>  
> +static int64_t
> +sort__dso_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	return _sort__dso_size_cmp(key->al->map, entry->ms.map);
> +}
> +
>  static int _hist_entry__dso_size_snprintf(struct map *map, char *bf,
>  					  size_t bf_size, unsigned int width)
>  {
> @@ -1544,6 +2068,7 @@ static int hist_entry__dso_size_snprintf(struct hist_entry *he, char *bf,
>  struct sort_entry sort_dso_size = {
>  	.se_header	= "DSO size",
>  	.se_cmp		= sort__dso_size_cmp,
> +	.se_cmp_key	= sort__dso_size_cmp_key,
>  	.se_snprintf	= hist_entry__dso_size_snprintf,
>  	.se_width_idx	= HISTC_DSO_SIZE,
>  };
> @@ -1693,12 +2218,13 @@ static int __sort__hpp_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
>  }
>  
>  static int64_t __sort__hpp_cmp(struct perf_hpp_fmt *fmt,
> -			       struct hist_entry *a, struct hist_entry *b)
> +			       struct hist_entry *entry,
> +			       struct hist_entry_cmp_key *key)
>  {
>  	struct hpp_sort_entry *hse;
>  
>  	hse = container_of(fmt, struct hpp_sort_entry, hpp);
> -	return hse->se->se_cmp(a, b);
> +	return hse->se->se_cmp_key(entry, key);
>  }
>  
>  static int64_t __sort__hpp_collapse(struct perf_hpp_fmt *fmt,
> @@ -2089,9 +2615,37 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
>  	return memcmp(a->raw_data + offset, b->raw_data + offset, size);
>  }
>  
> +static int64_t __sort__hde_cmp_key(struct perf_hpp_fmt *fmt,
> +				   struct hist_entry *a,
> +				   struct hist_entry_cmp_key *key)
> +{
> +	struct hpp_dynamic_entry *hde;
> +	struct tep_format_field *field;
> +	unsigned offset, size;
> +
> +	hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
> +	field = hde->field;
> +	if (field->flags & TEP_FIELD_IS_DYNAMIC) {
> +		unsigned long long dyn;
> +
> +		tep_read_number_field(field, a->raw_data, &dyn);
> +		offset = dyn & 0xffff;
> +		size = (dyn >> 16) & 0xffff;
> +
> +		/* record max width for output */
> +		if (size > hde->dynamic_len)
> +			hde->dynamic_len = size;
> +	} else {
> +		offset = field->offset;
> +		size = field->size;
> +	}
> +
> +	return memcmp(a->raw_data + offset, key->sample->raw_data + offset, size);
> +}
> +
>  bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
>  {
> -	return fmt->cmp == __sort__hde_cmp;
> +	return fmt->cmp == __sort__hde_cmp_key;
>  }
>  
>  static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
> @@ -2138,7 +2692,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct tep_format_field *field,
>  	hde->hpp.entry  = __sort__hde_entry;
>  	hde->hpp.color  = NULL;
>  
> -	hde->hpp.cmp = __sort__hde_cmp;
> +	hde->hpp.cmp = __sort__hde_cmp_key;
>  	hde->hpp.collapse = __sort__hde_cmp;
>  	hde->hpp.sort = __sort__hde_cmp;
>  	hde->hpp.equal = __sort__hde_equal;
> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> index a97cf8e..da85224 100644
> --- a/tools/perf/util/sort.h
> +++ b/tools/perf/util/sort.h
> @@ -264,6 +264,7 @@ struct sort_entry {
>  	const char *se_header;
>  
>  	int64_t (*se_cmp)(struct hist_entry *, struct hist_entry *);
> +	int64_t (*se_cmp_key)(struct hist_entry *, struct hist_entry_cmp_key *);
>  	int64_t (*se_collapse)(struct hist_entry *, struct hist_entry *);
>  	int64_t	(*se_sort)(struct hist_entry *, struct hist_entry *);
>  	int	(*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> index 3badd7f..78df16b 100644
> --- a/tools/perf/util/hist.h
> +++ b/tools/perf/util/hist.h
> @@ -150,7 +150,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
>  struct perf_hpp;
>  struct perf_hpp_fmt;
>  
> -int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
>  int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
>  int hist_entry__transaction_len(void);
>  int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
> @@ -238,6 +237,18 @@ struct perf_hpp {
>  	void *ptr;
>  };
>  
> +struct hist_entry_cmp_key {
> +	struct addr_location *al;
> +	struct comm *comm;
> +	struct branch_info *bi;
> +	struct symbol *sym_parent;
> +	struct perf_sample *sample;
> +	struct mem_info *mem_info;
> +	char *srcfile;
> +	char *trace_output;
> +};
> +
> +struct comm;
>  struct perf_hpp_fmt {
>  	const char *name;
>  	int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
> @@ -249,7 +260,8 @@ struct perf_hpp_fmt {
>  	int (*entry)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
>  		     struct hist_entry *he);
>  	int64_t (*cmp)(struct perf_hpp_fmt *fmt,
> -		       struct hist_entry *a, struct hist_entry *b);
> +		       struct hist_entry *entry,
> +		       struct hist_entry_cmp_key *key);
>  	int64_t (*collapse)(struct perf_hpp_fmt *fmt,
>  			    struct hist_entry *a, struct hist_entry *b);
>  	int64_t (*sort)(struct perf_hpp_fmt *fmt,
> @@ -525,4 +537,8 @@ static inline int hists__scnprintf_title(struct hists *hists, char *bf, size_t s
>  	return __hists__scnprintf_title(hists, bf, size, true);
>  }
>  
> +extern unsigned long hist_lookups;
> +extern unsigned long hist_hits;
> +extern unsigned long hist_misses;
> +
>  #endif	/* __PERF_HIST_H */
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index 828cb97..a4deb5d 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -364,16 +364,49 @@ void hists__delete_entries(struct hists *hists)
>  	}
>  }
>  
> +static u8 symbol__parent_filter(const struct symbol *parent)
> +{
> +	if (symbol_conf.exclude_other && parent == NULL)
> +		return 1 << HIST_FILTER__PARENT;
> +	return 0;
> +}
> +
>  /*
>   * histogram, sorted on item, collects periods
>   */
>  
>  static int hist_entry__init(struct hist_entry *he,
> -			    struct hist_entry *template,
> +			    struct hist_entry_cmp_key *key,
> +			    struct hists *hists,
>  			    bool sample_self,
>  			    size_t callchain_size)
>  {
> -	*he = *template;
> +	struct namespaces *ns = thread__namespaces(key->al->thread);
> +
> +	he->thread = key->al->thread;
> +	he->comm = thread__comm(he->thread);
> +	he->cgroup_id.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
> +	he->cgroup_id.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
> +	he->ms.map = key->al->map;
> +	he->ms.sym = key->al->sym;
> +	he->srcline = key->al->srcline ? strdup(key->al->srcline) : NULL;
> +	he->socket	 = key->al->socket;
> +	he->cpu	 = key->al->cpu;
> +	he->cpumode = key->al->cpumode;
> +	he->ip	 = key->al->addr;
> +	he->level	 = key->al->level;
> +	he->stat.nr_events = 1;
> +	he->stat.period = key->sample->period;
> +	he->stat.weight = key->sample->weight;
> +	he->parent = key->sym_parent;
> +	he->filtered = symbol__parent_filter(key->sym_parent) | key->al->filtered;
> +	he->hists = hists;
> +	he->branch_info = key->bi;
> +	he->mem_info = key->mem_info;
> +	he->transaction = key->sample->transaction;
> +	he->raw_data = key->sample->raw_data;
> +	he->raw_size = key->sample->raw_size;
> +
>  	he->callchain_size = callchain_size;
>  
>  	if (symbol_conf.cumulate_callchain) {
> @@ -400,7 +433,7 @@ static int hist_entry__init(struct hist_entry *he,
>  			return -ENOMEM;
>  		}
>  
> -		memcpy(he->branch_info, template->branch_info,
> +		memcpy(he->branch_info, key->bi,
>  		       sizeof(*he->branch_info));
>  
>  		map__get(he->branch_info->from.map);
> @@ -459,23 +492,25 @@ static struct hist_entry_ops default_ops = {
>  	.free	= hist_entry__free,
>  };
>  
> -static struct hist_entry *hist_entry__new(struct hist_entry *template,
> +static struct hist_entry *hist_entry__new(struct hist_entry_cmp_key *key,
> +					  struct hists *hists,
> +					  struct hist_entry_ops *ops,
>  					  bool sample_self)
>  {
> -	struct hist_entry_ops *ops = template->ops;
>  	size_t callchain_size = 0;
>  	struct hist_entry *he;
>  	int err = 0;
>  
>  	if (!ops)
> -		ops = template->ops = &default_ops;
> +		ops = &default_ops;
>  
>  	if (symbol_conf.use_callchain)
>  		callchain_size = sizeof(struct callchain_root);
>  
>  	he = ops->new(callchain_size);
>  	if (he) {
> -		err = hist_entry__init(he, template, sample_self, callchain_size);
> +		he->ops = ops;
> +		err = hist_entry__init(he, key, hists, sample_self, callchain_size);
>  		if (err) {
>  			ops->free(he);
>  			he = NULL;
> @@ -485,13 +520,6 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
>  	return he;
>  }
>  
> -static u8 symbol__parent_filter(const struct symbol *parent)
> -{
> -	if (symbol_conf.exclude_other && parent == NULL)
> -		return 1 << HIST_FILTER__PARENT;
> -	return 0;
> -}
> -
>  static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
>  {
>  	if (!hist_entry__has_callchains(he) || !symbol_conf.use_callchain)
> @@ -502,17 +530,43 @@ static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
>  		he->hists->callchain_non_filtered_period += period;
>  }
>  
> +static int64_t
> +hist_entry__cmp(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> +{
> +	struct hists *hists = entry->hists;
> +	struct perf_hpp_fmt *fmt;
> +	int64_t cmp = 0;
> +
> +	hists__for_each_sort_list(hists, fmt) {
> +		if (perf_hpp__is_dynamic_entry(fmt) &&
> +		    !perf_hpp__defined_dynamic_entry(fmt, hists))
> +			continue;
> +
> +		cmp = fmt->cmp(fmt, entry, key);
> +		if (cmp)
> +			break;
> +	}
> +
> +	return cmp;
> +}
> +
> +unsigned long hist_lookups;
> +unsigned long hist_hits;
> +unsigned long hist_misses;
> +
>  static struct hist_entry *hists__findnew_entry(struct hists *hists,
> -					       struct hist_entry *entry,
> -					       struct addr_location *al,
> +					       struct hist_entry_cmp_key *key,
> +					       struct hist_entry_ops *ops,
>  					       bool sample_self)
>  {
>  	struct rb_node **p;
>  	struct rb_node *parent = NULL;
>  	struct hist_entry *he;
>  	int64_t cmp;
> -	u64 period = entry->stat.period;
> -	u64 weight = entry->stat.weight;
> +	u64 period = key->sample->period;
> +	u64 weight = key->sample->weight;
> +
> +	hist_lookups++;
>  
>  	p = &hists->entries_in->rb_node;
>  
> @@ -526,7 +580,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
>  		 * function when searching an entry regardless which sort
>  		 * keys were used.
>  		 */
> -		cmp = hist_entry__cmp(he, entry);
> +		cmp = hist_entry__cmp(he, key);
>  
>  		if (!cmp) {
>  			if (sample_self) {
> @@ -540,7 +594,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
>  			 * This mem info was allocated from sample__resolve_mem
>  			 * and will not be used anymore.
>  			 */
> -			mem_info__zput(entry->mem_info);
> +			mem_info__zput(key->mem_info);
>  
>  			/* If the map of an existing hist_entry has
>  			 * become out-of-date due to an exec() or
> @@ -548,10 +602,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
>  			 * mis-adjust symbol addresses when computing
>  			 * the history counter to increment.
>  			 */
> -			if (he->ms.map != entry->ms.map) {
> +			if (he->ms.map != key->al->map) {
>  				map__put(he->ms.map);
> -				he->ms.map = map__get(entry->ms.map);
> +				he->ms.map = map__get(key->al->map);
>  			}
> +			hist_hits++;
>  			goto out;
>  		}
>  
> @@ -561,7 +616,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
>  			p = &(*p)->rb_right;
>  	}
>  
> -	he = hist_entry__new(entry, sample_self);
> +	hist_misses++;
> +	he = hist_entry__new(key, hists, ops, sample_self);
>  	if (!he)
>  		return NULL;
>  
> @@ -573,9 +629,9 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
>  	rb_insert_color(&he->rb_node_in, hists->entries_in);
>  out:
>  	if (sample_self)
> -		he_stat__add_cpumode_period(&he->stat, al->cpumode, period);
> +		he_stat__add_cpumode_period(&he->stat, key->al->cpumode, period);
>  	if (symbol_conf.cumulate_callchain)
> -		he_stat__add_cpumode_period(he->stat_acc, al->cpumode, period);
> +		he_stat__add_cpumode_period(he->stat_acc, key->al->cpumode, period);
>  	return he;
>  }
>  
> @@ -589,39 +645,19 @@ __hists__add_entry(struct hists *hists,
>  		   bool sample_self,
>  		   struct hist_entry_ops *ops)
>  {
> -	struct namespaces *ns = thread__namespaces(al->thread);
> -	struct hist_entry entry = {
> -		.thread	= al->thread,
> -		.comm = thread__comm(al->thread),
> -		.cgroup_id = {
> -			.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
> -			.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
> -		},
> -		.ms = {
> -			.map	= al->map,
> -			.sym	= al->sym,
> -		},
> -		.srcline = al->srcline ? strdup(al->srcline) : NULL,
> -		.socket	 = al->socket,
> -		.cpu	 = al->cpu,
> -		.cpumode = al->cpumode,
> -		.ip	 = al->addr,
> -		.level	 = al->level,
> -		.stat = {
> -			.nr_events = 1,
> -			.period	= sample->period,
> -			.weight = sample->weight,
> -		},
> -		.parent = sym_parent,
> -		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
> -		.hists	= hists,
> -		.branch_info = bi,
> -		.mem_info = mi,
> -		.transaction = sample->transaction,
> -		.raw_data = sample->raw_data,
> -		.raw_size = sample->raw_size,
> -		.ops = ops,
> -	}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
> +	struct hist_entry_cmp_key key;
> +	struct hist_entry *he;
> +
> +	key.al = al;
> +	key.comm = thread__comm(al->thread);
> +	key.bi = bi;
> +	key.sym_parent = sym_parent;
> +	key.sample = sample;
> +	key.mem_info = mi;
> +	key.srcfile = NULL;
> +	key.trace_output = NULL;
> +
> +	he = hists__findnew_entry(hists, &key, ops, sample_self);
>  
>  	if (!hists->has_callchains && he && he->callchain_size != 0)
>  		hists->has_callchains = true;
> @@ -947,7 +983,9 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
>  	struct perf_evsel *evsel = iter->evsel;
>  	struct perf_sample *sample = iter->sample;
>  	struct hist_entry **he_cache = iter->priv;
> +	struct hist_entry_cmp_key key;
>  	struct hist_entry *he;
> +#if 0
>  	struct hist_entry he_tmp = {
>  		.hists = evsel__hists(evsel),
>  		.cpu = al->cpu,
> @@ -963,6 +1001,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
>  		.raw_data = sample->raw_data,
>  		.raw_size = sample->raw_size,
>  	};
> +#endif
>  	int i;
>  	struct callchain_cursor cursor;
>  
> @@ -974,8 +1013,16 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
>  	 * Check if there's duplicate entries in the callchain.
>  	 * It's possible that it has cycles or recursive calls.
>  	 */
> +	key.al = al;
> +	key.comm = thread__comm(al->thread);
> +	key.bi = NULL;
> +	key.sym_parent = iter->parent;
> +	key.sample = sample;
> +	key.mem_info = NULL;
> +	key.srcfile = NULL;
> +	key.trace_output = NULL;
>  	for (i = 0; i < iter->curr; i++) {
> -		if (hist_entry__cmp(he_cache[i], &he_tmp) == 0) {
> +		if (hist_entry__cmp(he_cache[i], &key) == 0) {
>  			/* to avoid calling callback function */
>  			iter->he = NULL;
>  			return 0;
> @@ -1088,26 +1135,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
>  }
>  
>  int64_t
> -hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
> -{
> -	struct hists *hists = left->hists;
> -	struct perf_hpp_fmt *fmt;
> -	int64_t cmp = 0;
> -
> -	hists__for_each_sort_list(hists, fmt) {
> -		if (perf_hpp__is_dynamic_entry(fmt) &&
> -		    !perf_hpp__defined_dynamic_entry(fmt, hists))
> -			continue;
> -
> -		cmp = fmt->cmp(fmt, left, right);
> -		if (cmp)
> -			break;
> -	}
> -
> -	return cmp;
> -}
> -
> -int64_t
>  hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
>  {
>  	struct hists *hists = left->hists;
> @@ -1312,7 +1339,11 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
>  			p = &parent->rb_right;
>  	}
>  
> -	new = hist_entry__new(he, true);
> +#if 1
> +	new = NULL;
> +#else
> +	new = hist_entry__new(he, true); /* XXX fix XXX */
> +#endif
>  	if (new == NULL)
>  		return NULL;
>  
> @@ -2168,7 +2199,11 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
>  			p = &(*p)->rb_right;
>  	}
>  
> -	he = hist_entry__new(pair, true);
> +#if 1
> +	he = NULL;
> +#else
> +	he = hist_entry__new(pair, true); /* XXX fix XXX */
> +#endif
>  	if (he) {
>  		memset(&he->stat, 0, sizeof(he->stat));
>  		he->hists = hists;
> @@ -2213,7 +2248,11 @@ static struct hist_entry *add_dummy_hierarchy_entry(struct hists *hists,
>  			p = &parent->rb_right;
>  	}
>  
> -	he = hist_entry__new(pair, true);
> +#if 1
> +	he = NULL;
> +#else
> +	he = hist_entry__new(pair, true); /* XXX fix XXX */
> +#endif
>  	if (he) {
>  		rb_link_node(&he->rb_node_in, parent, p);
>  		rb_insert_color(&he->rb_node_in, root);
> diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
> index fe3dfaa..a3d66e1 100644
> --- a/tools/perf/ui/hist.c
> +++ b/tools/perf/ui/hist.c
> @@ -372,8 +372,15 @@ HPP_RAW_FNS(samples, nr_events)
>  HPP_RAW_FNS(period, period)
>  
>  static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
> -			    struct hist_entry *a __maybe_unused,
> -			    struct hist_entry *b __maybe_unused)
> +			    struct hist_entry *entry __maybe_unused,
> +			    struct hist_entry_cmp_key *key __maybe_unused)
> +{
> +	return 0;
> +}
> +
> +static int64_t hpp__nop_collapse(struct perf_hpp_fmt *fmt __maybe_unused,
> +				 struct hist_entry *a __maybe_unused,
> +				 struct hist_entry *b __maybe_unused)
>  {
>  	return 0;
>  }
> @@ -399,7 +406,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
>  		.color	= hpp__color_ ## _fn,		\
>  		.entry	= hpp__entry_ ## _fn,		\
>  		.cmp	= hpp__nop_cmp,			\
> -		.collapse = hpp__nop_cmp,		\
> +		.collapse = hpp__nop_collapse,		\
>  		.sort	= hpp__sort_ ## _fn,		\
>  		.idx	= PERF_HPP__ ## _idx,		\
>  		.equal	= hpp__equal,			\
> @@ -413,7 +420,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
>  		.color	= hpp__color_ ## _fn,		\
>  		.entry	= hpp__entry_ ## _fn,		\
>  		.cmp	= hpp__nop_cmp,			\
> -		.collapse = hpp__nop_cmp,		\
> +		.collapse = hpp__nop_collapse,		\
>  		.sort	= hpp__sort_ ## _fn,		\
>  		.idx	= PERF_HPP__ ## _idx,		\
>  		.equal	= hpp__equal,			\
> @@ -426,7 +433,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
>  		.width	= hpp__width_fn,		\
>  		.entry	= hpp__entry_ ## _fn,		\
>  		.cmp	= hpp__nop_cmp,			\
> -		.collapse = hpp__nop_cmp,		\
> +		.collapse = hpp__nop_collapse,		\
>  		.sort	= hpp__sort_ ## _fn,		\
>  		.idx	= PERF_HPP__ ## _idx,		\
>  		.equal	= hpp__equal,			\
> diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
> index f3aa9d0..190f5eb 100644
> --- a/tools/perf/builtin-c2c.c
> +++ b/tools/perf/builtin-c2c.c
> @@ -1717,12 +1717,13 @@ static int c2c_se_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
>  }
>  
>  static int64_t c2c_se_cmp(struct perf_hpp_fmt *fmt,
> -			  struct hist_entry *a, struct hist_entry *b)
> +			  struct hist_entry *entry,
> +			  struct hist_entry_cmp_key *key)
>  {
>  	struct c2c_fmt *c2c_fmt = container_of(fmt, struct c2c_fmt, fmt);
>  	struct c2c_dimension *dim = c2c_fmt->dim;
>  
> -	return dim->se->se_cmp(a, b);
> +	return dim->se->se_cmp_key(entry, key);
>  }
>  
>  static int64_t c2c_se_collapse(struct perf_hpp_fmt *fmt,
> @@ -1755,8 +1756,13 @@ static struct c2c_fmt *get_format(const char *name)
>  	INIT_LIST_HEAD(&fmt->list);
>  	INIT_LIST_HEAD(&fmt->sort_list);
>  
> +#if 1
> +	fmt->cmp	= c2c_se_cmp;
> +	fmt->sort	= dim->cmp;
> +#else
>  	fmt->cmp	= dim->se ? c2c_se_cmp   : dim->cmp;
>  	fmt->sort	= dim->se ? c2c_se_cmp   : dim->cmp;
> +#endif
>  	fmt->color	= dim->se ? NULL	 : dim->color;
>  	fmt->entry	= dim->se ? c2c_se_entry : dim->entry;
>  	fmt->header	= c2c_header;
> diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
> index 39db2ee..2684efa 100644
> --- a/tools/perf/builtin-diff.c
> +++ b/tools/perf/builtin-diff.c
> @@ -604,8 +604,16 @@ hist_entry__cmp_compute_idx(struct hist_entry *left, struct hist_entry *right,
>  
>  static int64_t
>  hist_entry__cmp_nop(struct perf_hpp_fmt *fmt __maybe_unused,
> -		    struct hist_entry *left __maybe_unused,
> -		    struct hist_entry *right __maybe_unused)
> +		    struct hist_entry *entry __maybe_unused,
> +		    struct hist_entry_cmp_key *key __maybe_unused)
> +{
> +	return 0;
> +}
> +
> +static int64_t
> +hist_entry__collapse_nop(struct perf_hpp_fmt *fmt __maybe_unused,
> +			 struct hist_entry *a __maybe_unused,
> +			 struct hist_entry *b __maybe_unused)
>  {
>  	return 0;
>  }
> @@ -1141,7 +1149,7 @@ static void data__hpp_register(struct data__file *d, int idx)
>  	fmt->width  = hpp__width;
>  	fmt->entry  = hpp__entry_global;
>  	fmt->cmp    = hist_entry__cmp_nop;
> -	fmt->collapse = hist_entry__cmp_nop;
> +	fmt->collapse = hist_entry__collapse_nop;
>  
>  	/* TODO more colors */
>  	switch (idx) {
> @@ -1166,7 +1174,7 @@ static void data__hpp_register(struct data__file *d, int idx)
>  		fmt->sort  = hist_entry__cmp_delta_abs;
>  		break;
>  	default:
> -		fmt->sort  = hist_entry__cmp_nop;
> +		fmt->sort  = hist_entry__collapse_nop;
>  		break;
>  	}
>  
> @@ -1230,7 +1238,7 @@ static int ui_init(void)
>  	}
>  
>  	fmt->cmp      = hist_entry__cmp_nop;
> -	fmt->collapse = hist_entry__cmp_nop;
> +	fmt->collapse = hist_entry__collapse_nop;
>  
>  	switch (compute) {
>  	case COMPUTE_DELTA:

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-10-31 12:43 ` Arnaldo Carvalho de Melo
@ 2018-10-31 15:39   ` Jiri Olsa
  2018-10-31 16:08     ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-10-31 15:39 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, David Miller
  Cc: linux-kernel, Namhyung Kim, Jiri Olsa

On Wed, Oct 31, 2018 at 09:43:06AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Oct 30, 2018 at 10:03:28PM -0700, David Miller escreveu:
> > 
> > So when a cpu is overpowered processing samples, most of the time is
> > spent in the histogram code.

it'd be great to make hist processing faster, but is your main target here
to get the load out of the reader thread, so we don't lose events during the
hist processing?

we could queue events directly from reader thread into another thread and
keep it (the reader thread) free of processing, focusing only on event
reading/passing 

jirka

> > 
> > It seems we initialize a ~262 byte structure on the stack to do every
> > histogram entry lookup.
> > 
> > This is a side effect of how the sorting code is shared with the code
> > that does lookups and insertions into the histogram tree(s).
> > 
> > I tried to change this so that lookups use a smaller key, but it gets
> > ugly real fast.
> > 
> > I don't know when I'd be able to work more on this so I'm posting this
> > hoping maybe someone else can move it forward, or maybe even find a
> > better way to do this.
> 
> Added Namhyung to the CC list, that is in vacations right now but said
> he would look into the issues recently raised when he gets back.
> 
> Thanks a lot for all the work you did in this sprint, really
> appreciated, I'm processing the fixes for the fallback to kallsyms and
> the other patch you submitted, will do tests and push to Ingo and
> revisit this after I get ready for Vancouver ;-)
> 
> - Arnaldo
>  
> > The histogram code is really the limiting factor in how well perf can
> > handle high sample rates.
> > 
> > diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> > index f96c005..f0265e4 100644
> > --- a/tools/perf/util/sort.c
> > +++ b/tools/perf/util/sort.c
> > @@ -81,6 +81,12 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return right->thread->tid - left->thread->tid;
> >  }
> >  
> > +static int64_t
> > +sort__thread_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return key->al->thread->tid - entry->thread->tid;
> > +}
> > +
> >  static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
> >  				       size_t size, unsigned int width)
> >  {
> > @@ -104,6 +110,7 @@ static int hist_entry__thread_filter(struct hist_entry *he, int type, const void
> >  struct sort_entry sort_thread = {
> >  	.se_header	= "    Pid:Command",
> >  	.se_cmp		= sort__thread_cmp,
> > +	.se_cmp_key	= sort__thread_cmp_key,
> >  	.se_snprintf	= hist_entry__thread_snprintf,
> >  	.se_filter	= hist_entry__thread_filter,
> >  	.se_width_idx	= HISTC_THREAD,
> > @@ -123,6 +130,13 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
> >  }
> >  
> >  static int64_t
> > +sort__comm_cmp_key(struct hist_entry *entry,
> > +	       struct hist_entry_cmp_key *key)
> > +{
> > +	return strcmp(comm__str(key->comm), comm__str(entry->comm));
> > +}
> > +
> > +static int64_t
> >  sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
> >  {
> >  	return strcmp(comm__str(right->comm), comm__str(left->comm));
> > @@ -143,6 +157,7 @@ static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_comm = {
> >  	.se_header	= "Command",
> >  	.se_cmp		= sort__comm_cmp,
> > +	.se_cmp_key	= sort__comm_cmp_key,
> >  	.se_collapse	= sort__comm_collapse,
> >  	.se_sort	= sort__comm_sort,
> >  	.se_snprintf	= hist_entry__comm_snprintf,
> > @@ -178,6 +193,12 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return _sort__dso_cmp(right->ms.map, left->ms.map);
> >  }
> >  
> > +static int64_t
> > +sort__dso_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return _sort__dso_cmp(key->al->map, entry->ms.map);
> > +}
> > +
> >  static int _hist_entry__dso_snprintf(struct map *map, char *bf,
> >  				     size_t size, unsigned int width)
> >  {
> > @@ -209,6 +230,7 @@ static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *a
> >  struct sort_entry sort_dso = {
> >  	.se_header	= "Shared Object",
> >  	.se_cmp		= sort__dso_cmp,
> > +	.se_cmp_key	= sort__dso_cmp_key,
> >  	.se_snprintf	= hist_entry__dso_snprintf,
> >  	.se_filter	= hist_entry__dso_filter,
> >  	.se_width_idx	= HISTC_DSO,
> > @@ -260,6 +282,25 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
> >  }
> >  
> >  static int64_t
> > +sort__sym_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	int64_t ret;
> > +
> > +	if (!entry->ms.sym && !key->al->sym)
> > +		return _sort__addr_cmp(entry->ip, key->al->addr);
> > +
> > +	/*
> > +	 * comparing symbol address alone is not enough since it's a
> > +	 * relative address within a dso.
> > +	 */
> > +	ret = sort__dso_cmp_key(entry, key);
> > +	if (ret != 0)
> > +		return ret;
> > +
> > +	return _sort__sym_cmp(entry->ms.sym, key->al->sym);
> > +}
> > +
> > +static int64_t
> >  sort__sym_sort(struct hist_entry *left, struct hist_entry *right)
> >  {
> >  	if (!left->ms.sym || !right->ms.sym)
> > @@ -323,6 +364,7 @@ static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *a
> >  struct sort_entry sort_sym = {
> >  	.se_header	= "Symbol",
> >  	.se_cmp		= sort__sym_cmp,
> > +	.se_cmp_key	= sort__sym_cmp_key,
> >  	.se_sort	= sort__sym_sort,
> >  	.se_snprintf	= hist_entry__sym_snprintf,
> >  	.se_filter	= hist_entry__sym_filter,
> > @@ -347,6 +389,18 @@ sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(right->srcline, left->srcline);
> >  }
> >  
> > +static int64_t
> > +sort__srcline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->srcline)
> > +		entry->srcline = hist_entry__srcline(entry);
> > +	if (!key->al->srcline)
> > +		key->al->srcline =
> > +			map__srcline(key->al->map, key->al->addr, key->al->sym);
> > +
> > +	return strcmp(key->al->srcline, entry->srcline);
> > +}
> > +
> >  static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
> >  					size_t size, unsigned int width)
> >  {
> > @@ -359,6 +413,7 @@ static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_srcline = {
> >  	.se_header	= "Source:Line",
> >  	.se_cmp		= sort__srcline_cmp,
> > +	.se_cmp_key	= sort__srcline_cmp_key,
> >  	.se_snprintf	= hist_entry__srcline_snprintf,
> >  	.se_width_idx	= HISTC_SRCLINE,
> >  };
> > @@ -382,6 +437,18 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
> >  }
> >  
> > +static int64_t
> > +sort__srcline_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info->srcline_from)
> > +		entry->branch_info->srcline_from = addr_map_symbol__srcline(&entry->branch_info->from);
> > +
> > +	if (!key->bi->srcline_from)
> > +		key->bi->srcline_from = addr_map_symbol__srcline(&key->bi->from);
> > +
> > +	return strcmp(key->bi->srcline_from, entry->branch_info->srcline_from);
> > +}
> > +
> >  static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
> >  					size_t size, unsigned int width)
> >  {
> > @@ -391,6 +458,7 @@ static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_srcline_from = {
> >  	.se_header	= "From Source:Line",
> >  	.se_cmp		= sort__srcline_from_cmp,
> > +	.se_cmp_key	= sort__srcline_from_cmp_key,
> >  	.se_snprintf	= hist_entry__srcline_from_snprintf,
> >  	.se_width_idx	= HISTC_SRCLINE_FROM,
> >  };
> > @@ -409,6 +477,18 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
> >  }
> >  
> > +static int64_t
> > +sort__srcline_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info->srcline_to)
> > +		entry->branch_info->srcline_to = addr_map_symbol__srcline(&entry->branch_info->to);
> > +
> > +	if (!key->bi->srcline_to)
> > +		key->bi->srcline_to = addr_map_symbol__srcline(&key->bi->to);
> > +
> > +	return strcmp(key->bi->srcline_to, entry->branch_info->srcline_to);
> > +}
> > +
> >  static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
> >  					size_t size, unsigned int width)
> >  {
> > @@ -418,6 +498,7 @@ static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_srcline_to = {
> >  	.se_header	= "To Source:Line",
> >  	.se_cmp		= sort__srcline_to_cmp,
> > +	.se_cmp_key	= sort__srcline_to_cmp_key,
> >  	.se_snprintf	= hist_entry__srcline_to_snprintf,
> >  	.se_width_idx	= HISTC_SRCLINE_TO,
> >  };
> > @@ -426,16 +507,16 @@ struct sort_entry sort_srcline_to = {
> >  
> >  static char no_srcfile[1];
> >  
> > -static char *hist_entry__get_srcfile(struct hist_entry *e)
> > +static char *__hist_entry__get_srcfile(struct map *map, struct symbol *sym,
> > +				       u64 ip)
> >  {
> >  	char *sf, *p;
> > -	struct map *map = e->ms.map;
> >  
> >  	if (!map)
> >  		return no_srcfile;
> >  
> > -	sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
> > -			 e->ms.sym, false, true, true, e->ip);
> > +	sf = __get_srcline(map->dso, map__rip_2objdump(map, ip),
> > +			 sym, false, true, true, ip);
> >  	if (!strcmp(sf, SRCLINE_UNKNOWN))
> >  		return no_srcfile;
> >  	p = strchr(sf, ':');
> > @@ -447,6 +528,15 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
> >  	return no_srcfile;
> >  }
> >  
> > +static char *hist_entry__get_srcfile(struct hist_entry *e)
> > +{
> > +	return __hist_entry__get_srcfile(e->ms.map, e->ms.sym, e->ip);
> > +}
> > +
> > +static char *hist_entry_key__get_srcfile(struct hist_entry_cmp_key *key)
> > +{
> > +	return __hist_entry__get_srcfile(key->al->map, key->al->sym, key->al->addr);
> > +}
> >  static int64_t
> >  sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
> >  {
> > @@ -458,6 +548,17 @@ sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(right->srcfile, left->srcfile);
> >  }
> >  
> > +static int64_t
> > +sort__srcfile_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->srcfile)
> > +		entry->srcfile = hist_entry__get_srcfile(entry);
> > +	if (!key->srcfile)
> > +		key->srcfile = hist_entry_key__get_srcfile(key);
> > +
> > +	return strcmp(key->srcfile, entry->srcfile);
> > +}
> > +
> >  static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
> >  					size_t size, unsigned int width)
> >  {
> > @@ -470,6 +571,7 @@ static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_srcfile = {
> >  	.se_header	= "Source File",
> >  	.se_cmp		= sort__srcfile_cmp,
> > +	.se_cmp_key	= sort__srcfile_cmp_key,
> >  	.se_snprintf	= hist_entry__srcfile_snprintf,
> >  	.se_width_idx	= HISTC_SRCFILE,
> >  };
> > @@ -488,6 +590,18 @@ sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(sym_r->name, sym_l->name);
> >  }
> >  
> > +static int64_t
> > +sort__parent_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct symbol *sym_l = entry->parent;
> > +	struct symbol *sym_r = key->sym_parent;
> > +
> > +	if (!sym_l || !sym_r)
> > +		return cmp_null(sym_l, sym_r);
> > +
> > +	return strcmp(sym_r->name, sym_l->name);
> > +}
> > +
> >  static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
> >  				       size_t size, unsigned int width)
> >  {
> > @@ -498,6 +612,7 @@ static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_parent = {
> >  	.se_header	= "Parent symbol",
> >  	.se_cmp		= sort__parent_cmp,
> > +	.se_cmp_key	= sort__parent_cmp_key,
> >  	.se_snprintf	= hist_entry__parent_snprintf,
> >  	.se_width_idx	= HISTC_PARENT,
> >  };
> > @@ -510,6 +625,12 @@ sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return right->cpu - left->cpu;
> >  }
> >  
> > +static int64_t
> > +sort__cpu_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return key->al->cpu - entry->cpu;
> > +}
> > +
> >  static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -519,6 +640,7 @@ static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_cpu = {
> >  	.se_header      = "CPU",
> >  	.se_cmp	        = sort__cpu_cmp,
> > +	.se_cmp_key     = sort__cpu_cmp_key,
> >  	.se_snprintf    = hist_entry__cpu_snprintf,
> >  	.se_width_idx	= HISTC_CPU,
> >  };
> > @@ -548,6 +670,22 @@ sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
> >  				       left->cgroup_id.ino);
> >  }
> >  
> > +static int64_t
> > +sort__cgroup_id_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct namespaces *ns = thread__namespaces(key->al->thread);
> > +	int64_t ret;
> > +	u64 val;
> > +
> > +	val = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
> > +	ret = _sort__cgroup_dev_cmp(val, entry->cgroup_id.dev);
> > +	if (ret != 0)
> > +		return ret;
> > +
> > +	val = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
> > +	return _sort__cgroup_inode_cmp(val, entry->cgroup_id.ino);
> > +}
> > +
> >  static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
> >  					  char *bf, size_t size,
> >  					  unsigned int width __maybe_unused)
> > @@ -559,6 +697,7 @@ static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
> >  struct sort_entry sort_cgroup_id = {
> >  	.se_header      = "cgroup id (dev/inode)",
> >  	.se_cmp	        = sort__cgroup_id_cmp,
> > +	.se_cmp_key     = sort__cgroup_id_cmp_key,
> >  	.se_snprintf    = hist_entry__cgroup_id_snprintf,
> >  	.se_width_idx	= HISTC_CGROUP_ID,
> >  };
> > @@ -571,6 +710,12 @@ sort__socket_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return right->socket - left->socket;
> >  }
> >  
> > +static int64_t
> > +sort__socket_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return key->al->socket - entry->socket;
> > +}
> > +
> >  static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -590,6 +735,7 @@ static int hist_entry__socket_filter(struct hist_entry *he, int type, const void
> >  struct sort_entry sort_socket = {
> >  	.se_header      = "Socket",
> >  	.se_cmp	        = sort__socket_cmp,
> > +	.se_cmp_key     = sort__socket_cmp_key,
> >  	.se_snprintf    = hist_entry__socket_snprintf,
> >  	.se_filter      = hist_entry__socket_filter,
> >  	.se_width_idx	= HISTC_SOCKET,
> > @@ -597,20 +743,21 @@ struct sort_entry sort_socket = {
> >  
> >  /* --sort trace */
> >  
> > -static char *get_trace_output(struct hist_entry *he)
> > +static char *__get_trace_output(struct hists *hists, void *raw_data,
> > +				u32 raw_size)
> >  {
> >  	struct trace_seq seq;
> >  	struct perf_evsel *evsel;
> >  	struct tep_record rec = {
> > -		.data = he->raw_data,
> > -		.size = he->raw_size,
> > +		.data = raw_data,
> > +		.size = raw_size,
> >  	};
> >  
> > -	evsel = hists_to_evsel(he->hists);
> > +	evsel = hists_to_evsel(hists);
> >  
> >  	trace_seq_init(&seq);
> >  	if (symbol_conf.raw_trace) {
> > -		tep_print_fields(&seq, he->raw_data, he->raw_size,
> > +		tep_print_fields(&seq, raw_data, raw_size,
> >  				 evsel->tp_format);
> >  	} else {
> >  		tep_event_info(&seq, evsel->tp_format, &rec);
> > @@ -622,6 +769,16 @@ static char *get_trace_output(struct hist_entry *he)
> >  	return realloc(seq.buffer, seq.len + 1);
> >  }
> >  
> > +static char *get_trace_output(struct hist_entry *he)
> > +{
> > +	return __get_trace_output(he->hists, he->raw_data, he->raw_size);
> > +}
> > +
> > +static char *get_trace_output_key(struct hists *hists, struct hist_entry_cmp_key *key)
> > +{
> > +	return __get_trace_output(hists, key->sample->raw_data, key->sample->raw_size);
> > +}
> > +
> >  static int64_t
> >  sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
> >  {
> > @@ -639,6 +796,23 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return strcmp(right->trace_output, left->trace_output);
> >  }
> >  
> > +static int64_t
> > +sort__trace_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct perf_evsel *evsel;
> > +
> > +	evsel = hists_to_evsel(entry->hists);
> > +	if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> > +		return 0;
> > +
> > +	if (entry->trace_output == NULL)
> > +		entry->trace_output = get_trace_output(entry);
> > +	if (key->trace_output == NULL)
> > +		key->trace_output = get_trace_output_key(entry->hists, key);
> > +
> > +	return strcmp(key->trace_output, entry->trace_output);
> > +}
> > +
> >  static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -656,6 +830,7 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_trace = {
> >  	.se_header      = "Trace output",
> >  	.se_cmp	        = sort__trace_cmp,
> > +	.se_cmp_key     = sort__trace_cmp_key,
> >  	.se_snprintf    = hist_entry__trace_snprintf,
> >  	.se_width_idx	= HISTC_TRACE,
> >  };
> > @@ -672,6 +847,16 @@ sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
> >  			      right->branch_info->from.map);
> >  }
> >  
> > +static int64_t
> > +sort__dso_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	return _sort__dso_cmp(entry->branch_info->from.map,
> > +			      key->bi->from.map);
> > +}
> > +
> >  static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -704,6 +889,16 @@ sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
> >  			      right->branch_info->to.map);
> >  }
> >  
> > +static int64_t
> > +sort__dso_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	return _sort__dso_cmp(entry->branch_info->to.map,
> > +			      key->bi->to.map);
> > +}
> > +
> >  static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
> >  				       size_t size, unsigned int width)
> >  {
> > @@ -745,6 +940,24 @@ sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
> >  }
> >  
> >  static int64_t
> > +sort__sym_from_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct addr_map_symbol *from_l, *from_r;
> > +
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	from_l = &entry->branch_info->from;
> > +	from_r = &key->bi->from;
> > +
> > +	if (!from_l->sym && !from_r->sym)
> > +		return _sort__addr_cmp(from_l->addr, from_r->addr);
> > +
> > +	return _sort__sym_cmp(from_l->sym, from_r->sym);
> > +}
> > +
> > +static int64_t
> >  sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
> >  {
> >  	struct addr_map_symbol *to_l, *to_r;
> > @@ -761,6 +974,23 @@ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return _sort__sym_cmp(to_l->sym, to_r->sym);
> >  }
> >  
> > +static int64_t
> > +sort__sym_to_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct addr_map_symbol *to_l, *to_r;
> > +
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	to_l = &entry->branch_info->to;
> > +	to_r = &key->bi->to;
> > +
> > +	if (!to_l->sym && !to_r->sym)
> > +		return _sort__addr_cmp(to_l->addr, to_r->addr);
> > +
> > +	return _sort__sym_cmp(to_l->sym, to_r->sym);
> > +}
> > +
> >  static int hist_entry__sym_from_snprintf(struct hist_entry *he, char *bf,
> >  					 size_t size, unsigned int width)
> >  {
> > @@ -814,6 +1044,7 @@ static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
> >  struct sort_entry sort_dso_from = {
> >  	.se_header	= "Source Shared Object",
> >  	.se_cmp		= sort__dso_from_cmp,
> > +	.se_cmp_key	= sort__dso_from_cmp_key,
> >  	.se_snprintf	= hist_entry__dso_from_snprintf,
> >  	.se_filter	= hist_entry__dso_from_filter,
> >  	.se_width_idx	= HISTC_DSO_FROM,
> > @@ -822,6 +1053,7 @@ struct sort_entry sort_dso_from = {
> >  struct sort_entry sort_dso_to = {
> >  	.se_header	= "Target Shared Object",
> >  	.se_cmp		= sort__dso_to_cmp,
> > +	.se_cmp_key	= sort__dso_to_cmp_key,
> >  	.se_snprintf	= hist_entry__dso_to_snprintf,
> >  	.se_filter	= hist_entry__dso_to_filter,
> >  	.se_width_idx	= HISTC_DSO_TO,
> > @@ -830,6 +1062,7 @@ struct sort_entry sort_dso_to = {
> >  struct sort_entry sort_sym_from = {
> >  	.se_header	= "Source Symbol",
> >  	.se_cmp		= sort__sym_from_cmp,
> > +	.se_cmp_key	= sort__sym_from_cmp_key,
> >  	.se_snprintf	= hist_entry__sym_from_snprintf,
> >  	.se_filter	= hist_entry__sym_from_filter,
> >  	.se_width_idx	= HISTC_SYMBOL_FROM,
> > @@ -838,6 +1071,7 @@ struct sort_entry sort_sym_from = {
> >  struct sort_entry sort_sym_to = {
> >  	.se_header	= "Target Symbol",
> >  	.se_cmp		= sort__sym_to_cmp,
> > +	.se_cmp_key	= sort__sym_to_cmp_key,
> >  	.se_snprintf	= hist_entry__sym_to_snprintf,
> >  	.se_filter	= hist_entry__sym_to_filter,
> >  	.se_width_idx	= HISTC_SYMBOL_TO,
> > @@ -856,6 +1090,19 @@ sort__mispredict_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return mp || p;
> >  }
> >  
> > +static int64_t
> > +sort__mispredict_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	unsigned char mp, p;
> > +
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	mp = entry->branch_info->flags.mispred != key->bi->flags.mispred;
> > +	p  = entry->branch_info->flags.predicted != key->bi->flags.predicted;
> > +	return mp || p;
> > +}
> > +
> >  static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width){
> >  	static const char *out = "N/A";
> > @@ -880,6 +1127,16 @@ sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
> >  		right->branch_info->flags.cycles;
> >  }
> >  
> > +static int64_t
> > +sort__cycles_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	return entry->branch_info->flags.cycles -
> > +		key->bi->flags.cycles;
> > +}
> > +
> >  static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -894,6 +1151,7 @@ static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_cycles = {
> >  	.se_header	= "Basic Block Cycles",
> >  	.se_cmp		= sort__cycles_cmp,
> > +	.se_cmp_key	= sort__cycles_cmp_key,
> >  	.se_snprintf	= hist_entry__cycles_snprintf,
> >  	.se_width_idx	= HISTC_CYCLES,
> >  };
> > @@ -912,6 +1170,19 @@ sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(r - l);
> >  }
> >  
> > +static int64_t
> > +sort__daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	uint64_t l = 0, r = 0;
> > +
> > +	if (entry->mem_info)
> > +		l = entry->mem_info->daddr.addr;
> > +	if (key->mem_info)
> > +		r = key->mem_info->daddr.addr;
> > +
> > +	return (int64_t)(r - l);
> > +}
> > +
> >  static int hist_entry__daddr_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -941,6 +1212,19 @@ sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(r - l);
> >  }
> >  
> > +static int64_t
> > +sort__iaddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	uint64_t l = 0, r = 0;
> > +
> > +	if (entry->mem_info)
> > +		l = entry->mem_info->iaddr.addr;
> > +	if (key->mem_info)
> > +		r = key->mem_info->iaddr.addr;
> > +
> > +	return (int64_t)(r - l);
> > +}
> > +
> >  static int hist_entry__iaddr_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -971,6 +1255,20 @@ sort__dso_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return _sort__dso_cmp(map_l, map_r);
> >  }
> >  
> > +static int64_t
> > +sort__dso_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct map *map_l = NULL;
> > +	struct map *map_r = NULL;
> > +
> > +	if (entry->mem_info)
> > +		map_l = entry->mem_info->daddr.map;
> > +	if (key->mem_info)
> > +		map_r = key->mem_info->daddr.map;
> > +
> > +	return _sort__dso_cmp(map_l, map_r);
> > +}
> > +
> >  static int hist_entry__dso_daddr_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1001,6 +1299,25 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
> >  }
> >  
> > +static int64_t
> > +sort__locked_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	union perf_mem_data_src data_src_l;
> > +	union perf_mem_data_src data_src_r;
> > +
> > +	if (entry->mem_info)
> > +		data_src_l = entry->mem_info->data_src;
> > +	else
> > +		data_src_l.mem_lock = PERF_MEM_LOCK_NA;
> > +
> > +	if (key->mem_info)
> > +		data_src_r = key->mem_info->data_src;
> > +	else
> > +		data_src_r.mem_lock = PERF_MEM_LOCK_NA;
> > +
> > +	return (int64_t)(data_src_r.mem_lock - data_src_l.mem_lock);
> > +}
> > +
> >  static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1029,6 +1346,25 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
> >  }
> >  
> > +static int64_t
> > +sort__tlb_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	union perf_mem_data_src data_src_l;
> > +	union perf_mem_data_src data_src_r;
> > +
> > +	if (entry->mem_info)
> > +		data_src_l = entry->mem_info->data_src;
> > +	else
> > +		data_src_l.mem_dtlb = PERF_MEM_TLB_NA;
> > +
> > +	if (key->mem_info)
> > +		data_src_r = key->mem_info->data_src;
> > +	else
> > +		data_src_r.mem_dtlb = PERF_MEM_TLB_NA;
> > +
> > +	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
> > +}
> > +
> >  static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1057,6 +1393,25 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
> >  }
> >  
> > +static int64_t
> > +sort__lvl_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	union perf_mem_data_src data_src_l;
> > +	union perf_mem_data_src data_src_r;
> > +
> > +	if (entry->mem_info)
> > +		data_src_l = entry->mem_info->data_src;
> > +	else
> > +		data_src_l.mem_lvl = PERF_MEM_LVL_NA;
> > +
> > +	if (key->mem_info)
> > +		data_src_r = key->mem_info->data_src;
> > +	else
> > +		data_src_r.mem_lvl = PERF_MEM_LVL_NA;
> > +
> > +	return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
> > +}
> > +
> >  static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1085,6 +1440,25 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
> >  }
> >  
> > +static int64_t
> > +sort__snoop_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	union perf_mem_data_src data_src_l;
> > +	union perf_mem_data_src data_src_r;
> > +
> > +	if (entry->mem_info)
> > +		data_src_l = entry->mem_info->data_src;
> > +	else
> > +		data_src_l.mem_snoop = PERF_MEM_SNOOP_NA;
> > +
> > +	if (key->mem_info)
> > +		data_src_r = key->mem_info->data_src;
> > +	else
> > +		data_src_r.mem_snoop = PERF_MEM_SNOOP_NA;
> > +
> > +	return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
> > +}
> > +
> >  static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1158,6 +1532,70 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return 0;
> >  }
> >  
> > +static int64_t
> > +sort__dcacheline_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	u64 l, r;
> > +	struct map *l_map, *r_map;
> > +
> > +	if (!entry->mem_info)  return -1;
> > +	if (!key->mem_info) return 1;
> > +
> > +	/* group event types together */
> > +	if (entry->cpumode > key->al->cpumode) return -1;
> > +	if (entry->cpumode < key->al->cpumode) return 1;
> > +
> > +	l_map = entry->mem_info->daddr.map;
> > +	r_map = key->mem_info->daddr.map;
> > +
> > +	/* if both are NULL, jump to sort on al_addr instead */
> > +	if (!l_map && !r_map)
> > +		goto addr;
> > +
> > +	if (!l_map) return -1;
> > +	if (!r_map) return 1;
> > +
> > +	if (l_map->maj > r_map->maj) return -1;
> > +	if (l_map->maj < r_map->maj) return 1;
> > +
> > +	if (l_map->min > r_map->min) return -1;
> > +	if (l_map->min < r_map->min) return 1;
> > +
> > +	if (l_map->ino > r_map->ino) return -1;
> > +	if (l_map->ino < r_map->ino) return 1;
> > +
> > +	if (l_map->ino_generation > r_map->ino_generation) return -1;
> > +	if (l_map->ino_generation < r_map->ino_generation) return 1;
> > +
> > +	/*
> > +	 * Addresses with no major/minor numbers are assumed to be
> > +	 * anonymous in userspace.  Sort those on pid then address.
> > +	 *
> > +	 * The kernel and non-zero major/minor mapped areas are
> > +	 * assumed to be unity mapped.  Sort those on address.
> > +	 */
> > +
> > +	if ((entry->cpumode != PERF_RECORD_MISC_KERNEL) &&
> > +	    (!(l_map->flags & MAP_SHARED)) &&
> > +	    !l_map->maj && !l_map->min && !l_map->ino &&
> > +	    !l_map->ino_generation) {
> > +		/* userspace anonymous */
> > +
> > +		if (entry->thread->pid_ > key->al->thread->pid_) return -1;
> > +		if (entry->thread->pid_ < key->al->thread->pid_) return 1;
> > +	}
> > +
> > +addr:
> > +	/* al_addr does all the right addr - start + offset calculations */
> > +	l = cl_address(entry->mem_info->daddr.al_addr);
> > +	r = cl_address(key->mem_info->daddr.al_addr);
> > +
> > +	if (l > r) return -1;
> > +	if (l < r) return 1;
> > +
> > +	return 0;
> > +}
> > +
> >  static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
> >  					  size_t size, unsigned int width)
> >  {
> > @@ -1189,6 +1627,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_mispredict = {
> >  	.se_header	= "Branch Mispredicted",
> >  	.se_cmp		= sort__mispredict_cmp,
> > +	.se_cmp_key	= sort__mispredict_cmp_key,
> >  	.se_snprintf	= hist_entry__mispredict_snprintf,
> >  	.se_width_idx	= HISTC_MISPREDICT,
> >  };
> > @@ -1198,12 +1637,24 @@ static u64 he_weight(struct hist_entry *he)
> >  	return he->stat.nr_events ? he->stat.weight / he->stat.nr_events : 0;
> >  }
> >  
> > +static u64 key_weight(struct hist_entry_cmp_key *key)
> > +{
> > +	return key->sample->weight;
> > +}
> > +
> >  static int64_t
> >  sort__local_weight_cmp(struct hist_entry *left, struct hist_entry *right)
> >  {
> >  	return he_weight(left) - he_weight(right);
> >  }
> >  
> > +static int64_t
> > +sort__local_weight_cmp_key(struct hist_entry *entry,
> > +			   struct hist_entry_cmp_key *key)
> > +{
> > +	return he_weight(entry) - key_weight(key);
> > +}
> > +
> >  static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1213,6 +1664,7 @@ static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_local_weight = {
> >  	.se_header	= "Local Weight",
> >  	.se_cmp		= sort__local_weight_cmp,
> > +	.se_cmp_key	= sort__local_weight_cmp_key,
> >  	.se_snprintf	= hist_entry__local_weight_snprintf,
> >  	.se_width_idx	= HISTC_LOCAL_WEIGHT,
> >  };
> > @@ -1223,6 +1675,13 @@ sort__global_weight_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return left->stat.weight - right->stat.weight;
> >  }
> >  
> > +static int64_t
> > +sort__global_weight_cmp_key(struct hist_entry *entry,
> > +			    struct hist_entry_cmp_key *key)
> > +{
> > +	return entry->stat.weight - key->sample->weight;
> > +}
> > +
> >  static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
> >  					      size_t size, unsigned int width)
> >  {
> > @@ -1232,6 +1691,7 @@ static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_global_weight = {
> >  	.se_header	= "Weight",
> >  	.se_cmp		= sort__global_weight_cmp,
> > +	.se_cmp_key	= sort__global_weight_cmp_key,
> >  	.se_snprintf	= hist_entry__global_weight_snprintf,
> >  	.se_width_idx	= HISTC_GLOBAL_WEIGHT,
> >  };
> > @@ -1239,6 +1699,7 @@ struct sort_entry sort_global_weight = {
> >  struct sort_entry sort_mem_daddr_sym = {
> >  	.se_header	= "Data Symbol",
> >  	.se_cmp		= sort__daddr_cmp,
> > +	.se_cmp_key	= sort__daddr_cmp_key,
> >  	.se_snprintf	= hist_entry__daddr_snprintf,
> >  	.se_width_idx	= HISTC_MEM_DADDR_SYMBOL,
> >  };
> > @@ -1246,6 +1707,7 @@ struct sort_entry sort_mem_daddr_sym = {
> >  struct sort_entry sort_mem_iaddr_sym = {
> >  	.se_header	= "Code Symbol",
> >  	.se_cmp		= sort__iaddr_cmp,
> > +	.se_cmp_key	= sort__iaddr_cmp_key,
> >  	.se_snprintf	= hist_entry__iaddr_snprintf,
> >  	.se_width_idx	= HISTC_MEM_IADDR_SYMBOL,
> >  };
> > @@ -1253,6 +1715,7 @@ struct sort_entry sort_mem_iaddr_sym = {
> >  struct sort_entry sort_mem_daddr_dso = {
> >  	.se_header	= "Data Object",
> >  	.se_cmp		= sort__dso_daddr_cmp,
> > +	.se_cmp_key	= sort__dso_daddr_cmp_key,
> >  	.se_snprintf	= hist_entry__dso_daddr_snprintf,
> >  	.se_width_idx	= HISTC_MEM_DADDR_DSO,
> >  };
> > @@ -1260,6 +1723,7 @@ struct sort_entry sort_mem_daddr_dso = {
> >  struct sort_entry sort_mem_locked = {
> >  	.se_header	= "Locked",
> >  	.se_cmp		= sort__locked_cmp,
> > +	.se_cmp_key	= sort__locked_cmp_key,
> >  	.se_snprintf	= hist_entry__locked_snprintf,
> >  	.se_width_idx	= HISTC_MEM_LOCKED,
> >  };
> > @@ -1267,6 +1731,7 @@ struct sort_entry sort_mem_locked = {
> >  struct sort_entry sort_mem_tlb = {
> >  	.se_header	= "TLB access",
> >  	.se_cmp		= sort__tlb_cmp,
> > +	.se_cmp_key	= sort__tlb_cmp_key,
> >  	.se_snprintf	= hist_entry__tlb_snprintf,
> >  	.se_width_idx	= HISTC_MEM_TLB,
> >  };
> > @@ -1274,6 +1739,7 @@ struct sort_entry sort_mem_tlb = {
> >  struct sort_entry sort_mem_lvl = {
> >  	.se_header	= "Memory access",
> >  	.se_cmp		= sort__lvl_cmp,
> > +	.se_cmp_key	= sort__lvl_cmp_key,
> >  	.se_snprintf	= hist_entry__lvl_snprintf,
> >  	.se_width_idx	= HISTC_MEM_LVL,
> >  };
> > @@ -1281,6 +1747,7 @@ struct sort_entry sort_mem_lvl = {
> >  struct sort_entry sort_mem_snoop = {
> >  	.se_header	= "Snoop",
> >  	.se_cmp		= sort__snoop_cmp,
> > +	.se_cmp_key	= sort__snoop_cmp_key,
> >  	.se_snprintf	= hist_entry__snoop_snprintf,
> >  	.se_width_idx	= HISTC_MEM_SNOOP,
> >  };
> > @@ -1288,6 +1755,7 @@ struct sort_entry sort_mem_snoop = {
> >  struct sort_entry sort_mem_dcacheline = {
> >  	.se_header	= "Data Cacheline",
> >  	.se_cmp		= sort__dcacheline_cmp,
> > +	.se_cmp_key	= sort__dcacheline_cmp_key,
> >  	.se_snprintf	= hist_entry__dcacheline_snprintf,
> >  	.se_width_idx	= HISTC_MEM_DCACHELINE,
> >  };
> > @@ -1305,6 +1773,19 @@ sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return (int64_t)(r - l);
> >  }
> >  
> > +static int64_t
> > +sort__phys_daddr_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	uint64_t l = 0, r = 0;
> > +
> > +	if (entry->mem_info)
> > +		l = entry->mem_info->daddr.phys_addr;
> > +	if (key->mem_info)
> > +		r = key->mem_info->daddr.phys_addr;
> > +
> > +	return (int64_t)(r - l);
> > +}
> > +
> >  static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
> >  					   size_t size, unsigned int width)
> >  {
> > @@ -1329,6 +1810,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_mem_phys_daddr = {
> >  	.se_header	= "Data Physical Address",
> >  	.se_cmp		= sort__phys_daddr_cmp,
> > +	.se_cmp_key	= sort__phys_daddr_cmp_key,
> >  	.se_snprintf	= hist_entry__phys_daddr_snprintf,
> >  	.se_width_idx	= HISTC_MEM_PHYS_DADDR,
> >  };
> > @@ -1343,6 +1825,16 @@ sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
> >  		right->branch_info->flags.abort;
> >  }
> >  
> > +static int64_t
> > +sort__abort_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	return entry->branch_info->flags.abort !=
> > +		key->bi->flags.abort;
> > +}
> > +
> >  static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1361,6 +1853,7 @@ static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_abort = {
> >  	.se_header	= "Transaction abort",
> >  	.se_cmp		= sort__abort_cmp,
> > +	.se_cmp_key	= sort__abort_cmp_key,
> >  	.se_snprintf	= hist_entry__abort_snprintf,
> >  	.se_width_idx	= HISTC_ABORT,
> >  };
> > @@ -1375,6 +1868,16 @@ sort__in_tx_cmp(struct hist_entry *left, struct hist_entry *right)
> >  		right->branch_info->flags.in_tx;
> >  }
> >  
> > +static int64_t
> > +sort__in_tx_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	if (!entry->branch_info || !key->bi)
> > +		return cmp_null(entry->branch_info, key->bi);
> > +
> > +	return entry->branch_info->flags.in_tx !=
> > +		key->bi->flags.in_tx;
> > +}
> > +
> >  static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
> >  				    size_t size, unsigned int width)
> >  {
> > @@ -1393,6 +1896,7 @@ static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_in_tx = {
> >  	.se_header	= "Branch in transaction",
> >  	.se_cmp		= sort__in_tx_cmp,
> > +	.se_cmp_key	= sort__in_tx_cmp_key,
> >  	.se_snprintf	= hist_entry__in_tx_snprintf,
> >  	.se_width_idx	= HISTC_IN_TX,
> >  };
> > @@ -1403,6 +1907,12 @@ sort__transaction_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return left->transaction - right->transaction;
> >  }
> >  
> > +static int64_t
> > +sort__transaction_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return entry->transaction - key->sample->transaction;
> > +}
> > +
> >  static inline char *add_str(char *p, const char *str)
> >  {
> >  	strcpy(p, str);
> > @@ -1465,6 +1975,7 @@ static int hist_entry__transaction_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_transaction = {
> >  	.se_header	= "Transaction                ",
> >  	.se_cmp		= sort__transaction_cmp,
> > +	.se_cmp_key	= sort__transaction_cmp_key,
> >  	.se_snprintf	= hist_entry__transaction_snprintf,
> >  	.se_width_idx	= HISTC_TRANSACTION,
> >  };
> > @@ -1486,6 +1997,12 @@ sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return _sort__sym_size_cmp(right->ms.sym, left->ms.sym);
> >  }
> >  
> > +static int64_t
> > +sort__sym_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return _sort__sym_size_cmp(key->al->sym, entry->ms.sym);
> > +}
> > +
> >  static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf,
> >  					  size_t bf_size, unsigned int width)
> >  {
> > @@ -1504,6 +2021,7 @@ static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_sym_size = {
> >  	.se_header	= "Symbol size",
> >  	.se_cmp		= sort__sym_size_cmp,
> > +	.se_cmp_key	= sort__sym_size_cmp_key,
> >  	.se_snprintf	= hist_entry__sym_size_snprintf,
> >  	.se_width_idx	= HISTC_SYM_SIZE,
> >  };
> > @@ -1525,6 +2043,12 @@ sort__dso_size_cmp(struct hist_entry *left, struct hist_entry *right)
> >  	return _sort__dso_size_cmp(right->ms.map, left->ms.map);
> >  }
> >  
> > +static int64_t
> > +sort__dso_size_cmp_key(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	return _sort__dso_size_cmp(key->al->map, entry->ms.map);
> > +}
> > +
> >  static int _hist_entry__dso_size_snprintf(struct map *map, char *bf,
> >  					  size_t bf_size, unsigned int width)
> >  {
> > @@ -1544,6 +2068,7 @@ static int hist_entry__dso_size_snprintf(struct hist_entry *he, char *bf,
> >  struct sort_entry sort_dso_size = {
> >  	.se_header	= "DSO size",
> >  	.se_cmp		= sort__dso_size_cmp,
> > +	.se_cmp_key	= sort__dso_size_cmp_key,
> >  	.se_snprintf	= hist_entry__dso_size_snprintf,
> >  	.se_width_idx	= HISTC_DSO_SIZE,
> >  };
> > @@ -1693,12 +2218,13 @@ static int __sort__hpp_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
> >  }
> >  
> >  static int64_t __sort__hpp_cmp(struct perf_hpp_fmt *fmt,
> > -			       struct hist_entry *a, struct hist_entry *b)
> > +			       struct hist_entry *entry,
> > +			       struct hist_entry_cmp_key *key)
> >  {
> >  	struct hpp_sort_entry *hse;
> >  
> >  	hse = container_of(fmt, struct hpp_sort_entry, hpp);
> > -	return hse->se->se_cmp(a, b);
> > +	return hse->se->se_cmp_key(entry, key);
> >  }
> >  
> >  static int64_t __sort__hpp_collapse(struct perf_hpp_fmt *fmt,
> > @@ -2089,9 +2615,37 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
> >  	return memcmp(a->raw_data + offset, b->raw_data + offset, size);
> >  }
> >  
> > +static int64_t __sort__hde_cmp_key(struct perf_hpp_fmt *fmt,
> > +				   struct hist_entry *a,
> > +				   struct hist_entry_cmp_key *key)
> > +{
> > +	struct hpp_dynamic_entry *hde;
> > +	struct tep_format_field *field;
> > +	unsigned offset, size;
> > +
> > +	hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
> > +	field = hde->field;
> > +	if (field->flags & TEP_FIELD_IS_DYNAMIC) {
> > +		unsigned long long dyn;
> > +
> > +		tep_read_number_field(field, a->raw_data, &dyn);
> > +		offset = dyn & 0xffff;
> > +		size = (dyn >> 16) & 0xffff;
> > +
> > +		/* record max width for output */
> > +		if (size > hde->dynamic_len)
> > +			hde->dynamic_len = size;
> > +	} else {
> > +		offset = field->offset;
> > +		size = field->size;
> > +	}
> > +
> > +	return memcmp(a->raw_data + offset, key->sample->raw_data + offset, size);
> > +}
> > +
> >  bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
> >  {
> > -	return fmt->cmp == __sort__hde_cmp;
> > +	return fmt->cmp == __sort__hde_cmp_key;
> >  }
> >  
> >  static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
> > @@ -2138,7 +2692,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct tep_format_field *field,
> >  	hde->hpp.entry  = __sort__hde_entry;
> >  	hde->hpp.color  = NULL;
> >  
> > -	hde->hpp.cmp = __sort__hde_cmp;
> > +	hde->hpp.cmp = __sort__hde_cmp_key;
> >  	hde->hpp.collapse = __sort__hde_cmp;
> >  	hde->hpp.sort = __sort__hde_cmp;
> >  	hde->hpp.equal = __sort__hde_equal;
> > diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> > index a97cf8e..da85224 100644
> > --- a/tools/perf/util/sort.h
> > +++ b/tools/perf/util/sort.h
> > @@ -264,6 +264,7 @@ struct sort_entry {
> >  	const char *se_header;
> >  
> >  	int64_t (*se_cmp)(struct hist_entry *, struct hist_entry *);
> > +	int64_t (*se_cmp_key)(struct hist_entry *, struct hist_entry_cmp_key *);
> >  	int64_t (*se_collapse)(struct hist_entry *, struct hist_entry *);
> >  	int64_t	(*se_sort)(struct hist_entry *, struct hist_entry *);
> >  	int	(*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
> > diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> > index 3badd7f..78df16b 100644
> > --- a/tools/perf/util/hist.h
> > +++ b/tools/perf/util/hist.h
> > @@ -150,7 +150,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
> >  struct perf_hpp;
> >  struct perf_hpp_fmt;
> >  
> > -int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
> >  int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
> >  int hist_entry__transaction_len(void);
> >  int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
> > @@ -238,6 +237,18 @@ struct perf_hpp {
> >  	void *ptr;
> >  };
> >  
> > +struct hist_entry_cmp_key {
> > +	struct addr_location *al;
> > +	struct comm *comm;
> > +	struct branch_info *bi;
> > +	struct symbol *sym_parent;
> > +	struct perf_sample *sample;
> > +	struct mem_info *mem_info;
> > +	char *srcfile;
> > +	char *trace_output;
> > +};
> > +
> > +struct comm;
> >  struct perf_hpp_fmt {
> >  	const char *name;
> >  	int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
> > @@ -249,7 +260,8 @@ struct perf_hpp_fmt {
> >  	int (*entry)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
> >  		     struct hist_entry *he);
> >  	int64_t (*cmp)(struct perf_hpp_fmt *fmt,
> > -		       struct hist_entry *a, struct hist_entry *b);
> > +		       struct hist_entry *entry,
> > +		       struct hist_entry_cmp_key *key);
> >  	int64_t (*collapse)(struct perf_hpp_fmt *fmt,
> >  			    struct hist_entry *a, struct hist_entry *b);
> >  	int64_t (*sort)(struct perf_hpp_fmt *fmt,
> > @@ -525,4 +537,8 @@ static inline int hists__scnprintf_title(struct hists *hists, char *bf, size_t s
> >  	return __hists__scnprintf_title(hists, bf, size, true);
> >  }
> >  
> > +extern unsigned long hist_lookups;
> > +extern unsigned long hist_hits;
> > +extern unsigned long hist_misses;
> > +
> >  #endif	/* __PERF_HIST_H */
> > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > index 828cb97..a4deb5d 100644
> > --- a/tools/perf/util/hist.c
> > +++ b/tools/perf/util/hist.c
> > @@ -364,16 +364,49 @@ void hists__delete_entries(struct hists *hists)
> >  	}
> >  }
> >  
> > +static u8 symbol__parent_filter(const struct symbol *parent)
> > +{
> > +	if (symbol_conf.exclude_other && parent == NULL)
> > +		return 1 << HIST_FILTER__PARENT;
> > +	return 0;
> > +}
> > +
> >  /*
> >   * histogram, sorted on item, collects periods
> >   */
> >  
> >  static int hist_entry__init(struct hist_entry *he,
> > -			    struct hist_entry *template,
> > +			    struct hist_entry_cmp_key *key,
> > +			    struct hists *hists,
> >  			    bool sample_self,
> >  			    size_t callchain_size)
> >  {
> > -	*he = *template;
> > +	struct namespaces *ns = thread__namespaces(key->al->thread);
> > +
> > +	he->thread = key->al->thread;
> > +	he->comm = thread__comm(he->thread);
> > +	he->cgroup_id.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0;
> > +	he->cgroup_id.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0;
> > +	he->ms.map = key->al->map;
> > +	he->ms.sym = key->al->sym;
> > +	he->srcline = key->al->srcline ? strdup(key->al->srcline) : NULL;
> > +	he->socket	 = key->al->socket;
> > +	he->cpu	 = key->al->cpu;
> > +	he->cpumode = key->al->cpumode;
> > +	he->ip	 = key->al->addr;
> > +	he->level	 = key->al->level;
> > +	he->stat.nr_events = 1;
> > +	he->stat.period = key->sample->period;
> > +	he->stat.weight = key->sample->weight;
> > +	he->parent = key->sym_parent;
> > +	he->filtered = symbol__parent_filter(key->sym_parent) | key->al->filtered;
> > +	he->hists = hists;
> > +	he->branch_info = key->bi;
> > +	he->mem_info = key->mem_info;
> > +	he->transaction = key->sample->transaction;
> > +	he->raw_data = key->sample->raw_data;
> > +	he->raw_size = key->sample->raw_size;
> > +
> >  	he->callchain_size = callchain_size;
> >  
> >  	if (symbol_conf.cumulate_callchain) {
> > @@ -400,7 +433,7 @@ static int hist_entry__init(struct hist_entry *he,
> >  			return -ENOMEM;
> >  		}
> >  
> > -		memcpy(he->branch_info, template->branch_info,
> > +		memcpy(he->branch_info, key->bi,
> >  		       sizeof(*he->branch_info));
> >  
> >  		map__get(he->branch_info->from.map);
> > @@ -459,23 +492,25 @@ static struct hist_entry_ops default_ops = {
> >  	.free	= hist_entry__free,
> >  };
> >  
> > -static struct hist_entry *hist_entry__new(struct hist_entry *template,
> > +static struct hist_entry *hist_entry__new(struct hist_entry_cmp_key *key,
> > +					  struct hists *hists,
> > +					  struct hist_entry_ops *ops,
> >  					  bool sample_self)
> >  {
> > -	struct hist_entry_ops *ops = template->ops;
> >  	size_t callchain_size = 0;
> >  	struct hist_entry *he;
> >  	int err = 0;
> >  
> >  	if (!ops)
> > -		ops = template->ops = &default_ops;
> > +		ops = &default_ops;
> >  
> >  	if (symbol_conf.use_callchain)
> >  		callchain_size = sizeof(struct callchain_root);
> >  
> >  	he = ops->new(callchain_size);
> >  	if (he) {
> > -		err = hist_entry__init(he, template, sample_self, callchain_size);
> > +		he->ops = ops;
> > +		err = hist_entry__init(he, key, hists, sample_self, callchain_size);
> >  		if (err) {
> >  			ops->free(he);
> >  			he = NULL;
> > @@ -485,13 +520,6 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
> >  	return he;
> >  }
> >  
> > -static u8 symbol__parent_filter(const struct symbol *parent)
> > -{
> > -	if (symbol_conf.exclude_other && parent == NULL)
> > -		return 1 << HIST_FILTER__PARENT;
> > -	return 0;
> > -}
> > -
> >  static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
> >  {
> >  	if (!hist_entry__has_callchains(he) || !symbol_conf.use_callchain)
> > @@ -502,17 +530,43 @@ static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
> >  		he->hists->callchain_non_filtered_period += period;
> >  }
> >  
> > +static int64_t
> > +hist_entry__cmp(struct hist_entry *entry, struct hist_entry_cmp_key *key)
> > +{
> > +	struct hists *hists = entry->hists;
> > +	struct perf_hpp_fmt *fmt;
> > +	int64_t cmp = 0;
> > +
> > +	hists__for_each_sort_list(hists, fmt) {
> > +		if (perf_hpp__is_dynamic_entry(fmt) &&
> > +		    !perf_hpp__defined_dynamic_entry(fmt, hists))
> > +			continue;
> > +
> > +		cmp = fmt->cmp(fmt, entry, key);
> > +		if (cmp)
> > +			break;
> > +	}
> > +
> > +	return cmp;
> > +}
> > +
> > +unsigned long hist_lookups;
> > +unsigned long hist_hits;
> > +unsigned long hist_misses;
> > +
> >  static struct hist_entry *hists__findnew_entry(struct hists *hists,
> > -					       struct hist_entry *entry,
> > -					       struct addr_location *al,
> > +					       struct hist_entry_cmp_key *key,
> > +					       struct hist_entry_ops *ops,
> >  					       bool sample_self)
> >  {
> >  	struct rb_node **p;
> >  	struct rb_node *parent = NULL;
> >  	struct hist_entry *he;
> >  	int64_t cmp;
> > -	u64 period = entry->stat.period;
> > -	u64 weight = entry->stat.weight;
> > +	u64 period = key->sample->period;
> > +	u64 weight = key->sample->weight;
> > +
> > +	hist_lookups++;
> >  
> >  	p = &hists->entries_in->rb_node;
> >  
> > @@ -526,7 +580,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
> >  		 * function when searching an entry regardless which sort
> >  		 * keys were used.
> >  		 */
> > -		cmp = hist_entry__cmp(he, entry);
> > +		cmp = hist_entry__cmp(he, key);
> >  
> >  		if (!cmp) {
> >  			if (sample_self) {
> > @@ -540,7 +594,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
> >  			 * This mem info was allocated from sample__resolve_mem
> >  			 * and will not be used anymore.
> >  			 */
> > -			mem_info__zput(entry->mem_info);
> > +			mem_info__zput(key->mem_info);
> >  
> >  			/* If the map of an existing hist_entry has
> >  			 * become out-of-date due to an exec() or
> > @@ -548,10 +602,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
> >  			 * mis-adjust symbol addresses when computing
> >  			 * the history counter to increment.
> >  			 */
> > -			if (he->ms.map != entry->ms.map) {
> > +			if (he->ms.map != key->al->map) {
> >  				map__put(he->ms.map);
> > -				he->ms.map = map__get(entry->ms.map);
> > +				he->ms.map = map__get(key->al->map);
> >  			}
> > +			hist_hits++;
> >  			goto out;
> >  		}
> >  
> > @@ -561,7 +616,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
> >  			p = &(*p)->rb_right;
> >  	}
> >  
> > -	he = hist_entry__new(entry, sample_self);
> > +	hist_misses++;
> > +	he = hist_entry__new(key, hists, ops, sample_self);
> >  	if (!he)
> >  		return NULL;
> >  
> > @@ -573,9 +629,9 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
> >  	rb_insert_color(&he->rb_node_in, hists->entries_in);
> >  out:
> >  	if (sample_self)
> > -		he_stat__add_cpumode_period(&he->stat, al->cpumode, period);
> > +		he_stat__add_cpumode_period(&he->stat, key->al->cpumode, period);
> >  	if (symbol_conf.cumulate_callchain)
> > -		he_stat__add_cpumode_period(he->stat_acc, al->cpumode, period);
> > +		he_stat__add_cpumode_period(he->stat_acc, key->al->cpumode, period);
> >  	return he;
> >  }
> >  
> > @@ -589,39 +645,19 @@ __hists__add_entry(struct hists *hists,
> >  		   bool sample_self,
> >  		   struct hist_entry_ops *ops)
> >  {
> > -	struct namespaces *ns = thread__namespaces(al->thread);
> > -	struct hist_entry entry = {
> > -		.thread	= al->thread,
> > -		.comm = thread__comm(al->thread),
> > -		.cgroup_id = {
> > -			.dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
> > -			.ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
> > -		},
> > -		.ms = {
> > -			.map	= al->map,
> > -			.sym	= al->sym,
> > -		},
> > -		.srcline = al->srcline ? strdup(al->srcline) : NULL,
> > -		.socket	 = al->socket,
> > -		.cpu	 = al->cpu,
> > -		.cpumode = al->cpumode,
> > -		.ip	 = al->addr,
> > -		.level	 = al->level,
> > -		.stat = {
> > -			.nr_events = 1,
> > -			.period	= sample->period,
> > -			.weight = sample->weight,
> > -		},
> > -		.parent = sym_parent,
> > -		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
> > -		.hists	= hists,
> > -		.branch_info = bi,
> > -		.mem_info = mi,
> > -		.transaction = sample->transaction,
> > -		.raw_data = sample->raw_data,
> > -		.raw_size = sample->raw_size,
> > -		.ops = ops,
> > -	}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
> > +	struct hist_entry_cmp_key key;
> > +	struct hist_entry *he;
> > +
> > +	key.al = al;
> > +	key.comm = thread__comm(al->thread);
> > +	key.bi = bi;
> > +	key.sym_parent = sym_parent;
> > +	key.sample = sample;
> > +	key.mem_info = mi;
> > +	key.srcfile = NULL;
> > +	key.trace_output = NULL;
> > +
> > +	he = hists__findnew_entry(hists, &key, ops, sample_self);
> >  
> >  	if (!hists->has_callchains && he && he->callchain_size != 0)
> >  		hists->has_callchains = true;
> > @@ -947,7 +983,9 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
> >  	struct perf_evsel *evsel = iter->evsel;
> >  	struct perf_sample *sample = iter->sample;
> >  	struct hist_entry **he_cache = iter->priv;
> > +	struct hist_entry_cmp_key key;
> >  	struct hist_entry *he;
> > +#if 0
> >  	struct hist_entry he_tmp = {
> >  		.hists = evsel__hists(evsel),
> >  		.cpu = al->cpu,
> > @@ -963,6 +1001,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
> >  		.raw_data = sample->raw_data,
> >  		.raw_size = sample->raw_size,
> >  	};
> > +#endif
> >  	int i;
> >  	struct callchain_cursor cursor;
> >  
> > @@ -974,8 +1013,16 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
> >  	 * Check if there's duplicate entries in the callchain.
> >  	 * It's possible that it has cycles or recursive calls.
> >  	 */
> > +	key.al = al;
> > +	key.comm = thread__comm(al->thread);
> > +	key.bi = NULL;
> > +	key.sym_parent = iter->parent;
> > +	key.sample = sample;
> > +	key.mem_info = NULL;
> > +	key.srcfile = NULL;
> > +	key.trace_output = NULL;
> >  	for (i = 0; i < iter->curr; i++) {
> > -		if (hist_entry__cmp(he_cache[i], &he_tmp) == 0) {
> > +		if (hist_entry__cmp(he_cache[i], &key) == 0) {
> >  			/* to avoid calling callback function */
> >  			iter->he = NULL;
> >  			return 0;
> > @@ -1088,26 +1135,6 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
> >  }
> >  
> >  int64_t
> > -hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
> > -{
> > -	struct hists *hists = left->hists;
> > -	struct perf_hpp_fmt *fmt;
> > -	int64_t cmp = 0;
> > -
> > -	hists__for_each_sort_list(hists, fmt) {
> > -		if (perf_hpp__is_dynamic_entry(fmt) &&
> > -		    !perf_hpp__defined_dynamic_entry(fmt, hists))
> > -			continue;
> > -
> > -		cmp = fmt->cmp(fmt, left, right);
> > -		if (cmp)
> > -			break;
> > -	}
> > -
> > -	return cmp;
> > -}
> > -
> > -int64_t
> >  hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
> >  {
> >  	struct hists *hists = left->hists;
> > @@ -1312,7 +1339,11 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
> >  			p = &parent->rb_right;
> >  	}
> >  
> > -	new = hist_entry__new(he, true);
> > +#if 1
> > +	new = NULL;
> > +#else
> > +	new = hist_entry__new(he, true); /* XXX fix XXX */
> > +#endif
> >  	if (new == NULL)
> >  		return NULL;
> >  
> > @@ -2168,7 +2199,11 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
> >  			p = &(*p)->rb_right;
> >  	}
> >  
> > -	he = hist_entry__new(pair, true);
> > +#if 1
> > +	he = NULL;
> > +#else
> > +	he = hist_entry__new(pair, true); /* XXX fix XXX */
> > +#endif
> >  	if (he) {
> >  		memset(&he->stat, 0, sizeof(he->stat));
> >  		he->hists = hists;
> > @@ -2213,7 +2248,11 @@ static struct hist_entry *add_dummy_hierarchy_entry(struct hists *hists,
> >  			p = &parent->rb_right;
> >  	}
> >  
> > -	he = hist_entry__new(pair, true);
> > +#if 1
> > +	he = NULL;
> > +#else
> > +	he = hist_entry__new(pair, true); /* XXX fix XXX */
> > +#endif
> >  	if (he) {
> >  		rb_link_node(&he->rb_node_in, parent, p);
> >  		rb_insert_color(&he->rb_node_in, root);
> > diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
> > index fe3dfaa..a3d66e1 100644
> > --- a/tools/perf/ui/hist.c
> > +++ b/tools/perf/ui/hist.c
> > @@ -372,8 +372,15 @@ HPP_RAW_FNS(samples, nr_events)
> >  HPP_RAW_FNS(period, period)
> >  
> >  static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
> > -			    struct hist_entry *a __maybe_unused,
> > -			    struct hist_entry *b __maybe_unused)
> > +			    struct hist_entry *entry __maybe_unused,
> > +			    struct hist_entry_cmp_key *key __maybe_unused)
> > +{
> > +	return 0;
> > +}
> > +
> > +static int64_t hpp__nop_collapse(struct perf_hpp_fmt *fmt __maybe_unused,
> > +				 struct hist_entry *a __maybe_unused,
> > +				 struct hist_entry *b __maybe_unused)
> >  {
> >  	return 0;
> >  }
> > @@ -399,7 +406,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
> >  		.color	= hpp__color_ ## _fn,		\
> >  		.entry	= hpp__entry_ ## _fn,		\
> >  		.cmp	= hpp__nop_cmp,			\
> > -		.collapse = hpp__nop_cmp,		\
> > +		.collapse = hpp__nop_collapse,		\
> >  		.sort	= hpp__sort_ ## _fn,		\
> >  		.idx	= PERF_HPP__ ## _idx,		\
> >  		.equal	= hpp__equal,			\
> > @@ -413,7 +420,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
> >  		.color	= hpp__color_ ## _fn,		\
> >  		.entry	= hpp__entry_ ## _fn,		\
> >  		.cmp	= hpp__nop_cmp,			\
> > -		.collapse = hpp__nop_cmp,		\
> > +		.collapse = hpp__nop_collapse,		\
> >  		.sort	= hpp__sort_ ## _fn,		\
> >  		.idx	= PERF_HPP__ ## _idx,		\
> >  		.equal	= hpp__equal,			\
> > @@ -426,7 +433,7 @@ static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
> >  		.width	= hpp__width_fn,		\
> >  		.entry	= hpp__entry_ ## _fn,		\
> >  		.cmp	= hpp__nop_cmp,			\
> > -		.collapse = hpp__nop_cmp,		\
> > +		.collapse = hpp__nop_collapse,		\
> >  		.sort	= hpp__sort_ ## _fn,		\
> >  		.idx	= PERF_HPP__ ## _idx,		\
> >  		.equal	= hpp__equal,			\
> > diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
> > index f3aa9d0..190f5eb 100644
> > --- a/tools/perf/builtin-c2c.c
> > +++ b/tools/perf/builtin-c2c.c
> > @@ -1717,12 +1717,13 @@ static int c2c_se_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
> >  }
> >  
> >  static int64_t c2c_se_cmp(struct perf_hpp_fmt *fmt,
> > -			  struct hist_entry *a, struct hist_entry *b)
> > +			  struct hist_entry *entry,
> > +			  struct hist_entry_cmp_key *key)
> >  {
> >  	struct c2c_fmt *c2c_fmt = container_of(fmt, struct c2c_fmt, fmt);
> >  	struct c2c_dimension *dim = c2c_fmt->dim;
> >  
> > -	return dim->se->se_cmp(a, b);
> > +	return dim->se->se_cmp_key(entry, key);
> >  }
> >  
> >  static int64_t c2c_se_collapse(struct perf_hpp_fmt *fmt,
> > @@ -1755,8 +1756,13 @@ static struct c2c_fmt *get_format(const char *name)
> >  	INIT_LIST_HEAD(&fmt->list);
> >  	INIT_LIST_HEAD(&fmt->sort_list);
> >  
> > +#if 1
> > +	fmt->cmp	= c2c_se_cmp;
> > +	fmt->sort	= dim->cmp;
> > +#else
> >  	fmt->cmp	= dim->se ? c2c_se_cmp   : dim->cmp;
> >  	fmt->sort	= dim->se ? c2c_se_cmp   : dim->cmp;
> > +#endif
> >  	fmt->color	= dim->se ? NULL	 : dim->color;
> >  	fmt->entry	= dim->se ? c2c_se_entry : dim->entry;
> >  	fmt->header	= c2c_header;
> > diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
> > index 39db2ee..2684efa 100644
> > --- a/tools/perf/builtin-diff.c
> > +++ b/tools/perf/builtin-diff.c
> > @@ -604,8 +604,16 @@ hist_entry__cmp_compute_idx(struct hist_entry *left, struct hist_entry *right,
> >  
> >  static int64_t
> >  hist_entry__cmp_nop(struct perf_hpp_fmt *fmt __maybe_unused,
> > -		    struct hist_entry *left __maybe_unused,
> > -		    struct hist_entry *right __maybe_unused)
> > +		    struct hist_entry *entry __maybe_unused,
> > +		    struct hist_entry_cmp_key *key __maybe_unused)
> > +{
> > +	return 0;
> > +}
> > +
> > +static int64_t
> > +hist_entry__collapse_nop(struct perf_hpp_fmt *fmt __maybe_unused,
> > +			 struct hist_entry *a __maybe_unused,
> > +			 struct hist_entry *b __maybe_unused)
> >  {
> >  	return 0;
> >  }
> > @@ -1141,7 +1149,7 @@ static void data__hpp_register(struct data__file *d, int idx)
> >  	fmt->width  = hpp__width;
> >  	fmt->entry  = hpp__entry_global;
> >  	fmt->cmp    = hist_entry__cmp_nop;
> > -	fmt->collapse = hist_entry__cmp_nop;
> > +	fmt->collapse = hist_entry__collapse_nop;
> >  
> >  	/* TODO more colors */
> >  	switch (idx) {
> > @@ -1166,7 +1174,7 @@ static void data__hpp_register(struct data__file *d, int idx)
> >  		fmt->sort  = hist_entry__cmp_delta_abs;
> >  		break;
> >  	default:
> > -		fmt->sort  = hist_entry__cmp_nop;
> > +		fmt->sort  = hist_entry__collapse_nop;
> >  		break;
> >  	}
> >  
> > @@ -1230,7 +1238,7 @@ static int ui_init(void)
> >  	}
> >  
> >  	fmt->cmp      = hist_entry__cmp_nop;
> > -	fmt->collapse = hist_entry__cmp_nop;
> > +	fmt->collapse = hist_entry__collapse_nop;
> >  
> >  	switch (compute) {
> >  	case COMPUTE_DELTA:

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-10-31 15:39   ` Jiri Olsa
@ 2018-10-31 16:08     ` David Miller
  2018-11-03  6:30       ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-10-31 16:08 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 31 Oct 2018 16:39:07 +0100

> it'd be great to make hist processing faster, but is your main target here
> to get the load out of the reader thread, so we dont lose events during the
> hist processing?
> 
> we could queue events directly from reader thread into another thread and
> keep it (the reader thread) free of processing, focusing only on event
> reading/passing 

Indeed, we could create threads that take samples from the thread processing
the ring buffers, and insert them into the histogram.

In fact, since there is pthread locking already around the histogram
data structures we could parallelize that as much as we want.

If beneficial we could also parallelize the ring buffer processing
into a small number of threads too.

My understanding is that in its default mode perf gets one event ring
buffer per cpu being analyzed.  So we could divide that number of
rings by some factor, like 16 or something, and thus divide the rings
into groups of 16 with one thread assigned to each group.

There is one major concern about this though.  Creating threads makes
perf a bit more "invasive" to the workload it is observing.  And that
is something we've always worked to minimize.

I think your idea to add threads for the histogram work is great.

But I still think that the histogram code is really bloated, and doing
a full 262 byte memset on every histogram lookup is unnecessary
overhead.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-10-31 16:08     ` David Miller
@ 2018-11-03  6:30       ` David Miller
  2018-11-04 20:18         ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-03  6:30 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: David Miller <davem@davemloft.net>
Date: Wed, 31 Oct 2018 09:08:16 -0700 (PDT)

> From: Jiri Olsa <jolsa@redhat.com>
> Date: Wed, 31 Oct 2018 16:39:07 +0100
> 
>> it'd be great to make hist processing faster, but is your main target here
>> to get the load out of the reader thread, so we dont lose events during the
>> hist processing?
>> 
>> we could queue events directly from reader thread into another thread and
>> keep it (the reader thread) free of processing, focusing only on event
>> reading/passing 
> 
> Indeed, we could create threads that take samples from the thread processing
> the ring buffers, and insert them into the histogram.

So I played around with some ideas like this and ran into some dead ends.

I ran each mmap ring's processing in a separate thread.

This doesn't help at all, the problem is that all the threads serialize
at the pthread lock for the histogram part of the work.

And the histogram part dominates the cost of processing each sample.

Nevertheless I started work on formally threading all of the code that
the mmap threads operate on, such as symbol processing etc. and while
doing so I came to the conclusion that pushing the histogram processing
only to a separate thread poses it's own set of big challenges.

To make this work we would have to make a piece of transient on-stack
state (the processed event) into allocated persistent state.

These persistent event structures get queued up to the histogram
thread(s).

Therefore, if the histogram thread(s) can't keep up (and as per my
experiment above, it is easy to enter this state because the histogram
> code itself is going to run linearly with the histogram lock held),
this persistent event memory will just get larger and larger.

> We would have to find some way to parallelize the histogram code to
make any kind of threading worthwhile.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-03  6:30       ` David Miller
@ 2018-11-04 20:18         ` Jiri Olsa
  2018-11-05  0:50           ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-04 20:18 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Fri, Nov 02, 2018 at 11:30:03PM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Wed, 31 Oct 2018 09:08:16 -0700 (PDT)
> 
> > From: Jiri Olsa <jolsa@redhat.com>
> > Date: Wed, 31 Oct 2018 16:39:07 +0100
> > 
> >> it'd be great to make hist processing faster, but is your main target here
> >> to get the load out of the reader thread, so we dont lose events during the
> >> hist processing?
> >> 
> >> we could queue events directly from reader thread into another thread and
> >> keep it (the reader thread) free of processing, focusing only on event
> >> reading/passing 
> > 
> > Indeed, we could create threads that take samples from the thread processing
> > the ring buffers, and insert them into the histogram.
> 
> So I played around with some ideas like this and ran into some dead ends.
> 
> I ran each mmap ring's processing in a separate thread.
> 
> This doesn't help at all, the problem is that all the threads serialize
> at the pthread lock for the histogram part of the work.
> 
> And the histogram part dominates the cost of processing each sample.

yep, it suck.. I was thinking of keeping separate hist objects for
each thread and merge them at the end

> 
> Nevertheless I started work on formally threading all of the code that
> the mmap threads operate on, such as symbol processing etc. and while
> doing so I came to the conclusion that pushing the histogram processing
> only to a separate thread poses it's own set of big challenges.
> 
> To make this work we would have to make a piece of transient on-stack
> state (the processed event) into allocated persistent state.
> 
> These persistent event structures get queued up to the histogram
> thread(s).
> 
> Therefore, if the histogram thread(s) can't keep up (and as per my
> experiment above, it is easy to enter this state because the histogram
> code itself is going to run linearly with the histgram lock held),
> this persistent event memory will just get larger and larger.
> 
> We would have to find some way to parallelize the histgram code to
> make any kind of threading worthwhile.

do you have some code I could check on?

I'm going to make that separate thread to get the processing out
of the reading thread.. I think we need that in any case, so the
ring buffer is kept free as fast as possible

thanks,
jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-04 20:18         ` Jiri Olsa
@ 2018-11-05  0:50           ` David Miller
  2018-11-05 20:34             ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-05  0:50 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Sun, 4 Nov 2018 21:18:21 +0100

> do you have some code I could check on?

All I have is this patch which parallelizes the mmap readers in perf
top.

It's not complete and you need to add proper locking, particularly around
the machine__resolve() call.

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d21d875..e214225 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -725,7 +725,9 @@ static void perf_event__process_sample(struct perf_tool *tool,
 	if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
 		top->exact_samples++;
 
-	if (machine__resolve(machine, &al, sample) < 0)
+	err = machine__resolve(machine, &al, sample);
+
+	if (err < 0)
 		return;
 
 	if (!machine->kptr_restrict_warned &&
@@ -877,6 +879,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
 	perf_mmap__read_done(md);
 }
 
+#if 0
 static void perf_top__mmap_read(struct perf_top *top)
 {
 	bool overwrite = top->record_opts.overwrite;
@@ -903,6 +906,7 @@ static void perf_top__mmap_read(struct perf_top *top)
 			    "decreasing the freq (-F) or\n"
 			    "limiting the number of CPUs (-C)\n");
 }
+#endif
 
 /*
  * Check per-event overwrite term.
@@ -1063,6 +1067,59 @@ static int callchain_param__setup_sample_type(struct callchain_param *callchain)
 	return 0;
 }
 
+struct mmap_thread_arg {
+	struct perf_top *top;
+	int index;
+};
+
+static void *mmap_thread_worker(void *arg)
+{
+	struct mmap_thread_arg *ap = arg;
+
+	while (!done)
+		perf_top__mmap_read_idx(ap->top, ap->index);
+
+	return NULL;
+}
+
+static pthread_t *mmap_threads;
+
+static int blitzkreig_bop(struct perf_top *top)
+{
+	struct perf_evlist *evlist = top->evlist;
+	int i, nr_threads = evlist->nr_mmaps;
+	struct mmap_thread_arg *ap;
+
+	fprintf(stderr, "Creating %d mmap threads\n", nr_threads);
+	fflush(stderr);
+
+	ap = calloc(sizeof(*ap), nr_threads);
+	if (!ap)
+		return -ENOMEM;
+
+	mmap_threads = calloc(sizeof(pthread_t), nr_threads);
+	if (!mmap_threads) {
+		free(ap);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		ap[i].top = top;
+		ap[i].index = i;
+	}
+
+	/* Current thread will handle index zero.  */
+	for (i = 1; i < nr_threads; i++) {
+		int err = pthread_create(&mmap_threads[i], NULL,
+					 mmap_thread_worker, &ap[i]);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int __cmd_top(struct perf_top *top)
 {
 	char msg[512];
@@ -1134,11 +1191,6 @@ static int __cmd_top(struct perf_top *top)
         if (!target__none(&opts->target))
                 perf_evlist__enable(top->evlist);
 
-	/* Wait for a minimal set of events before starting the snapshot */
-	perf_evlist__poll(top->evlist, 100);
-
-	perf_top__mmap_read(top);
-
 	ret = -1;
 	if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
 							    display_thread), top)) {
@@ -1156,13 +1208,30 @@ static int __cmd_top(struct perf_top *top)
 		}
 	}
 
+	ret = blitzkreig_bop(top);
+	if (ret)
+		goto out_join;
+
+#if 1
+	perf_top__mmap_read_idx(top, 0);
+#else
+	/* Wait for a minimal set of events before starting the snapshot */
+	perf_evlist__poll(top->evlist, 100);
+	perf_top__mmap_read(top);
+#endif
+
 	while (!done) {
+#if 0
 		u64 hits = top->samples;
+#endif
 
+#if 1
+		perf_top__mmap_read_idx(top, 0);
+#else
 		perf_top__mmap_read(top);
-
 		if (opts->overwrite || (hits == top->samples))
 			ret = perf_evlist__poll(top->evlist, 100);
+#endif
 
 		if (resize) {
 			perf_top__resize(top);
@@ -1257,7 +1326,7 @@ int cmd_top(int argc, const char **argv)
 				.uses_mmap   = true,
 			},
 			.proc_map_timeout    = 500,
-			.overwrite	= 1,
+			.overwrite	= 0,
 		},
 		.max_stack	     = sysctl__max_stack(),
 		.annotation_opts     = annotation__default_options,

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-05  0:50           ` David Miller
@ 2018-11-05 20:34             ` Jiri Olsa
  2018-11-05 22:52               ` David Miller
  2018-11-06  3:45               ` David Miller
  0 siblings, 2 replies; 40+ messages in thread
From: Jiri Olsa @ 2018-11-05 20:34 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 04, 2018 at 04:50:39PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Sun, 4 Nov 2018 21:18:21 +0100
> 
> > do you have some code I could check on?
> 
> All I have is this patch which parallelizes the mmap readers in perf
> top.

I put something together.. still testing, but it's already
showing 0 lost events when for 'overwrite = 0' case even
for high load.. the old code shows ~1500 for same workload

I'm printing lost event counts in stdio output header:

  # perf top -d 1 --stdio
     PerfTop:    5734 irqs/sec  kernel:57.3%  exact: 100.0% lost: 0 [4000Hz cycles:ppp],  (all, 8 CPUs)

will probably add it to tui as well, seems useful

I pushed it in perf/fixes branch in:
  git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git

it's still work in progress.. not too many complete changelogs ;-)

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-05 20:34             ` Jiri Olsa
@ 2018-11-05 22:52               ` David Miller
  2018-11-06  3:45               ` David Miller
  1 sibling, 0 replies; 40+ messages in thread
From: David Miller @ 2018-11-05 22:52 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 5 Nov 2018 21:34:47 +0100

> I put something together.. still testing, but it's already
> showing 0 lost events for the 'overwrite = 0' case even
> for high load.. the old code shows ~1500 for same workload
> 
> I'm printing lost event counts in stdio output header:
> 
>   # perf top -d 1 --stdio
>      PerfTop:    5734 irqs/sec  kernel:57.3%  exact: 100.0% lost: 0 [4000Hz cycles:ppp],  (all, 8 CPUs)
> 
> will probably add it to tui as well, seems useful
> 
> I pushed it in perf/fixes branch in:
>   git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> 
> it's still work in progress.. not too many complete changelogs ;-)

Looks interesting!

I'll play with this on sparc64 later, thanks!

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-05 20:34             ` Jiri Olsa
  2018-11-05 22:52               ` David Miller
@ 2018-11-06  3:45               ` David Miller
  2018-11-06  4:03                 ` David Miller
  2018-11-06 11:51                 ` Jiri Olsa
  1 sibling, 2 replies; 40+ messages in thread
From: David Miller @ 2018-11-06  3:45 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 5 Nov 2018 21:34:47 +0100

> I pushed it in perf/fixes branch in:
>   git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git

Build fix for sparc below, I'm playing with this now.

perf: Use PRIu64 for printing top lost events count.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index db3413e..2c8eac8 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -46,7 +46,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 							samples_per_sec;
 		ret = SNPRINTF(bf, size,
 			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
-			       "  exact: %4.1f%% lost: %lu [", samples_per_sec,
+			       "  exact: %4.1f%% lost: %" PRIu64 " [", samples_per_sec,
 			       ksamples_percent, esamples_percent, top->lost);
 	} else {
 		float us_samples_per_sec = top->us_samples / top->delay_secs;

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06  3:45               ` David Miller
@ 2018-11-06  4:03                 ` David Miller
  2018-11-06  4:53                   ` David Miller
  2018-11-06 11:51                 ` Jiri Olsa
  1 sibling, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-06  4:03 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: David Miller <davem@davemloft.net>
Date: Mon, 05 Nov 2018 19:45:42 -0800 (PST)

> Build fix for sparc below, I'm playing with this now.

I get various assertion failures and crashes during make -j128 kernel
builds on my sparc64 box:

====================
perf: Segmentation fault
-------- backtrace --------
/lib/sparc-linux-gnu/libc.so.6(+0x38918)[0xf7264918]
./perf(thread__set_namespaces+0x30)[0x10b2f0]
./perf(machine__process_namespaces_event+0x134)[0xfad54]
./perf[0x517b8]
./perf(queued_events__flush+0xc8)[0x109288]
./perf[0x5133c]
/lib/sparc-linux-gnu/libpthread.so.0(+0x6714)[0xf7c7e714]
/lib/sparc-linux-gnu/libc.so.6(+0xf70b4)[0xf73230b4]
/lib/sparc-linux-gnu/libpthread.so.0(+0x6a84)[0xf7c7ea84]
davem@patience:~/src/GIT/sparc/tools/perf$ 
====================

and:

====================
perf: /home/davem/src/GIT/sparc/tools/include/linux/refcount.h:131: refcount_sub_and_test: Assertion `!(new > val)' failed.
====================

I don't have a lot of time to try and debug this, but let me know if you
want any fixes tested.

I did look at the changes and I like what you're trying to do.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06  4:03                 ` David Miller
@ 2018-11-06  4:53                   ` David Miller
  2018-11-06 11:54                     ` Jiri Olsa
  2018-11-06 20:42                     ` Jiri Olsa
  0 siblings, 2 replies; 40+ messages in thread
From: David Miller @ 2018-11-06  4:53 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa


Jiri,

Because you now run queued_events__queue() lockless with that condvar
trick, it is possible for top->qe.in to be seen as one past the data[]
array, this is because the rotate_queues() code goes:

	if (++top->qe.in > &top->qe.data[1])
		top->qe.in = &top->qe.data[0];

So for a brief moment top->qe.in is out of range and thus
perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]

We can just do:

	if (top->qe.in == &top->qe.data[1])
		top->qe.in = &top->qe.data[0];
	else
		top->qe.in = &top->qe.data[1];

Or, make top->qe.in an index, and simply go:

	top->qe.in ^= 1;

Either way will fix the bug.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06  3:45               ` David Miller
  2018-11-06  4:03                 ` David Miller
@ 2018-11-06 11:51                 ` Jiri Olsa
  1 sibling, 0 replies; 40+ messages in thread
From: Jiri Olsa @ 2018-11-06 11:51 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Mon, Nov 05, 2018 at 07:45:42PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Mon, 5 Nov 2018 21:34:47 +0100
> 
> > I pushed it in perf/fixes branch in:
> >   git://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> 
> Build fix for sparc below, I'm playing with this now.
> 
> perf: Use PRIu64 for printing top lost events count.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

will add this one, thanks

jirka

> 
> diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
> index db3413e..2c8eac8 100644
> --- a/tools/perf/util/top.c
> +++ b/tools/perf/util/top.c
> @@ -46,7 +46,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
>  							samples_per_sec;
>  		ret = SNPRINTF(bf, size,
>  			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
> -			       "  exact: %4.1f%% lost: %lu [", samples_per_sec,
> +			       "  exact: %4.1f%% lost: %" PRIu64 " [", samples_per_sec,
>  			       ksamples_percent, esamples_percent, top->lost);
>  	} else {
>  		float us_samples_per_sec = top->us_samples / top->delay_secs;

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06  4:53                   ` David Miller
@ 2018-11-06 11:54                     ` Jiri Olsa
  2018-11-19  5:26                       ` Namhyung Kim
  2018-11-06 20:42                     ` Jiri Olsa
  1 sibling, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-06 11:54 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Mon, Nov 05, 2018 at 08:53:42PM -0800, David Miller wrote:
> 
> Jiri,
> 
> Because you now run queued_events__queue() lockless with that condvar
> trick, it is possible for top->qe.in to be seen as one past the data[]
> array, this is because the rotate_queues() code goes:
> 
> 	if (++top->qe.in > &top->qe.data[1])
> 		top->qe.in = &top->qe.data[0];
> 
> So for a brief moment top->qe.in is out of range and thus
> perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]
> 
> We can just do:
> 
> 	if (top->qe.in == &top->qe.data[1])
> 		top->qe.in = &top->qe.data[0];
> 	else
> 		top->qe.in = &top->qe.data[1];
> 
> Or, make top->qe.in an index, and simply go:
> 
> 	top->qe.in ^= 1;
> 
> Either way will fix the bug.

ah right.. I had originally a full mutex around that,
then I switched it off in the last patch and did
not realize this implication.. nice ;-)

does it fix the crash you reported earlier?

thanks,
jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06  4:53                   ` David Miller
  2018-11-06 11:54                     ` Jiri Olsa
@ 2018-11-06 20:42                     ` Jiri Olsa
  2018-11-07  6:13                       ` David Miller
  1 sibling, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-06 20:42 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Mon, Nov 05, 2018 at 08:53:42PM -0800, David Miller wrote:
> 
> Jiri,
> 
> Because you now run queued_events__queue() lockless with that condvar
> trick, it is possible for top->qe.in to be seen as one past the data[]
> array, this is because the rotate_queues() code goes:
> 
> 	if (++top->qe.in > &top->qe.data[1])
> 		top->qe.in = &top->qe.data[0];
> 
> So for a brief moment top->qe.in is out of range and thus
> perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]
> 
> We can just do:
> 
> 	if (top->qe.in == &top->qe.data[1])
> 		top->qe.in = &top->qe.data[0];
> 	else
> 		top->qe.in = &top->qe.data[1];
> 
> Or, make top->qe.in an index, and simply go:
> 
> 	top->qe.in ^= 1;
> 
> Either way will fix the bug.

I pushed that fix in perf/fixes branch, but I'm still occasionaly
hitting the namespace crash.. working on it ;-)

[root@krava perf]# ./perf top -d 1 -g
perf: Segmentation fault
-------- backtrace --------
./perf[0x5956aa]
/lib64/libc.so.6(+0x36f2f)[0x7f99d05d3f2f]
/lib64/libc.so.6(+0x15ce60)[0x7f99d06f9e60]
./perf(namespaces__new+0x5a)[0x4fdeaa]
./perf(thread__set_namespaces+0x55)[0x4ff335]
./perf(machine__process_namespaces_event+0xa5)[0x4f1c95]
./perf[0x44c78b]
./perf(queued_events__flush+0xcd)[0x4fd46d]
./perf[0x44c25b]
/lib64/libpthread.so.0(+0x7593)[0x7f99d2928593]
/lib64/libc.so.6(clone+0x3e)[0x7f99d0696e6e]

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06 20:42                     ` Jiri Olsa
@ 2018-11-07  6:13                       ` David Miller
  2018-11-07  8:32                         ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-07  6:13 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 6 Nov 2018 21:42:55 +0100

> I pushed that fix in perf/fixes branch, but I'm still occasionaly
> hitting the namespace crash.. working on it ;-)

Jiri, how can this new scheme work without setting copy_on_queue
for the queued_events we use here?

I don't see copy_on_queue being set and that means the queued event
structures reference the event memory directly in the mmaps, after the
mmap thread has released them back to the queue.

That means new events can come in to the mmap ring and overwrite what
was there previously, maybe even while deliver_event() is in the
middle of parsing the event.

Setting copy_on_queue for data[0] and data[1] makes all of the crashes
go away for me.

I get a lot of "[unknown]" shared objects shortly after perf top
starts up during a full workload.  I've been wondering about one
side effect of how the mmap queues are processed, consider the
following:

	cpu 0			cpu 1

				exec
				create new mmap2 events
				scheduled to cpu 0 for whatever reason
	sample 1
	sample 2

And let's say that perf top is backlogged processing the mmap ring of
events generated for cpu 0, and sees sample 1 and sample 2 before
getting to any of cpu 1's events.

This means the thread and map and symbol objects won't exist and
we'll get those '[Unknown]' histogram entries, and they won't go
away.

When it finally stops looping over the mmap ring for cpu 0's events
it gets to cpu 1's mmap ring and sees the exec and mmap2 events
but at that point it's far too late.

I surmise from what I see with perf top right now that this happens
a lot.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07  6:13                       ` David Miller
@ 2018-11-07  8:32                         ` Jiri Olsa
  2018-11-07 19:43                           ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-07  8:32 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Tue, Nov 06, 2018 at 10:13:49PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Tue, 6 Nov 2018 21:42:55 +0100
> 
> > I pushed that fix in perf/fixes branch, but I'm still occasionaly
> > hitting the namespace crash.. working on it ;-)
> 
> Jiri, how can this new scheme work without setting copy_on_queue
> for the queued_events we use here?

aahh.. it won't, setting it up ;-)

> 
> I don't see copy_on_queue being set and that means the queued event
> structures reference the event memory directly in the mmaps, after the
> mmap thread has released them back to the queue.
> 
> That means new events can come in to the mmap ring and overwrite what
> was there previously, maybe even while deliver_event() is in the
> middle of parsing the event.
> 
> Setting copy_on_queue for data[0] and data[1] makes all of the crashes
> go away for me.
> 
> I get a lot of "[unknown]" shared objects shortly after perf top
> starts up during a full workload.  I've been wondering about one
> side effect of how the mmap queues are processed, consider the
> following:
> 
> 	cpu 0			cpu 1
> 
> 				exec
> 				create new mmap2 events
> 				scheduled to cpu 0 for whatever reason
> 	sample 1
> 	sample 2
> 
> And let's say that perf top is backlogged processing the mmap ring of
> events generated for cpu 0, and sees sample 1 and sample 2 before
> getting to any of cpu 1's events.
> 
> This means the thread and map and symbol objects won't exist and
> we'll get those '[Unknown]' histogram entries, and they won't go
> away.
> 
> When it finally stops looping over the mmap ring for cpu 0's events
> it gets to cpu 1's mmap ring and sees the exec and mmap2 events
> but at that point it's far too late.
> 
> I surmise from what I see with perf top right now that this happens
> a lot.

right, there's no reason why top should have different standards than
record/report.. above can definitely happen, I'll enable time sample
type and use ordered events for the queue

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07  8:32                         ` Jiri Olsa
@ 2018-11-07 19:43                           ` Jiri Olsa
  2018-11-07 20:01                             ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-07 19:43 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Wed, Nov 07, 2018 at 09:32:17AM +0100, Jiri Olsa wrote:
> On Tue, Nov 06, 2018 at 10:13:49PM -0800, David Miller wrote:
> > From: Jiri Olsa <jolsa@redhat.com>
> > Date: Tue, 6 Nov 2018 21:42:55 +0100
> > 
> > > I pushed that fix in perf/fixes branch, but I'm still occasionaly
> > > hitting the namespace crash.. working on it ;-)
> > 
> > Jiri, how can this new scheme work without setting copy_on_queue
> > for the queued_events we use here?
> 
> aahh.. it won't, setting it up ;-)
> 
> > 
> > I don't see copy_on_queue being set and that means the queued event
> > structures reference the event memory directly in the mmaps, after the
> > mmap thread has released them back to the queue.
> > 
> > That means new events can come in to the mmap ring and overwrite what
> > was there previously, maybe even while deliver_event() is in the
> > middle of parsing the event.
> > 
> > Setting copy_on_queue for data[0] and data[1] makes all of the crashes
> > go away for me.
> > 
> > I get a lot of "[unknown]" shared objects shortly after perf top
> > starts up during a full workload.  I've been wondering about one
> > side effect of how the mmap queues are processed, consider the
> > following:
> > 
> > 	cpu 0			cpu 1
> > 
> > 				exec
> > 				create new mmap2 events
> > 				scheduled to cpu 0 for whatever reason
> > 	sample 1
> > 	sample 2
> > 
> > And let's say that perf top is backlogged processing the mmap ring of
> > events generated for cpu 0, and sees sample 1 and sample 2 before
> > getting to any of cpu 1's events.
> > 
> > This means the thread and map and symbol objects won't exist and
> > we'll get those '[Unknown]' histogram entries, and they won't go
> > away.
> > 
> > When it finally stops looping over the mmap ring for cpu 0's events
> > it gets to cpu 1's mmap ring and sees the exec and mmap2 events
> > but at that point it's far too late.
> > 
> > I surmise from what I see with perf top right now that this happens
> > a lot.
> 
> right, there's no reason why top should have different standards than
> record/report.. above can definitely happen, I'll enable time sample
> type and use ordered events for the queue

I pushed new version in my perf/fixes branch

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07 19:43                           ` Jiri Olsa
@ 2018-11-07 20:01                             ` David Miller
  2018-11-07 20:28                               ` Arnaldo Carvalho de Melo
  2018-11-08  7:13                               ` Jiri Olsa
  0 siblings, 2 replies; 40+ messages in thread
From: David Miller @ 2018-11-07 20:01 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 7 Nov 2018 20:43:44 +0100

> I pushed new version in my perf/fixes branch

Thanks, I'll check it out later today for sure!  This is pretty exciting
work.

Just some random thoughts as I've been thinking about this whole
situation a lot lately:

Something to consider might be consolidating all of the event rings
into one.  This would force perf to process all events in "system
order", ie. what order they actually occurred in the machine.

Yes, this means more contention for the entities inside the kernel
queueing up the events, however the benefits are enormous.

Right now we go forwards and backwards in time as we move from one
event ring to another, as you know.

However, we have to reconcile with the need we have to separate "high
priority" (ie. cannot really lose) events like fork, mmap2, etc.  with
"low priority" ones such as IP samples.

Perhaps another way to think about this is to go to the one huge mmap
ring model, and do the prioritization internally in perf.

Actually, this opens up tons of possibilities in my mind.

Perf can queue to an internal high priority queue for fork and mmap2
events, and never drop them.  Whilst at the same time queueing low
priority events like IP samples into a low priority queue and dropping
with whatever policy it wants when overloaded (f.e. drop older events
before newer ones).

I do not like the overwrite ring buffer mode that was implemented
because it enforces an entire set of policy decisions upon the user.
Either the model works for you (which it currently does not for perf)
or you can't use it at all.

If the issue is that newer events are more interesting than old ones,
that is entirely perf's business.  And it can implement this policy
%100 internally to itself.  No kernel changes were ever needed to do
this, as explained above.

Please, let's abandon this whole overwrite mode of the ring buffer.
The old one works perfectly fine, we just have to use it properly.
We should never have to shut off kernel side event queueing just
because we are processing the event ring on the user side.

Thanks.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07 20:01                             ` David Miller
@ 2018-11-07 20:28                               ` Arnaldo Carvalho de Melo
  2018-11-08  6:04                                 ` David Miller
  2018-11-08  7:13                               ` Jiri Olsa
  1 sibling, 1 reply; 40+ messages in thread
From: Arnaldo Carvalho de Melo @ 2018-11-07 20:28 UTC (permalink / raw)
  To: David Miller; +Cc: jolsa, linux-kernel, namhyung, jolsa

Em Wed, Nov 07, 2018 at 12:01:54PM -0800, David Miller escreveu:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Wed, 7 Nov 2018 20:43:44 +0100
> 
> > I pushed new version in my perf/fixes branch
> 
> Thanks, I'll check it out later today for sure!  This is pretty exciting
> work.
> 
> Just some random thoughts as I've been thinking about this whole
> situation a lot lately:
> 
> Something to consider might be consolidating all of the event rings
> into one.  This would force perf to process all events in "system
> order", ie. what order they actually occurred in the machine.

Processing in "system order" is what we want, yeah.
 
> Yes, this means more contention for the entities inside the kernel
> queueing up the events, however the benefits are enormous.
> 
> Right now we go forwards and backwards in time as we move from one
> event ring to another, as you know.
> 
> However, we have to reconcile with the need we have to separate "high
> priority" (ie. cannot really lose) events like fork, mmap2, etc.  with
> "low priority" ones such as IP samples.

So perhaps we should tell the kernel that is ok to lose SAMPLEs but not
the other events, and make userspace ask for PERF_RECORD_!SAMPLE in all
ring buffers? Duplication wouldn't be that much of a problem?
 
> Perhaps another way to think about this is to go to the one huge mmap
> ring model, and do the prioritization internally in perf.
> 
> Actually, this opens up tons of possibilities in my mind.
> 
> Perf can queue to an internal high priority queue for fork and mmap2
> events, and never drop them.  Whilst at the same time queueing low

Right, or put that in all queues? Would that be too costly?

> priority events like IP samples into a low priority queue and dropping
> with whatever policy it wants when overloaded (f.e. drop older events
> before newer ones).
> 
> I do not like the overwrite ring buffer mode that was implemented
> because it enforces an entire set of policy decisions upon the user.
> Either the model works for you (which it currently does not for perf)
> or you can't use it at all.
> 
> If the issue is that newer events are more interesting than old ones,
> that is entirely perf's business.  And it can implement this policy
> %100 internally to itself.  No kernel changes were ever needed to do
> this, as explained above.
> 
> Please, let's abandon this whole overwrite mode of the ring buffer.
> The old one works perfectly fine, we just have to use it properly.
> We should never have to shut off kernel side event queueing just
> because we are processing the event ring on the user side.
> 
> Thanks.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07 20:28                               ` Arnaldo Carvalho de Melo
@ 2018-11-08  6:04                                 ` David Miller
  0 siblings, 0 replies; 40+ messages in thread
From: David Miller @ 2018-11-08  6:04 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, namhyung, jolsa

From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Wed, 7 Nov 2018 17:28:15 -0300

> So perhaps we should tell the kernel that is ok to lose SAMPLEs but not
> the other events, and make userspace ask for PERF_RECORD_!SAMPLE in all
> ring buffers? Duplication wouldn't be that much of a problem?

I think we should strive to keep the policy in userspace.

The kernel simply provides the events that happen, and the user's
job is to take the events in sort of a "high priority interrupt"
context and push the slow path processing to a separate thread of
execution where drop policy can be implemented.

Jiri's work provides a framework for exactly that.

So what we can have is:

	cpu1  cpu2  cpu3  cpu4  cpu5  cpu6 ... cpuN
	|     |     |     |     |     |        |
	----------------------------------------
	                  |
			  |
			  | single event ring buffer
			  |
			  |
		ultra-fast perf event dequeue
	     queues in-order to event processing
		          |
		  event processing slow path
	        prioritization and drop policy
		     histogram insert
		        etc.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-07 20:01                             ` David Miller
  2018-11-07 20:28                               ` Arnaldo Carvalho de Melo
@ 2018-11-08  7:13                               ` Jiri Olsa
  2018-11-09  1:07                                 ` David Miller
  1 sibling, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-08  7:13 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Wed, Nov 07, 2018 at 12:01:54PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Wed, 7 Nov 2018 20:43:44 +0100
> 
> > I pushed new version in my perf/fixes branch
> 
> Thanks, I'll check it out later today for sure!  This is pretty exciting
> work.
> 
> Just some random thoughts as I've been thinking about this whole
> situation a lot lately:
> 
> Something to consider might be consolidating all of the event rings
> into one.  This would force perf to process all events in "system
> order", ie. what order they actually occurred in the machine.
> 
> Yes, this means more contention for the entities inside the kernel
> queueing up the events, however the benefits are enormous.

yes, perf's ring buffer is real fast because it's per-cpu

> 
> Right now we go forwards and backwards in time as we move from one
> event ring to another, as you know.
> 
> However, we have to reconcile with the need we have to separate "high
> priority" (ie. cannot really lose) events like fork, mmap2, etc.  with
> "low priority" ones such as IP samples.
> 
> Perhaps another way to think about this is to go to the one huge mmap
> ring model, and do the prioritization internally in perf.
> 
> Actually, this opens up tons of possibilities in my mind.
> 
> Perf can queue to an internal high priority queue for fork and mmap2
> events, and never drop them.  Whilst at the same time queueing low
> priority events like IP samples into a low priority queue and dropping
> with whatever policy it wants when overloaded (f.e. drop older events
> before newer ones).

I think I can see the processing thread overloaded with data in tests,
I'll add some counters for it so we can see how far behind it gets

we could separate fork/mmaps to a separate dummy event map, or just
parse them out in the read thread and create special queue for them
and drop just samples in case we are behind

jirka

> 
> I do not like the overwrite ring buffer mode that was implemented
> because it enforces an entire set of policy decisions upon the user.
> Either the model works for you (which it currently does not for perf)
> or you can't use it at all.
> 
> If the issue is that newer events are more interesting than old ones,
> that is entirely perf's business.  And it can implement this policy
> %100 internally to itself.  No kernel changes were ever needed to do
> this, as explained above.
> 
> Please, let's abandon this whole overwrite mode of the ring buffer.
> The old one works perfectly fine, we just have to use it properly.
> We should never have to shut off kernel side event queueing just
> because we are processing the event ring on the user side.
> 
> Thanks.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-08  7:13                               ` Jiri Olsa
@ 2018-11-09  1:07                                 ` David Miller
  2018-11-11 19:41                                   ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-09  1:07 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 8 Nov 2018 08:13:03 +0100

> we could separate fork/mmaps to a separate dummy event map, or just
> parse them out in the read thread and create special queue for them
> and drop just samples in case we are behind

What you say at the end here is basically what I am proposing.

Perf dequeues events from mmap ring as fast as possible.

Perf has two internal queues, high priority and low priority.

High priority events are never dropped.

Low priority events are dropped on overload, oldest first.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-09  1:07                                 ` David Miller
@ 2018-11-11 19:41                                   ` Jiri Olsa
  2018-11-11 19:41                                     ` Jiri Olsa
                                                       ` (2 more replies)
  0 siblings, 3 replies; 40+ messages in thread
From: Jiri Olsa @ 2018-11-11 19:41 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Thu, 8 Nov 2018 08:13:03 +0100
> 
> > we could separate fork/mmaps to a separate dummy event map, or just
> > parse them out in the read thread and create special queue for them
> > and drop just samples in case we are behind
> 
> What you say at the end here is basically what I am proposing.
> 
> Perf dequeues events from mmap ring as fast as possible.
> 
> Perf has two internal queues, high priority and low priority.
> 
> High priority events are never dropped.
> 
> Low priority events are dropped on overload, oldest first.

I added the dropping logic, it's simple so far..

the processing thread will drop (not process) samples
which gets behind the latest event time more than
the 'refresh rate' seconds

adding separate queues for samples and other events is
possible, but it looks like the above simple solution
could be ok for now

thanks,
jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 19:41                                   ` Jiri Olsa
@ 2018-11-11 19:41                                     ` Jiri Olsa
  2018-11-11 22:32                                     ` David Miller
  2018-11-11 23:08                                     ` David Miller
  2 siblings, 0 replies; 40+ messages in thread
From: Jiri Olsa @ 2018-11-11 19:41 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 11, 2018 at 08:41:32PM +0100, Jiri Olsa wrote:
> On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
> > From: Jiri Olsa <jolsa@redhat.com>
> > Date: Thu, 8 Nov 2018 08:13:03 +0100
> > 
> > > we could separate fork/mmaps to a separate dummy event map, or just
> > > parse them out in the read thread and create special queue for them
> > > and drop just samples in case we are behind
> > 
> > What you say at the end here is basically what I am proposing.
> > 
> > Perf dequeues events from mmap ring as fast as possible.
> > 
> > Perf has two internal queues, high priority and low priority.
> > 
> > High priority events are never dropped.
> > 
> > Low priority events are dropped on overload, oldest first.
> 
> I added the dropping logic, it's simple so far..
> 
> the processing thread will drop (not process) samples
> which gets behind the latest event time more than
> the 'refresh rate' seconds
> 
> adding separate queues for samples and other events is
> possible, but it looks like the above simple solution
> could be ok for now

I pushed it to my perf/fixes branch

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 19:41                                   ` Jiri Olsa
  2018-11-11 19:41                                     ` Jiri Olsa
@ 2018-11-11 22:32                                     ` David Miller
  2018-11-11 22:43                                       ` Jiri Olsa
  2018-11-11 23:08                                     ` David Miller
  2 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-11 22:32 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Sun, 11 Nov 2018 20:41:32 +0100

> I added the dropping logic, it's simple so far..

How do you maintain your perf/fixes branch?  Do you rebase? :-/

I just pulled after a previous pull and got nothing but conflicts on
every single file.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 22:32                                     ` David Miller
@ 2018-11-11 22:43                                       ` Jiri Olsa
  2018-11-11 22:58                                         ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-11 22:43 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 11, 2018 at 02:32:08PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Sun, 11 Nov 2018 20:41:32 +0100
> 
> > I added the dropping logic, it's simple so far..
> 
> How do you maintain your perf/fixes branch?  Do you rebase? :-/
> 
> I just pulled after a previous pull and got nothing but conflicts on
> every single file.

yep, I rebase.. it's my devel branch, I dont keep it sane, sry ;-)

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 22:43                                       ` Jiri Olsa
@ 2018-11-11 22:58                                         ` David Miller
  0 siblings, 0 replies; 40+ messages in thread
From: David Miller @ 2018-11-11 22:58 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Sun, 11 Nov 2018 23:43:36 +0100

> On Sun, Nov 11, 2018 at 02:32:08PM -0800, David Miller wrote:
>> From: Jiri Olsa <jolsa@redhat.com>
>> Date: Sun, 11 Nov 2018 20:41:32 +0100
>> 
>> > I added the dropping logic, it's simple so far..
>> 
>> How do you maintain your perf/fixes branch?  Do you rebase? :-/
>> 
>> I just pulled after a previous pull and got nothing but conflicts on
>> every single file.
> 
> yep, I rebase.. it's my devel branch, I dont keep it sane, sry ;-)

Ok, a lot of warnings on sparc, this is the patch I'm using.

The iterator variable type changes are because of comparisons between
signed and unsigned.

The rlimit members are long long unsigned int.

Etc.

diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
index b6f6fc4..0128ed8 100644
--- a/tools/perf/bench/epoll-ctl.c
+++ b/tools/perf/bench/epoll-ctl.c
@@ -201,7 +201,7 @@ static void *workerfn(void *arg)
 
 static void init_fdmaps(struct worker *w, int pct)
 {
-	ssize_t i;
+	unsigned int i;
 	int inc;
 	struct epoll_event ev;
 
@@ -302,7 +302,7 @@ int bench_epoll_ctl(int argc, const char **argv)
 	struct worker *worker = NULL;
 	struct cpu_map *cpu;
 	struct rlimit rl, prevrl;
-	ssize_t i;
+	unsigned int i;
 
 	argc = parse_options(argc, argv, options, bench_epoll_ctl_usage, 0);
 	if (argc) {
@@ -340,7 +340,7 @@ int bench_epoll_ctl(int argc, const char **argv)
 	if (getrlimit(RLIMIT_NOFILE, &prevrl))
 	    err(EXIT_FAILURE, "getrlimit");
 	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
-	printinfo("Setting RLIMIT_NOFILE rlimit from %lu to: %lu\n",
+	printinfo("Setting RLIMIT_NOFILE rlimit from %llu to: %llu\n",
 		  prevrl.rlim_max, rl.rlim_max);
 	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
 		err(EXIT_FAILURE, "setrlimit");
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 4e4efc5..d5d17e2 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -395,7 +395,7 @@ static void *writerfn(void *p)
 		nanosleep(&ts, NULL);
 	}
 
-	printinfo("exiting writer-thread (total full-loops: %ld)\n", iter);
+	printinfo("exiting writer-thread (total full-loops: %ld)\n", (long int) iter);
 	return NULL;
 }
 
@@ -459,7 +459,7 @@ int bench_epoll_wait(int argc, const char **argv)
 	if (getrlimit(RLIMIT_NOFILE, &prevrl))
 		err(EXIT_FAILURE, "getrlimit");
 	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
-	printinfo("Setting RLIMIT_NOFILE rlimit from %lu to: %lu\n", prevrl.rlim_max, rl.rlim_max);
+	printinfo("Setting RLIMIT_NOFILE rlimit from %llu to: %llu\n", prevrl.rlim_max, rl.rlim_max);
 	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
 		err(EXIT_FAILURE, "setrlimit");
 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 19:41                                   ` Jiri Olsa
  2018-11-11 19:41                                     ` Jiri Olsa
  2018-11-11 22:32                                     ` David Miller
@ 2018-11-11 23:08                                     ` David Miller
  2018-11-11 23:26                                       ` Jiri Olsa
  2 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-11 23:08 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Sun, 11 Nov 2018 20:41:32 +0100

> On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
>> From: Jiri Olsa <jolsa@redhat.com>
>> Date: Thu, 8 Nov 2018 08:13:03 +0100
>> 
>> > we could separated fork/mmaps to separate dummy event map, or just
>> > parse them out in the read thread and create special queue for them
>> > and drop just samples in case we are behind
>> 
>> What you say at the end here is basically what I am proposing.
>> 
>> Perf dequeues events from mmap ring as fast as possible.
>> 
>> Perf has two internal queues, high priority and low priority.
>> 
>> High priority events are never dropped.
>> 
>> Low priority events are dropped on overload, oldest first.
> 
> I added the dropping logic, it's simple so far..

So for me perf top gets into a state where the samples counter stops
incrementing, but the event counter does keep moving (which is the
histogram code decaying histogram entries from the display thread).

Which means the event processing has basically stopped.

The event threads are not stuck in a loop, because they respond to
the "q" keypress and we can exit.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 23:08                                     ` David Miller
@ 2018-11-11 23:26                                       ` Jiri Olsa
  2018-11-11 23:32                                         ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-11 23:26 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 11, 2018 at 03:08:01PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Sun, 11 Nov 2018 20:41:32 +0100
> 
> > On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
> >> From: Jiri Olsa <jolsa@redhat.com>
> >> Date: Thu, 8 Nov 2018 08:13:03 +0100
> >> 
> >> > we could separated fork/mmaps to separate dummy event map, or just
> >> > parse them out in the read thread and create special queue for them
> >> > and drop just samples in case we are behind
> >> 
> >> What you say at the end here is basically what I am proposing.
> >> 
> >> Perf dequeues events from mmap ring as fast as possible.
> >> 
> >> Perf has two internal queues, high priority and low priority.
> >> 
> >> High priority events are never dropped.
> >> 
> >> Low priority events are dropped on overload, oldest first.
> > 
> > I added the dropping logic, it's simple so far..
> 
> So for me perf top gets into a state where the samples counter stops
> incrementing, but the event counter does keep moving (which is the
> histogram code decaying histogram entries from the display thread).
> 
> Which means the event processing has basically stopped.
> 
> The event threads are not stuck in a loop, because they respond to
> the "q" keypress and we can exit.

is the drop count showing something?

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 23:26                                       ` Jiri Olsa
@ 2018-11-11 23:32                                         ` David Miller
  2018-11-13 10:40                                           ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-11 23:32 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 12 Nov 2018 00:26:27 +0100

> On Sun, Nov 11, 2018 at 03:08:01PM -0800, David Miller wrote:
>> From: Jiri Olsa <jolsa@redhat.com>
>> Date: Sun, 11 Nov 2018 20:41:32 +0100
>> 
>> > On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
>> >> From: Jiri Olsa <jolsa@redhat.com>
>> >> Date: Thu, 8 Nov 2018 08:13:03 +0100
>> >> 
>> >> > we could separated fork/mmaps to separate dummy event map, or just
>> >> > parse them out in the read thread and create special queue for them
>> >> > and drop just samples in case we are behind
>> >> 
>> >> What you say at the end here is basically what I am proposing.
>> >> 
>> >> Perf dequeues events from mmap ring as fast as possible.
>> >> 
>> >> Perf has two internal queues, high priority and low priority.
>> >> 
>> >> High priority events are never dropped.
>> >> 
>> >> Low priority events are dropped on overload, oldest first.
>> > 
>> > I added the dropping logic, it's simple so far..
>> 
>> So for me perf top gets into a state where the samples counter stops
>> incrementing, but the event counter does keep moving (which is the
>> histogram code decaying histogram entries from the display thread).
>> 
>> Which means the event processing has basically stopped.
>> 
>> The event threads are not stuck in a loop, because they respond to
>> the "q" keypress and we can exit.
> 
> is the drop count showing something?

It does soon after starting up, then it drops to zero.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-11 23:32                                         ` David Miller
@ 2018-11-13 10:40                                           ` Jiri Olsa
  2018-11-19  4:52                                             ` David Miller
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-13 10:40 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 11, 2018 at 03:32:59PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Mon, 12 Nov 2018 00:26:27 +0100
> 
> > On Sun, Nov 11, 2018 at 03:08:01PM -0800, David Miller wrote:
> >> From: Jiri Olsa <jolsa@redhat.com>
> >> Date: Sun, 11 Nov 2018 20:41:32 +0100
> >> 
> >> > On Thu, Nov 08, 2018 at 05:07:21PM -0800, David Miller wrote:
> >> >> From: Jiri Olsa <jolsa@redhat.com>
> >> >> Date: Thu, 8 Nov 2018 08:13:03 +0100
> >> >> 
> >> >> > we could separated fork/mmaps to separate dummy event map, or just
> >> >> > parse them out in the read thread and create special queue for them
> >> >> > and drop just samples in case we are behind
> >> >> 
> >> >> What you say at the end here is basically what I am proposing.
> >> >> 
> >> >> Perf dequeues events from mmap ring as fast as possible.
> >> >> 
> >> >> Perf has two internal queues, high priority and low priority.
> >> >> 
> >> >> High priority events are never dropped.
> >> >> 
> >> >> Low priority events are dropped on overload, oldest first.
> >> > 
> >> > I added the dropping logic, it's simple so far..
> >> 
> >> So for me perf top gets into a state where the samples counter stops
> >> incrementing, but the event counter does keep moving (which is the
> >> histogram code decaying histogram entries from the display thread).
> >> 
> >> Which means the event processing has basically stopped.
> >> 
> >> The event threads are not stuck in a loop, because they respond to
> >> the "q" keypress and we can exit.
> > 
> > is the drop count showing something?
> 
> It does soon after starting up, then it drops to zero.

ok I see it on a ~200 CPU server now.. we actually spawn the
UI message box in the reader thread and wait for the user to
press a key with some timeout.. which is not good ;-)

I removed that and added it to the bottom notification line
instead, and now under heavy load I can see line updates
together with events being lost/dropped

I also changed the lost/drop counts format to:
  lost: current/total

where current is the count within the refresh period
and total is overall counts

I pushed/rebased what I have to perf/fixes branch again

please note I had to change our compile changes, because
they wouldn't compile on x86, but I can't verify on sparc,
so you might see some compile fails again

jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-13 10:40                                           ` Jiri Olsa
@ 2018-11-19  4:52                                             ` David Miller
  2018-11-19  6:28                                               ` Namhyung Kim
  2018-11-19  9:14                                               ` Jiri Olsa
  0 siblings, 2 replies; 40+ messages in thread
From: David Miller @ 2018-11-19  4:52 UTC (permalink / raw)
  To: jolsa; +Cc: acme, linux-kernel, namhyung, jolsa

From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 13 Nov 2018 11:40:54 +0100

> I pushed/rebased what I have to perf/fixes branch again
> 
> please note I had to change our compile changes, because
> they wouldn't compile on x86, but I can't verify on sparc,
> so you might see some compile fails again

I just checked your current perf/fixes branch.

It builds on Sparc ;-)

And it behaves better too.  I do get tons of drops and lost events,
but it seems to keep going even during the hardest load.

Eventually I end up with a lot of unresolvable histogram entries,
so that is something to look into.

I looked at your drop logic and it seems perfect, we avoid dropping
all non-SAMPLE events which is what we want.  So that can't be the
cause of the issues I am seeing.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-06 11:54                     ` Jiri Olsa
@ 2018-11-19  5:26                       ` Namhyung Kim
  2018-11-19  9:12                         ` Jiri Olsa
  0 siblings, 1 reply; 40+ messages in thread
From: Namhyung Kim @ 2018-11-19  5:26 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: David Miller, acme, linux-kernel, jolsa, kernel-team

Hi Jirka

Sorry for late!

On Tue, Nov 06, 2018 at 12:54:36PM +0100, Jiri Olsa wrote:
> On Mon, Nov 05, 2018 at 08:53:42PM -0800, David Miller wrote:
> > 
> > Jiri,
> > 
> > Because you now run queued_events__queue() lockless with that condvar
> > trick, it is possible for top->qe.in to be seen as one past the data[]
> > array, this is because the rotate_queues() code goes:
> > 
> > 	if (++top->qe.in > &top->qe.data[1])
> > 		top->qe.in = &top->qe.data[0];
> > 
> > So for a brief moment top->qe.in is out of range and thus
> > perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]
> > 
> > We can just do:
> > 
> > 	if (top->qe.in == &top->qe.data[1])
> > 		top->qe.in = &top->qe.data[0];
> > 	else
> > 		top->qe.in = &top->qe.data[1];
> > 
> > Or, make top->qe.in an index, and simply go:
> > 
> > 	top->qe.in ^= 1;
> > 
> > Either way will fix the bug.
> 
> ah right.. I had originaly full mutex around that,
> then I switched it off in the last patch and did
> not realize this implication.. nice ;-)

I like the rotate_queues() using cond-variable.  Have you tried to use
the same for hists->lock in hists__get_rotate_entries_in() too?

Eventually it'd be nice to avoid locks when a single thread processes
all the events.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  4:52                                             ` David Miller
@ 2018-11-19  6:28                                               ` Namhyung Kim
  2018-11-19  6:33                                                 ` David Miller
  2018-11-19  9:14                                               ` Jiri Olsa
  1 sibling, 1 reply; 40+ messages in thread
From: Namhyung Kim @ 2018-11-19  6:28 UTC (permalink / raw)
  To: David Miller; +Cc: jolsa, acme, linux-kernel, jolsa, kernel-team

Hello David,

On Sun, Nov 18, 2018 at 08:52:43PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Tue, 13 Nov 2018 11:40:54 +0100
> 
> > I pushed/rebased what I have to perf/fixes branch again
> > 
> > please note I had to change our compile changes, because
> > they wouldn't compile on x86, but I can't verify on sparc,
> > so you might see some compile fails again
> 
> I just checked your current perf/fixes branch.
> 
> It builds on Sparc ;-)
> 
> And it behaves better too.  I do get tons of drops and lost events,
> but it seems to keep going even during the hardest load.
> 
> Eventually I end up with a lot of unresolvable histogram entries,
> so that is something to look into.

Did you record callchains as well?  I'd like to know whether it's
related to the children (cumulative) mode or not.

Thanks,
Namhyung


> 
> I looked at your drop logic and it seems perfect, we avoid dropping
> all non-SAMPLE events which is what we want.  So that can't be the
> cause of the issues I am seeing.
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  6:28                                               ` Namhyung Kim
@ 2018-11-19  6:33                                                 ` David Miller
  2018-11-19  7:16                                                   ` Namhyung Kim
  0 siblings, 1 reply; 40+ messages in thread
From: David Miller @ 2018-11-19  6:33 UTC (permalink / raw)
  To: namhyung; +Cc: jolsa, acme, linux-kernel, jolsa, kernel-team

From: Namhyung Kim <namhyung@kernel.org>
Date: Mon, 19 Nov 2018 15:28:37 +0900

> Hello David,
> 
> On Sun, Nov 18, 2018 at 08:52:43PM -0800, David Miller wrote:
>> From: Jiri Olsa <jolsa@redhat.com>
>> Date: Tue, 13 Nov 2018 11:40:54 +0100
>> 
>> > I pushed/rebased what I have to perf/fixes branch again
>> > 
>> > please note I had to change our compile changes, because
>> > they wouldn't compile on x86, but I can't verify on sparc,
>> > so you might see some compile fails again
>> 
>> I just checked your current perf/fixes branch.
>> 
>> It builds on Sparc ;-)
>> 
>> And it behaves better too.  I do get tons of drops and lost events,
>> but it seems to keep going even during the hardest load.
>> 
>> Eventually I end up with a lot of unresolvable histogram entries,
>> so that is something to look into.
> 
> Did you record callchains as well?  I'd like to know whether it's
> related to the children (cumulative) mode or not.

I did not have callchains on, just plain "./perf top"

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  6:33                                                 ` David Miller
@ 2018-11-19  7:16                                                   ` Namhyung Kim
  0 siblings, 0 replies; 40+ messages in thread
From: Namhyung Kim @ 2018-11-19  7:16 UTC (permalink / raw)
  To: David Miller; +Cc: jolsa, acme, linux-kernel, jolsa, kernel-team

On Sun, Nov 18, 2018 at 10:33:55PM -0800, David Miller wrote:
> From: Namhyung Kim <namhyung@kernel.org>
> Date: Mon, 19 Nov 2018 15:28:37 +0900
> 
> > Hello David,
> > 
> > On Sun, Nov 18, 2018 at 08:52:43PM -0800, David Miller wrote:
> >> From: Jiri Olsa <jolsa@redhat.com>
> >> Date: Tue, 13 Nov 2018 11:40:54 +0100
> >> 
> >> > I pushed/rebased what I have to perf/fixes branch again
> >> > 
> >> > please note I had to change our compile changes, because
> >> > they wouldn't compile on x86, but I can't verify on sparc,
> >> > so you might see some compile fails again
> >> 
> >> I just checked your current perf/fixes branch.
> >> 
> >> It builds on Sparc ;-)
> >> 
> >> And it behaves better too.  I do get tons of drops and lost events,
> >> but it seems to keep going even during the hardest load.
> >> 
> >> Eventually I end up with a lot of unresolvable histogram entries,
> >> so that is something to look into.
> > 
> > Did you record callchains as well?  I'd like to know whether it's
> > related to the children (cumulative) mode or not.
> 
> I did not have callchains on, just plain "./perf top"

OK, I need to think about it more..

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  5:26                       ` Namhyung Kim
@ 2018-11-19  9:12                         ` Jiri Olsa
  2018-11-20  1:13                           ` Namhyung Kim
  0 siblings, 1 reply; 40+ messages in thread
From: Jiri Olsa @ 2018-11-19  9:12 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: David Miller, acme, linux-kernel, jolsa, kernel-team

On Mon, Nov 19, 2018 at 02:26:03PM +0900, Namhyung Kim wrote:
> Hi Jirka
> 
> Sorry for late!
> 
> On Tue, Nov 06, 2018 at 12:54:36PM +0100, Jiri Olsa wrote:
> > On Mon, Nov 05, 2018 at 08:53:42PM -0800, David Miller wrote:
> > > 
> > > Jiri,
> > > 
> > > Because you now run queued_events__queue() lockless with that condvar
> > > trick, it is possible for top->qe.in to be seen as one past the data[]
> > > array, this is because the rotate_queues() code goes:
> > > 
> > > 	if (++top->qe.in > &top->qe.data[1])
> > > 		top->qe.in = &top->qe.data[0];
> > > 
> > > So for a brief moment top->qe.in is out of range and thus
> > > perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]
> > > 
> > > We can just do:
> > > 
> > > 	if (top->qe.in == &top->qe.data[1])
> > > 		top->qe.in = &top->qe.data[0];
> > > 	else
> > > 		top->qe.in = &top->qe.data[1];
> > > 
> > > Or, make top->qe.in an index, and simply go:
> > > 
> > > 	top->qe.in ^= 1;
> > > 
> > > Either way will fix the bug.
> > 
> > ah right.. I had originaly full mutex around that,
> > then I switched it off in the last patch and did
> > not realize this implication.. nice ;-)
> 
> I like the rotate_queues() using cond-variable.  Have you tried to use
> the same for hists->lock in hists__get_rotate_entries_in() too?
> 
> Eventually it'd be nice to avoid locks when a single thread processes
> all the events.

yep, I thought we could use it there as well, but it could
be more tricky because we use hists->lock for that, which
is used on other places as well.. will check

thanks,
jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  4:52                                             ` David Miller
  2018-11-19  6:28                                               ` Namhyung Kim
@ 2018-11-19  9:14                                               ` Jiri Olsa
  1 sibling, 0 replies; 40+ messages in thread
From: Jiri Olsa @ 2018-11-19  9:14 UTC (permalink / raw)
  To: David Miller; +Cc: acme, linux-kernel, namhyung, jolsa

On Sun, Nov 18, 2018 at 08:52:43PM -0800, David Miller wrote:
> From: Jiri Olsa <jolsa@redhat.com>
> Date: Tue, 13 Nov 2018 11:40:54 +0100
> 
> > I pushed/rebased what I have to perf/fixes branch again
> > 
> > please note I had to change our compile changes, because
> > they wouldn't compile on x86, but I can't verify on sparc,
> > so you might see some compile fails again
> 
> I just checked your current perf/fixes branch.
> 
> It builds on Sparc ;-)
> 
> And it behaves better too.  I do get tons of drops and lost events,
> but it seems to keep going even during the hardest load.
> 
> Eventually I end up with a lot of unresolvable histogram entries,
> so that is something to look into.
> 
> I looked at your drop logic and it seems perfect, we avoid dropping
> all non-SAMPLE events which is what we want.  So that can't be the
> cause of the issues I am seeing.
> 

cool, I'll polish the patchset and send it out

thanks,
jirka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH RFC] hist lookups
  2018-11-19  9:12                         ` Jiri Olsa
@ 2018-11-20  1:13                           ` Namhyung Kim
  0 siblings, 0 replies; 40+ messages in thread
From: Namhyung Kim @ 2018-11-20  1:13 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: David Miller, acme, linux-kernel, jolsa, kernel-team

On Mon, Nov 19, 2018 at 10:12:57AM +0100, Jiri Olsa wrote:
> On Mon, Nov 19, 2018 at 02:26:03PM +0900, Namhyung Kim wrote:
> > Hi Jirka
> > 
> > Sorry for late!
> > 
> > On Tue, Nov 06, 2018 at 12:54:36PM +0100, Jiri Olsa wrote:
> > > On Mon, Nov 05, 2018 at 08:53:42PM -0800, David Miller wrote:
> > > > 
> > > > Jiri,
> > > > 
> > > > Because you now run queued_events__queue() lockless with that condvar
> > > > trick, it is possible for top->qe.in to be seen as one past the data[]
> > > > array, this is because the rotate_queues() code goes:
> > > > 
> > > > 	if (++top->qe.in > &top->qe.data[1])
> > > > 		top->qe.in = &top->qe.data[0];
> > > > 
> > > > So for a brief moment top->qe.in is out of range and thus
> > > > perf_top__mmap_read_idx() can try to enqueue to top->qe.data[2]
> > > > 
> > > > We can just do:
> > > > 
> > > > 	if (top->qe.in == &top->qe.data[1])
> > > > 		top->qe.in = &top->qe.data[0];
> > > > 	else
> > > > 		top->qe.in = &top->qe.data[1];
> > > > 
> > > > Or, make top->qe.in an index, and simply go:
> > > > 
> > > > 	top->qe.in ^= 1;
> > > > 
> > > > Either way will fix the bug.
> > > 
> > > ah right.. I had originaly full mutex around that,
> > > then I switched it off in the last patch and did
> > > not realize this implication.. nice ;-)
> > 
> > I like the rotate_queues() using cond-variable.  Have you tried to use
> > the same for hists->lock in hists__get_rotate_entries_in() too?
> > 
> > Eventually it'd be nice to avoid locks when a single thread processes
> > all the events.
> 
> yep, I thought we could use it there as well, but it could
> be more tricky because we use hists->lock for that, which
> is used on other places as well.. will check

I found hists->lock used in only 3 places - hists__get_rotate_entries_in()
and perf_event__process_sample() to protect rotating.  The other place
is perf_top__record_precise_ip() which is called during the sample
processing before going to sleep.

Hmm.. I think processing thread should not go to sleep anyway.  It's
the responsibility of display thread show warning (and waiting for
user input or timeout).  I'll try to do something..

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2018-11-20  1:13 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-31  5:03 [PATCH RFC] hist lookups David Miller
2018-10-31 12:43 ` Arnaldo Carvalho de Melo
2018-10-31 15:39   ` Jiri Olsa
2018-10-31 16:08     ` David Miller
2018-11-03  6:30       ` David Miller
2018-11-04 20:18         ` Jiri Olsa
2018-11-05  0:50           ` David Miller
2018-11-05 20:34             ` Jiri Olsa
2018-11-05 22:52               ` David Miller
2018-11-06  3:45               ` David Miller
2018-11-06  4:03                 ` David Miller
2018-11-06  4:53                   ` David Miller
2018-11-06 11:54                     ` Jiri Olsa
2018-11-19  5:26                       ` Namhyung Kim
2018-11-19  9:12                         ` Jiri Olsa
2018-11-20  1:13                           ` Namhyung Kim
2018-11-06 20:42                     ` Jiri Olsa
2018-11-07  6:13                       ` David Miller
2018-11-07  8:32                         ` Jiri Olsa
2018-11-07 19:43                           ` Jiri Olsa
2018-11-07 20:01                             ` David Miller
2018-11-07 20:28                               ` Arnaldo Carvalho de Melo
2018-11-08  6:04                                 ` David Miller
2018-11-08  7:13                               ` Jiri Olsa
2018-11-09  1:07                                 ` David Miller
2018-11-11 19:41                                   ` Jiri Olsa
2018-11-11 19:41                                     ` Jiri Olsa
2018-11-11 22:32                                     ` David Miller
2018-11-11 22:43                                       ` Jiri Olsa
2018-11-11 22:58                                         ` David Miller
2018-11-11 23:08                                     ` David Miller
2018-11-11 23:26                                       ` Jiri Olsa
2018-11-11 23:32                                         ` David Miller
2018-11-13 10:40                                           ` Jiri Olsa
2018-11-19  4:52                                             ` David Miller
2018-11-19  6:28                                               ` Namhyung Kim
2018-11-19  6:33                                                 ` David Miller
2018-11-19  7:16                                                   ` Namhyung Kim
2018-11-19  9:14                                               ` Jiri Olsa
2018-11-06 11:51                 ` Jiri Olsa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).