linux-kernel.vger.kernel.org archive mirror
* [PATCH 0/3] perf trace enhancements
@ 2013-10-09  3:26 David Ahern
  2013-10-09  3:26 ` [PATCH 1/3] perf util: Add findnew method to intlist - v2 David Ahern
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: David Ahern @ 2013-10-09  3:26 UTC (permalink / raw)
  To: acme, linux-kernel; +Cc: David Ahern

Hi Arnaldo:

The intlist patch is revised per your comment, and the summary option
is updated against your perf/core branch.

The mmap output file patch has survived local testing without problems,
so please consider it for inclusion as well. It lowers the overhead
of perf-record.

David Ahern (3):
  perf util: Add findnew method to intlist - v2
  perf trace: Add summary option to dump syscall statistics
  perf record: mmap output file

 tools/perf/Documentation/perf-trace.txt |    4 ++
 tools/perf/builtin-record.c             |   87 ++++++++++++++++++++++++
 tools/perf/builtin-trace.c              |  110 +++++++++++++++++++++++++++----
 tools/perf/util/intlist.c               |   22 +++++--
 tools/perf/util/intlist.h               |    1 +
 tools/perf/util/rblist.c                |   27 +++++++-
 tools/perf/util/rblist.h                |    1 +
 7 files changed, 233 insertions(+), 19 deletions(-)

-- 
1.7.10.1



* [PATCH 1/3] perf util: Add findnew method to intlist - v2
  2013-10-09  3:26 [PATCH 0/3] perf trace enhancements David Ahern
@ 2013-10-09  3:26 ` David Ahern
  2013-10-15  5:32   ` [tip:perf/core] perf util: Add findnew method to intlist tip-bot for David Ahern
  2013-10-09  3:26 ` [PATCH 2/3] perf trace: Add summary option to dump syscall statistics David Ahern
  2013-10-09  3:26 ` [PATCH 3/3] perf record: mmap output file David Ahern
  2 siblings, 1 reply; 19+ messages in thread
From: David Ahern @ 2013-10-09  3:26 UTC (permalink / raw)
  To: acme, linux-kernel
  Cc: David Ahern, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Stephane Eranian

Similar to other findnew-based methods, if the requested
object is not found, add it to the list.

v2: followed format of other findnew methods per acme's request
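
For illustration, a minimal usage sketch of the new method (hypothetical
helper, not part of the patch; it leans on int_node's priv pointer the
same way the perf-trace patch later in this series does):

	#include "util/intlist.h"

	/* Hypothetical helper: count how many times each id has been seen,
	 * keeping the counter directly in the node's priv pointer. */
	static void count_id(struct intlist *ids, int id)
	{
		/* creates the node if 'id' is not in the list yet */
		struct int_node *node = intlist__findnew(ids, id);

		if (node == NULL)	/* allocation failure */
			return;

		node->priv = (void *)((unsigned long)node->priv + 1);
	}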

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Stephane Eranian <eranian@google.com>
---
 tools/perf/util/intlist.c |   22 ++++++++++++++++++----
 tools/perf/util/intlist.h |    1 +
 tools/perf/util/rblist.c  |   27 ++++++++++++++++++++++++---
 tools/perf/util/rblist.h  |    1 +
 4 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/intlist.c b/tools/perf/util/intlist.c
index 826d7b38c9a0..89715b64a315 100644
--- a/tools/perf/util/intlist.c
+++ b/tools/perf/util/intlist.c
@@ -58,22 +58,36 @@ void intlist__remove(struct intlist *ilist, struct int_node *node)
 	rblist__remove_node(&ilist->rblist, &node->rb_node);
 }
 
-struct int_node *intlist__find(struct intlist *ilist, int i)
+static struct int_node *__intlist__findnew(struct intlist *ilist,
+					   int i, bool create)
 {
-	struct int_node *node;
+	struct int_node *node = NULL;
 	struct rb_node *rb_node;
 
 	if (ilist == NULL)
 		return NULL;
 
-	node = NULL;
-	rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+	if (create)
+		rb_node = rblist__findnew(&ilist->rblist, (void *)((long)i));
+	else
+		rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+
 	if (rb_node)
 		node = container_of(rb_node, struct int_node, rb_node);
 
 	return node;
 }
 
+struct int_node *intlist__find(struct intlist *ilist, int i)
+{
+	return __intlist__findnew(ilist, i, false);
+}
+
+struct int_node *intlist__findnew(struct intlist *ilist, int i)
+{
+	return __intlist__findnew(ilist, i, true);
+}
+
 static int intlist__parse_list(struct intlist *ilist, const char *s)
 {
 	char *sep;
diff --git a/tools/perf/util/intlist.h b/tools/perf/util/intlist.h
index 0eb00ac39e01..aa6877d36858 100644
--- a/tools/perf/util/intlist.h
+++ b/tools/perf/util/intlist.h
@@ -24,6 +24,7 @@ int intlist__add(struct intlist *ilist, int i);
 
 struct int_node *intlist__entry(const struct intlist *ilist, unsigned int idx);
 struct int_node *intlist__find(struct intlist *ilist, int i);
+struct int_node *intlist__findnew(struct intlist *ilist, int i);
 
 static inline bool intlist__has_entry(struct intlist *ilist, int i)
 {
diff --git a/tools/perf/util/rblist.c b/tools/perf/util/rblist.c
index a16cdd2625ad..0dfe27d99458 100644
--- a/tools/perf/util/rblist.c
+++ b/tools/perf/util/rblist.c
@@ -48,10 +48,12 @@ void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node)
 	rblist->node_delete(rblist, rb_node);
 }
 
-struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+static struct rb_node *__rblist__findnew(struct rblist *rblist,
+					 const void *entry,
+					 bool create)
 {
 	struct rb_node **p = &rblist->entries.rb_node;
-	struct rb_node *parent = NULL;
+	struct rb_node *parent = NULL, *new_node = NULL;
 
 	while (*p != NULL) {
 		int rc;
@@ -67,7 +69,26 @@ struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
 			return parent;
 	}
 
-	return NULL;
+	if (create) {
+		new_node = rblist->node_new(rblist, entry);
+		if (new_node) {
+			rb_link_node(new_node, parent, p);
+			rb_insert_color(new_node, &rblist->entries);
+			++rblist->nr_entries;
+		}
+	}
+
+	return new_node;
+}
+
+struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+{
+	return __rblist__findnew(rblist, entry, false);
+}
+
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry)
+{
+	return __rblist__findnew(rblist, entry, true);
 }
 
 void rblist__init(struct rblist *rblist)
diff --git a/tools/perf/util/rblist.h b/tools/perf/util/rblist.h
index 6d0cae5ae83d..ff9913b994c2 100644
--- a/tools/perf/util/rblist.h
+++ b/tools/perf/util/rblist.h
@@ -32,6 +32,7 @@ void rblist__delete(struct rblist *rblist);
 int rblist__add_node(struct rblist *rblist, const void *new_entry);
 void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node);
 struct rb_node *rblist__find(struct rblist *rblist, const void *entry);
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry);
 struct rb_node *rblist__entry(const struct rblist *rblist, unsigned int idx);
 
 static inline bool rblist__empty(const struct rblist *rblist)
-- 
1.7.10.1



* [PATCH 2/3] perf trace: Add summary option to dump syscall statistics
  2013-10-09  3:26 [PATCH 0/3] perf trace enhancements David Ahern
  2013-10-09  3:26 ` [PATCH 1/3] perf util: Add findnew method to intlist - v2 David Ahern
@ 2013-10-09  3:26 ` David Ahern
  2013-10-09 13:16   ` Jiri Olsa
  2013-10-15  5:32   ` [tip:perf/core] " tip-bot for David Ahern
  2013-10-09  3:26 ` [PATCH 3/3] perf record: mmap output file David Ahern
  2 siblings, 2 replies; 19+ messages in thread
From: David Ahern @ 2013-10-09  3:26 UTC (permalink / raw)
  To: acme, linux-kernel
  Cc: David Ahern, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Stephane Eranian

When enabled, dumps a summary of all syscalls by task with the usual
statistics -- min, max, average and relative stddev. For example,

make - 26341 :       3344   [ 17.4% ]      0.000 ms

                read :   52    0.000     4.802     0.644   30.08
               write :   20    0.004     0.036     0.010   21.72
                open :   24    0.003     0.046     0.014   23.68
               close :   64    0.002     0.055     0.008   22.53
                stat : 2714    0.002     0.222     0.004    4.47
               fstat :   18    0.001     0.041     0.006   46.26
                mmap :   30    0.003     0.009     0.006    5.71
            mprotect :    8    0.006     0.039     0.016   32.16
              munmap :   12    0.007     0.077     0.020   38.25
                 brk :   48    0.002     0.014     0.004   10.18
        rt_sigaction :   18    0.002     0.002     0.002    2.11
      rt_sigprocmask :   60    0.002     0.128     0.010   32.88
              access :    2    0.006     0.006     0.006    0.00
                pipe :   12    0.004     0.048     0.013   35.98
               vfork :   34    0.448     0.980     0.692    3.04
              execve :   20    0.000     0.387     0.046   56.66
               wait4 :   34    0.017  9923.287   593.221   68.45
               fcntl :    8    0.001     0.041     0.013   48.79
            getdents :   48    0.002     0.079     0.013   19.62
              getcwd :    2    0.005     0.005     0.005    0.00
               chdir :    2    0.070     0.070     0.070    0.00
           getrlimit :    2    0.045     0.045     0.045    0.00
          arch_prctl :    2    0.002     0.002     0.002    0.00
           setrlimit :    2    0.002     0.002     0.002    0.00
              openat :   94    0.003     0.005     0.003    2.11
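
The flag is enabled with --summary; a usage example along these lines
produces output like the above (the workload is illustrative):

    perf trace --summary -- make -j4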

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Stephane Eranian <eranian@google.com>
---
 tools/perf/Documentation/perf-trace.txt |    4 ++
 tools/perf/builtin-trace.c              |  110 +++++++++++++++++++++++++++----
 2 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 1a224862118f..54139c6457f8 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -93,6 +93,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 --comm::
         Show process COMM right beside its ID, on by default, disable with --no-comm.
 
+--summary::
+	Show a summary of syscalls by thread with min, max, and average times (in
+    msec) and relative stddev.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script[1]
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 27c4269b7fe3..8a3f0bdb0f79 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -10,6 +10,7 @@
 #include "util/strlist.h"
 #include "util/intlist.h"
 #include "util/thread_map.h"
+#include "util/stat.h"
 
 #include <libaudit.h>
 #include <stdlib.h>
@@ -892,6 +893,8 @@ struct thread_trace {
 		int	  max;
 		char	  **table;
 	} paths;
+
+	struct intlist *syscall_stats;
 };
 
 static struct thread_trace *thread_trace__new(void)
@@ -901,6 +904,8 @@ static struct thread_trace *thread_trace__new(void)
 	if (ttrace)
 		ttrace->paths.max = -1;
 
+	ttrace->syscall_stats = intlist__new(NULL);
+
 	return ttrace;
 }
 
@@ -947,6 +952,7 @@ struct trace {
 	struct intlist		*pid_list;
 	bool			sched;
 	bool			multiple_threads;
+	bool			summary;
 	bool			show_comm;
 	double			duration_filter;
 	double			runtime_ms;
@@ -1274,10 +1280,8 @@ typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
 				  struct perf_sample *sample);
 
 static struct syscall *trace__syscall_info(struct trace *trace,
-					   struct perf_evsel *evsel,
-					   struct perf_sample *sample)
+					   struct perf_evsel *evsel, int id)
 {
-	int id = perf_evsel__intval(evsel, sample, "id");
 
 	if (id < 0) {
 
@@ -1318,6 +1322,32 @@ out_cant_read:
 	return NULL;
 }
 
+static void thread__update_stats(struct thread_trace *ttrace,
+				 int id, struct perf_sample *sample)
+{
+	struct int_node *inode;
+	struct stats *stats;
+	u64 duration = 0;
+
+	inode = intlist__findnew(ttrace->syscall_stats, id);
+	if (inode == NULL)
+		return;
+
+	stats = inode->priv;
+	if (stats == NULL) {
+		stats = malloc(sizeof(struct stats));
+		if (stats == NULL)
+			return;
+		init_stats(stats);
+		inode->priv = stats;
+	}
+
+	if (ttrace->entry_time && sample->time > ttrace->entry_time)
+		duration = sample->time - ttrace->entry_time;
+
+	update_stats(stats, duration);
+}
+
 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 			    struct perf_sample *sample)
 {
@@ -1325,7 +1355,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	void *args;
 	size_t printed = 0;
 	struct thread *thread;
-	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+	int id = perf_evsel__intval(evsel, sample, "id");
+	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
 	if (sc == NULL)
@@ -1377,7 +1408,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	int ret;
 	u64 duration = 0;
 	struct thread *thread;
-	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+	int id = perf_evsel__intval(evsel, sample, "id");
+	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
 	if (sc == NULL)
@@ -1391,6 +1423,9 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	if (ttrace == NULL)
 		return -1;
 
+	if (trace->summary)
+		thread__update_stats(ttrace, id, sample);
+
 	ret = perf_evsel__intval(evsel, sample, "ret");
 
 	ttrace = thread->priv;
@@ -1557,6 +1592,8 @@ static int trace__record(int argc, const char **argv)
 	return cmd_record(i, rec_argv, NULL);
 }
 
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
 	struct perf_evlist *evlist = perf_evlist__new();
@@ -1690,6 +1727,9 @@ again:
 	goto again;
 
 out_unmap_evlist:
+	if (!err && trace->summary)
+		trace__fprintf_thread_summary(trace, trace->output);
+
 	perf_evlist__munmap(evlist);
 out_close_evlist:
 	perf_evlist__close(evlist);
@@ -1762,6 +1802,9 @@ static int trace__replay(struct trace *trace)
 	if (err)
 		pr_err("Failed to process events, error %d", err);
 
+	else if (trace->summary)
+		trace__fprintf_thread_summary(trace, trace->output);
+
 out:
 	perf_session__delete(session);
 
@@ -1772,10 +1815,53 @@ static size_t trace__fprintf_threads_header(FILE *fp)
 {
 	size_t printed;
 
-	printed  = fprintf(fp, "\n _____________________________________________________________________\n");
-	printed += fprintf(fp," __)    Summary of events    (__\n\n");
-	printed += fprintf(fp,"              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
-	printed += fprintf(fp," _____________________________________________________________________\n\n");
+	printed  = fprintf(fp, "\n _____________________________________________________________________________\n");
+	printed += fprintf(fp, " __)    Summary of events    (__\n\n");
+	printed += fprintf(fp, "              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
+	printed += fprintf(fp, "                                  syscall  count    min     max    avg  stddev\n");
+	printed += fprintf(fp, "                                                   msec    msec   msec     %%\n");
+	printed += fprintf(fp, " _____________________________________________________________________________\n\n");
+
+	return printed;
+}
+
+static size_t thread__dump_stats(struct thread_trace *ttrace,
+				 struct trace *trace, FILE *fp)
+{
+	struct stats *stats;
+	size_t printed = 0;
+	struct syscall *sc;
+	struct int_node *inode = intlist__first(ttrace->syscall_stats);
+
+	if (inode == NULL)
+		return 0;
+
+	printed += fprintf(fp, "\n");
+
+	/* each int_node is a syscall */
+	while (inode) {
+		stats = inode->priv;
+		if (stats) {
+			double min = (double)(stats->min) / NSEC_PER_MSEC;
+			double max = (double)(stats->max) / NSEC_PER_MSEC;
+			double avg = avg_stats(stats);
+			double pct;
+			u64 n = (u64) stats->n;
+
+			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
+			avg /= NSEC_PER_MSEC;
+
+			sc = &trace->syscalls.table[inode->i];
+			printed += fprintf(fp, "%24s  %14s : ", "", sc->name);
+			printed += fprintf(fp, "%5" PRIu64 "  %8.3f  %8.3f",
+					   n, min, max);
+			printed += fprintf(fp, "  %8.3f  %6.2f\n", avg, pct);
+		}
+
+		inode = intlist__next(inode);
+	}
+
+	printed += fprintf(fp, "\n\n");
 
 	return printed;
 }
@@ -1814,6 +1900,7 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv)
 	printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
 	printed += color_fprintf(fp, color, "%5.1f%%", ratio);
 	printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
+	printed += thread__dump_stats(ttrace, trace, fp);
 
 	data->printed += printed;
 
@@ -1917,6 +2004,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 	OPT_BOOLEAN('T', "time", &trace.full_time,
 		    "Show full timestamp, not time relative to first start"),
+	OPT_BOOLEAN(0, "summary", &trace.summary,
+		    "Show syscall summary with statistics"),
 	OPT_END()
 	};
 	int err;
@@ -1972,9 +2061,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	else
 		err = trace__run(&trace, argc, argv);
 
-	if (trace.sched && !err)
-		trace__fprintf_thread_summary(&trace, trace.output);
-
 out_close:
 	if (output_name != NULL)
 		fclose(trace.output);
-- 
1.7.10.1



* [PATCH 3/3] perf record: mmap output file
  2013-10-09  3:26 [PATCH 0/3] perf trace enhancements David Ahern
  2013-10-09  3:26 ` [PATCH 1/3] perf util: Add findnew method to intlist - v2 David Ahern
  2013-10-09  3:26 ` [PATCH 2/3] perf trace: Add summary option to dump syscall statistics David Ahern
@ 2013-10-09  3:26 ` David Ahern
  2013-10-09  5:59   ` Ingo Molnar
  2013-10-09  7:14   ` Mike Galbraith
  2 siblings, 2 replies; 19+ messages in thread
From: David Ahern @ 2013-10-09  3:26 UTC (permalink / raw)
  To: acme, linux-kernel
  Cc: David Ahern, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

When recording raw_syscalls for the entire system, e.g.,
    perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1

you end up with a negative feedback loop as perf itself calls
write() fairly often. This patch handles the problem by mmap'ing the
file in chunks of 64M at a time and copying events from the event buffers
to the file, avoiding write system calls.

Before (with write syscall):

perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 0 times to write data ]
[ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]

After (using mmap):

perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 31 times to write data ]
[ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]

In addition to the perf-trace benefits, using mmap lowers the overhead of
perf-record. For example,

  perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes

shows that time, CPU cycles, and instructions all drop by more than a
factor of 3. Jiri also ran a test that showed a big improvement.

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Stephane Eranian <eranian@google.com>
---
 tools/perf/builtin-record.c |   87 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index da1384012505..45bb565e0bb1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -29,6 +29,9 @@
 #include <sched.h>
 #include <sys/mman.h>
 
+/* mmap file big chunks at a time */
+#define MMAP_OUTPUT_SIZE   (64*1024*1024)
+
 #ifndef HAVE_ON_EXIT
 #ifndef ATEXIT_MAX
 #define ATEXIT_MAX 32
@@ -64,6 +67,14 @@ static void __handle_on_exit_funcs(void)
 struct perf_record {
 	struct perf_tool	tool;
 	struct perf_record_opts	opts;
+
+	/* for MMAP based file writes */
+	void			*mmap_addr;
+	u64			bytes_at_mmap_start; /* bytes in file when mmap use starts */
+	u64			mmap_offset;    /* current location within mmap */
+	size_t			mmap_size;      /* size of mmap segments */
+	bool			use_mmap;
+
 	u64			bytes_written;
 	const char		*output_name;
 	struct perf_evlist	*evlist;
@@ -82,8 +93,66 @@ static void advance_output(struct perf_record *rec, size_t size)
 	rec->bytes_written += size;
 }
 
+static int do_mmap_output(struct perf_record *rec, void *buf, size_t size)
+{
+	u64 remaining;
+	off_t offset;
+
+	if (rec->mmap_addr == NULL) {
+do_mmap:
+		offset = rec->bytes_at_mmap_start + rec->bytes_written;
+		if (offset < (ssize_t) rec->mmap_size) {
+			rec->mmap_offset = offset;
+			offset = 0;
+		} else
+			rec->mmap_offset = 0;
+
+		rec->mmap_addr = mmap(NULL, rec->mmap_size,
+				     PROT_WRITE | PROT_READ,
+				     MAP_SHARED,
+				     rec->output,
+				     offset);
+
+		if (rec->mmap_addr == MAP_FAILED) {
+			pr_err("mmap failed: %d: %s\n", errno, strerror(errno));
+			return -1;
+		}
+
+		/* expand file to include this mmap segment */
+		if (ftruncate(rec->output, offset + rec->mmap_size) != 0) {
+			pr_err("ftruncate failed\n");
+			return -1;
+		}
+	}
+
+	remaining = rec->mmap_size - rec->mmap_offset;
+
+	if (size > remaining) {
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
+		rec->bytes_written += remaining;
+
+		size -= remaining;
+		buf  += remaining;
+
+		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+		munmap(rec->mmap_addr, rec->mmap_size);
+		goto do_mmap;
+	}
+
+	if (size) {
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, size);
+		rec->bytes_written += size;
+		rec->mmap_offset += size;
+	}
+
+	return 0;
+}
+
 static int write_output(struct perf_record *rec, void *buf, size_t size)
 {
+	if (rec->use_mmap)
+		return do_mmap_output(rec, buf, size);
+
 	while (size) {
 		int ret = write(rec->output, buf, size);
 
@@ -546,6 +615,11 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 	if (forks)
 		perf_evlist__start_workload(evsel_list);
 
+	if (!rec->opts.pipe_output && stat(output_name, &st) == 0) {
+		rec->use_mmap = true;
+		rec->bytes_at_mmap_start = st.st_size - rec->bytes_written;
+	}
+
 	for (;;) {
 		int hits = rec->samples;
 
@@ -572,6 +646,18 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->use_mmap) {
+		off_t len = rec->bytes_at_mmap_start + rec->bytes_written;
+
+		rec->use_mmap = false;
+		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+		munmap(rec->mmap_addr, rec->mmap_size);
+		rec->mmap_addr = NULL;
+
+		if (ftruncate(rec->output, len) != 0)
+			pr_err("ftruncate failed\n");
+	}
+
 	if (quiet || signr == SIGUSR1)
 		return 0;
 
@@ -804,6 +890,7 @@ static struct perf_record record = {
 			.uses_mmap   = true,
 		},
 	},
+	.mmap_size = MMAP_OUTPUT_SIZE,
 };
 
 #define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
-- 
1.7.10.1



* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-09  3:26 ` [PATCH 3/3] perf record: mmap output file David Ahern
@ 2013-10-09  5:59   ` Ingo Molnar
  2013-10-15 14:04     ` David Ahern
  2013-10-09  7:14   ` Mike Galbraith
  1 sibling, 1 reply; 19+ messages in thread
From: Ingo Molnar @ 2013-10-09  5:59 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian


* David Ahern <dsahern@gmail.com> wrote:

> When recording raw_syscalls for the entire system, e.g.,
>     perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> 
> you end up with a negative feedback loop as perf itself calls
> write() fairly often. This patch handles the problem by mmap'ing the
> file in chunks of 64M at a time and copying events from the event buffers
> to the file, avoiding write system calls.
> 
> Before (with write syscall):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 0 times to write data ]
> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
> 
> After (using mmap):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 31 times to write data ]
> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
> 
> In addition to the perf-trace benefits, using mmap lowers the overhead of
> perf-record. For example,
> 
>   perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes
> 
> shows that time, CPU cycles, and instructions all drop by more than a
> factor of 3. Jiri also ran a test that showed a big improvement.

Here are some thoughts on how 'perf record' tracing performance could be 
further improved:

1)

The use of non-temporal stores (MOVNTQ) to copy the ring-buffer into the 
file buffer makes sure the CPU cache is not trashed by the copying - which 
is the largest 'collateral damage' copying does.

glibc does not appear to expose non-temporal instructions so it's going to 
be architecture dependent - but we could build the copy_user_nocache() 
function from the kernel proper (or copy it - we could even simplify it: 
knowing that only large and page aligned buffers are going to be copied 
with it).

See how tools/perf/bench/mem-mem* does that to be able to measure the 
kernel's memcpy() and memset() function performance.

2)

Yet another method would be to avoid the copies altogether via the splice 
system-call - see:

	git grep splice kernel/trace/

To make splice low-overhead we'd have to introduce a mode to not mmap the 
data part of the perf ring-buffer and splice the data straight from the 
perf fd into a temporary pipe and over from the pipe into the target file 
(or socket).
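
Roughly, the plumbing could look like the sketch below -- 'perf_fd'
stands for the (not yet existing) spliceable perf fd that the new
no-mmap mode would provide, and a real implementation would keep the
pipe around rather than creating it per chunk:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	/* move up to 'len' bytes from the perf fd into the output file
	 * through a pipe, without copying through user space */
	static int splice_chunk(int perf_fd, int out_fd, size_t len)
	{
		int p[2];
		ssize_t n;

		if (pipe(p) < 0)
			return -1;

		n = splice(perf_fd, NULL, p[1], NULL, len, SPLICE_F_MOVE);
		if (n > 0)
			n = splice(p[0], NULL, out_fd, NULL, n, SPLICE_F_MOVE);

		close(p[0]);
		close(p[1]);
		return n < 0 ? -1 : (int)n;
	}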

OTOH non-temporal stores are incredibly simple and memory bandwidth is 
plenty on modern systems so I'd certainly try that route first.

Thanks,

	Ingo


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-09  3:26 ` [PATCH 3/3] perf record: mmap output file David Ahern
  2013-10-09  5:59   ` Ingo Molnar
@ 2013-10-09  7:14   ` Mike Galbraith
  1 sibling, 0 replies; 19+ messages in thread
From: Mike Galbraith @ 2013-10-09  7:14 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Jiri Olsa, Namhyung Kim, Stephane Eranian

On Tue, 2013-10-08 at 21:26 -0600, David Ahern wrote: 
> When recording raw_syscalls for the entire system, e.g.,
>     perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> 
> you end up with a negative feedback loop as perf itself calls
> write() fairly often. This patch handles the problem by mmap'ing the
> file in chunks of 64M at a time and copying events from the event buffers
> to the file, avoiding write system calls.
> 
> Before (with write syscall):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 0 times to write data ]
> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
> 
> After (using mmap):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 31 times to write data ]
> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]

You cured some annoying lost chunks too.

master (cache is hot in target tip tree)

[ perf record: Woken up 0 times to write data ]
[ perf record: Captured and wrote 1398.520 MB perf.data (~61102281 samples) ]
Warning:
Processed 15894046 events and lost 15 chunks!

Check IO/CPU overload!


 Performance counter stats for 'sh -c cd /usr/local/src/kernel/linux-3.x-tip/tools/perf; make clean; perf record -a -e raw_syscalls:* -- make -j4':

      90405.531301 task-clock                #    2.580 CPUs utilized          
             18433 context-switches          #    0.204 K/sec                  
              1503 cpu-migrations            #    0.017 K/sec                  
           2543758 page-faults               #    0.028 M/sec                  
      214649052820 cycles                    #    2.374 GHz                     [50.40%]
   <not supported> stalled-cycles-frontend 
   <not supported> stalled-cycles-backend  
      201026412643 instructions              #    0.94  insns per cycle         [75.48%]
       41960433084 branches                  #  464.136 M/sec                   [75.22%]
        1835477524 branch-misses             #    4.37% of all branches         [75.13%]

      35.039394045 seconds time elapsed


patched master
marge:/usr/local/src/kernel/linux-3.x.git/tools/perf # export PERF=`pwd`/perf

[ perf record: Woken up 1070 times to write data ]
[ perf record: Captured and wrote 269.060 MB perf.data (~11755404 samples) ]

 Performance counter stats for 'sh -c cd /usr/local/src/kernel/linux-3.x-tip/tools/perf; make clean; $PERF record -a -e raw_syscalls:* -- make -j4':

      67553.069792 task-clock                #    2.934 CPUs utilized          
             16183 context-switches          #    0.240 K/sec                  
               880 cpu-migrations            #    0.013 K/sec                  
           2253888 page-faults               #    0.033 M/sec                  
      159940676657 cycles                    #    2.368 GHz                     [50.20%]
   <not supported> stalled-cycles-frontend 
   <not supported> stalled-cycles-backend  
      152420909413 instructions              #    0.95  insns per cycle         [75.39%]
       32857102452 branches                  #  486.389 M/sec                   [75.58%]
        1674403244 branch-misses             #    5.10% of all branches         [75.15%]

      23.020412569 seconds time elapsed



* Re: [PATCH 2/3] perf trace: Add summary option to dump syscall statistics
  2013-10-09  3:26 ` [PATCH 2/3] perf trace: Add summary option to dump syscall statistics David Ahern
@ 2013-10-09 13:16   ` Jiri Olsa
  2013-10-15  5:32   ` [tip:perf/core] " tip-bot for David Ahern
  1 sibling, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2013-10-09 13:16 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Stephane Eranian

On Tue, Oct 08, 2013 at 09:26:53PM -0600, David Ahern wrote:
> When enabled, dumps a summary of all syscalls by task with the usual
> statistics -- min, max, average and relative stddev. For example,
> 
> make - 26341 :       3344   [ 17.4% ]      0.000 ms
> 
>                 read :   52    0.000     4.802     0.644   30.08
>                write :   20    0.004     0.036     0.010   21.72
>                 open :   24    0.003     0.046     0.014   23.68
>                close :   64    0.002     0.055     0.008   22.53
>                 stat : 2714    0.002     0.222     0.004    4.47
>                fstat :   18    0.001     0.041     0.006   46.26
>                 mmap :   30    0.003     0.009     0.006    5.71
>             mprotect :    8    0.006     0.039     0.016   32.16
>               munmap :   12    0.007     0.077     0.020   38.25
>                  brk :   48    0.002     0.014     0.004   10.18
>         rt_sigaction :   18    0.002     0.002     0.002    2.11
>       rt_sigprocmask :   60    0.002     0.128     0.010   32.88
>               access :    2    0.006     0.006     0.006    0.00
>                 pipe :   12    0.004     0.048     0.013   35.98
>                vfork :   34    0.448     0.980     0.692    3.04
>               execve :   20    0.000     0.387     0.046   56.66
>                wait4 :   34    0.017  9923.287   593.221   68.45
>                fcntl :    8    0.001     0.041     0.013   48.79
>             getdents :   48    0.002     0.079     0.013   19.62
>               getcwd :    2    0.005     0.005     0.005    0.00
>                chdir :    2    0.070     0.070     0.070    0.00
>            getrlimit :    2    0.045     0.045     0.045    0.00
>           arch_prctl :    2    0.002     0.002     0.002    0.00
>            setrlimit :    2    0.002     0.002     0.002    0.00
>               openat :   94    0.003     0.005     0.003    2.11
> 

Should the syscall columns be aligned with the data?

jirka
---
              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]
                                  syscall  count    min     max    avg  stddev
                                                   msec    msec   msec     %
 _____________________________________________________________________________

                  ls - 10994 :        530   [ 97.4% ]      0.000 ms

                                    read :    18     0.000     0.016     0.003   25.51
                                    open :    24     0.002     0.026     0.005   19.46



* [tip:perf/core] perf util: Add findnew method to intlist
  2013-10-09  3:26 ` [PATCH 1/3] perf util: Add findnew method to intlist - v2 David Ahern
@ 2013-10-15  5:32   ` tip-bot for David Ahern
  0 siblings, 0 replies; 19+ messages in thread
From: tip-bot for David Ahern @ 2013-10-15  5:32 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, linux-kernel, eranian, hpa, mingo, a.p.zijlstra,
	namhyung.kim, jolsa, fweisbec, dsahern, tglx

Commit-ID:  813335b8b27d9ceeb67a073f501ada8b8dde37a7
Gitweb:     http://git.kernel.org/tip/813335b8b27d9ceeb67a073f501ada8b8dde37a7
Author:     David Ahern <dsahern@gmail.com>
AuthorDate: Tue, 8 Oct 2013 21:26:52 -0600
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 14 Oct 2013 10:28:48 -0300

perf util: Add findnew method to intlist

Similar to other findnew-based methods, if the requested object is not
found, add it to the list.

v2: followed format of other findnew methods per acme's request

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1381289214-24885-2-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intlist.c | 22 ++++++++++++++++++----
 tools/perf/util/intlist.h |  1 +
 tools/perf/util/rblist.c  | 27 ++++++++++++++++++++++++---
 tools/perf/util/rblist.h  |  1 +
 4 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/intlist.c b/tools/perf/util/intlist.c
index 826d7b3..89715b6 100644
--- a/tools/perf/util/intlist.c
+++ b/tools/perf/util/intlist.c
@@ -58,22 +58,36 @@ void intlist__remove(struct intlist *ilist, struct int_node *node)
 	rblist__remove_node(&ilist->rblist, &node->rb_node);
 }
 
-struct int_node *intlist__find(struct intlist *ilist, int i)
+static struct int_node *__intlist__findnew(struct intlist *ilist,
+					   int i, bool create)
 {
-	struct int_node *node;
+	struct int_node *node = NULL;
 	struct rb_node *rb_node;
 
 	if (ilist == NULL)
 		return NULL;
 
-	node = NULL;
-	rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+	if (create)
+		rb_node = rblist__findnew(&ilist->rblist, (void *)((long)i));
+	else
+		rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+
 	if (rb_node)
 		node = container_of(rb_node, struct int_node, rb_node);
 
 	return node;
 }
 
+struct int_node *intlist__find(struct intlist *ilist, int i)
+{
+	return __intlist__findnew(ilist, i, false);
+}
+
+struct int_node *intlist__findnew(struct intlist *ilist, int i)
+{
+	return __intlist__findnew(ilist, i, true);
+}
+
 static int intlist__parse_list(struct intlist *ilist, const char *s)
 {
 	char *sep;
diff --git a/tools/perf/util/intlist.h b/tools/perf/util/intlist.h
index 0eb00ac..aa6877d 100644
--- a/tools/perf/util/intlist.h
+++ b/tools/perf/util/intlist.h
@@ -24,6 +24,7 @@ int intlist__add(struct intlist *ilist, int i);
 
 struct int_node *intlist__entry(const struct intlist *ilist, unsigned int idx);
 struct int_node *intlist__find(struct intlist *ilist, int i);
+struct int_node *intlist__findnew(struct intlist *ilist, int i);
 
 static inline bool intlist__has_entry(struct intlist *ilist, int i)
 {
diff --git a/tools/perf/util/rblist.c b/tools/perf/util/rblist.c
index a16cdd2..0dfe27d 100644
--- a/tools/perf/util/rblist.c
+++ b/tools/perf/util/rblist.c
@@ -48,10 +48,12 @@ void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node)
 	rblist->node_delete(rblist, rb_node);
 }
 
-struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+static struct rb_node *__rblist__findnew(struct rblist *rblist,
+					 const void *entry,
+					 bool create)
 {
 	struct rb_node **p = &rblist->entries.rb_node;
-	struct rb_node *parent = NULL;
+	struct rb_node *parent = NULL, *new_node = NULL;
 
 	while (*p != NULL) {
 		int rc;
@@ -67,7 +69,26 @@ struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
 			return parent;
 	}
 
-	return NULL;
+	if (create) {
+		new_node = rblist->node_new(rblist, entry);
+		if (new_node) {
+			rb_link_node(new_node, parent, p);
+			rb_insert_color(new_node, &rblist->entries);
+			++rblist->nr_entries;
+		}
+	}
+
+	return new_node;
+}
+
+struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+{
+	return __rblist__findnew(rblist, entry, false);
+}
+
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry)
+{
+	return __rblist__findnew(rblist, entry, true);
 }
 
 void rblist__init(struct rblist *rblist)
diff --git a/tools/perf/util/rblist.h b/tools/perf/util/rblist.h
index 6d0cae5..ff9913b 100644
--- a/tools/perf/util/rblist.h
+++ b/tools/perf/util/rblist.h
@@ -32,6 +32,7 @@ void rblist__delete(struct rblist *rblist);
 int rblist__add_node(struct rblist *rblist, const void *new_entry);
 void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node);
 struct rb_node *rblist__find(struct rblist *rblist, const void *entry);
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry);
 struct rb_node *rblist__entry(const struct rblist *rblist, unsigned int idx);
 
 static inline bool rblist__empty(const struct rblist *rblist)


* [tip:perf/core] perf trace: Add summary option to dump syscall statistics
  2013-10-09  3:26 ` [PATCH 2/3] perf trace: Add summary option to dump syscall statistics David Ahern
  2013-10-09 13:16   ` Jiri Olsa
@ 2013-10-15  5:32   ` tip-bot for David Ahern
  1 sibling, 0 replies; 19+ messages in thread
From: tip-bot for David Ahern @ 2013-10-15  5:32 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, linux-kernel, eranian, hpa, mingo, a.p.zijlstra,
	namhyung.kim, jolsa, fweisbec, dsahern, tglx

Commit-ID:  bf2575c121ca11247ef07fd02b43f7430834f7b1
Gitweb:     http://git.kernel.org/tip/bf2575c121ca11247ef07fd02b43f7430834f7b1
Author:     David Ahern <dsahern@gmail.com>
AuthorDate: Tue, 8 Oct 2013 21:26:53 -0600
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 14 Oct 2013 10:28:50 -0300

perf trace: Add summary option to dump syscall statistics

When enabled, dumps a summary of all syscalls by task with the usual
statistics -- min, max, average and relative stddev. For example,

make - 26341 :       3344   [ 17.4% ]      0.000 ms

                read :   52    0.000     4.802     0.644   30.08
               write :   20    0.004     0.036     0.010   21.72
                open :   24    0.003     0.046     0.014   23.68
               close :   64    0.002     0.055     0.008   22.53
                stat : 2714    0.002     0.222     0.004    4.47
               fstat :   18    0.001     0.041     0.006   46.26
                mmap :   30    0.003     0.009     0.006    5.71
            mprotect :    8    0.006     0.039     0.016   32.16
              munmap :   12    0.007     0.077     0.020   38.25
                 brk :   48    0.002     0.014     0.004   10.18
        rt_sigaction :   18    0.002     0.002     0.002    2.11
      rt_sigprocmask :   60    0.002     0.128     0.010   32.88
              access :    2    0.006     0.006     0.006    0.00
                pipe :   12    0.004     0.048     0.013   35.98
               vfork :   34    0.448     0.980     0.692    3.04
              execve :   20    0.000     0.387     0.046   56.66
               wait4 :   34    0.017  9923.287   593.221   68.45
               fcntl :    8    0.001     0.041     0.013   48.79
            getdents :   48    0.002     0.079     0.013   19.62
              getcwd :    2    0.005     0.005     0.005    0.00
               chdir :    2    0.070     0.070     0.070    0.00
           getrlimit :    2    0.045     0.045     0.045    0.00
          arch_prctl :    2    0.002     0.002     0.002    0.00
           setrlimit :    2    0.002     0.002     0.002    0.00
              openat :   94    0.003     0.005     0.003    2.11

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1381289214-24885-3-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-trace.txt |   4 ++
 tools/perf/builtin-trace.c              | 110 ++++++++++++++++++++++++++++----
 2 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 1a22486..54139c6 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -93,6 +93,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 --comm::
         Show process COMM right beside its ID, on by default, disable with --no-comm.
 
+--summary::
+	Show a summary of syscalls by thread with min, max, and average times (in
+    msec) and relative stddev.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script[1]
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 5496546..d0f91fe 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -10,6 +10,7 @@
 #include "util/strlist.h"
 #include "util/intlist.h"
 #include "util/thread_map.h"
+#include "util/stat.h"
 
 #include <libaudit.h>
 #include <stdlib.h>
@@ -909,6 +910,8 @@ struct thread_trace {
 		int	  max;
 		char	  **table;
 	} paths;
+
+	struct intlist *syscall_stats;
 };
 
 static struct thread_trace *thread_trace__new(void)
@@ -918,6 +921,8 @@ static struct thread_trace *thread_trace__new(void)
 	if (ttrace)
 		ttrace->paths.max = -1;
 
+	ttrace->syscall_stats = intlist__new(NULL);
+
 	return ttrace;
 }
 
@@ -964,6 +969,7 @@ struct trace {
 	struct intlist		*pid_list;
 	bool			sched;
 	bool			multiple_threads;
+	bool			summary;
 	bool			show_comm;
 	double			duration_filter;
 	double			runtime_ms;
@@ -1291,10 +1297,8 @@ typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
 				  struct perf_sample *sample);
 
 static struct syscall *trace__syscall_info(struct trace *trace,
-					   struct perf_evsel *evsel,
-					   struct perf_sample *sample)
+					   struct perf_evsel *evsel, int id)
 {
-	int id = perf_evsel__intval(evsel, sample, "id");
 
 	if (id < 0) {
 
@@ -1335,6 +1339,32 @@ out_cant_read:
 	return NULL;
 }
 
+static void thread__update_stats(struct thread_trace *ttrace,
+				 int id, struct perf_sample *sample)
+{
+	struct int_node *inode;
+	struct stats *stats;
+	u64 duration = 0;
+
+	inode = intlist__findnew(ttrace->syscall_stats, id);
+	if (inode == NULL)
+		return;
+
+	stats = inode->priv;
+	if (stats == NULL) {
+		stats = malloc(sizeof(struct stats));
+		if (stats == NULL)
+			return;
+		init_stats(stats);
+		inode->priv = stats;
+	}
+
+	if (ttrace->entry_time && sample->time > ttrace->entry_time)
+		duration = sample->time - ttrace->entry_time;
+
+	update_stats(stats, duration);
+}
+
 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 			    struct perf_sample *sample)
 {
@@ -1342,7 +1372,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	void *args;
 	size_t printed = 0;
 	struct thread *thread;
-	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+	int id = perf_evsel__intval(evsel, sample, "id");
+	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
 	if (sc == NULL)
@@ -1394,7 +1425,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	int ret;
 	u64 duration = 0;
 	struct thread *thread;
-	struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+	int id = perf_evsel__intval(evsel, sample, "id");
+	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
 	if (sc == NULL)
@@ -1408,6 +1440,9 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	if (ttrace == NULL)
 		return -1;
 
+	if (trace->summary)
+		thread__update_stats(ttrace, id, sample);
+
 	ret = perf_evsel__intval(evsel, sample, "ret");
 
 	ttrace = thread->priv;
@@ -1574,6 +1609,8 @@ static int trace__record(int argc, const char **argv)
 	return cmd_record(i, rec_argv, NULL);
 }
 
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
 	struct perf_evlist *evlist = perf_evlist__new();
@@ -1703,6 +1740,9 @@ again:
 	goto again;
 
 out_unmap_evlist:
+	if (!err && trace->summary)
+		trace__fprintf_thread_summary(trace, trace->output);
+
 	perf_evlist__munmap(evlist);
 out_close_evlist:
 	perf_evlist__close(evlist);
@@ -1798,6 +1838,9 @@ static int trace__replay(struct trace *trace)
 	if (err)
 		pr_err("Failed to process events, error %d", err);
 
+	else if (trace->summary)
+		trace__fprintf_thread_summary(trace, trace->output);
+
 out:
 	perf_session__delete(session);
 
@@ -1808,10 +1851,53 @@ static size_t trace__fprintf_threads_header(FILE *fp)
 {
 	size_t printed;
 
-	printed  = fprintf(fp, "\n _____________________________________________________________________\n");
-	printed += fprintf(fp," __)    Summary of events    (__\n\n");
-	printed += fprintf(fp,"              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
-	printed += fprintf(fp," _____________________________________________________________________\n\n");
+	printed  = fprintf(fp, "\n _____________________________________________________________________________\n");
+	printed += fprintf(fp, " __)    Summary of events    (__\n\n");
+	printed += fprintf(fp, "              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
+	printed += fprintf(fp, "                                  syscall  count    min     max    avg  stddev\n");
+	printed += fprintf(fp, "                                                   msec    msec   msec     %%\n");
+	printed += fprintf(fp, " _____________________________________________________________________________\n\n");
+
+	return printed;
+}
+
+static size_t thread__dump_stats(struct thread_trace *ttrace,
+				 struct trace *trace, FILE *fp)
+{
+	struct stats *stats;
+	size_t printed = 0;
+	struct syscall *sc;
+	struct int_node *inode = intlist__first(ttrace->syscall_stats);
+
+	if (inode == NULL)
+		return 0;
+
+	printed += fprintf(fp, "\n");
+
+	/* each int_node is a syscall */
+	while (inode) {
+		stats = inode->priv;
+		if (stats) {
+			double min = (double)(stats->min) / NSEC_PER_MSEC;
+			double max = (double)(stats->max) / NSEC_PER_MSEC;
+			double avg = avg_stats(stats);
+			double pct;
+			u64 n = (u64) stats->n;
+
+			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
+			avg /= NSEC_PER_MSEC;
+
+			sc = &trace->syscalls.table[inode->i];
+			printed += fprintf(fp, "%24s  %14s : ", "", sc->name);
+			printed += fprintf(fp, "%5" PRIu64 "  %8.3f  %8.3f",
+					   n, min, max);
+			printed += fprintf(fp, "  %8.3f  %6.2f\n", avg, pct);
+		}
+
+		inode = intlist__next(inode);
+	}
+
+	printed += fprintf(fp, "\n\n");
 
 	return printed;
 }
@@ -1850,6 +1936,7 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv)
 	printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
 	printed += color_fprintf(fp, color, "%5.1f%%", ratio);
 	printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
+	printed += thread__dump_stats(ttrace, trace, fp);
 
 	data->printed += printed;
 
@@ -1953,6 +2040,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 	OPT_BOOLEAN('T', "time", &trace.full_time,
 		    "Show full timestamp, not time relative to first start"),
+	OPT_BOOLEAN(0, "summary", &trace.summary,
+		    "Show syscall summary with statistics"),
 	OPT_END()
 	};
 	int err;
@@ -2008,9 +2097,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	else
 		err = trace__run(&trace, argc, argv);
 
-	if (trace.sched && !err)
-		trace__fprintf_thread_summary(&trace, trace.output);
-
 out_close:
 	if (output_name != NULL)
 		fclose(trace.output);


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-09  5:59   ` Ingo Molnar
@ 2013-10-15 14:04     ` David Ahern
  2013-10-15 14:32       ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 19+ messages in thread
From: David Ahern @ 2013-10-15 14:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: acme, linux-kernel, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

On 10/8/13 11:59 PM, Ingo Molnar wrote:
> Here are some thoughts on how 'perf record' tracing performance could be
> further improved:
>
> 1)
>
> The use of non-temporal stores (MOVNTQ) to copy the ring-buffer into the
> file buffer makes sure the CPU cache is not trashed by the copying - which
> is the largest 'collateral damage' copying does.
>
> glibc does not appear to expose non-temporal instructions so it's going to
> be architecture dependent - but we could build the copy_user_nocache()
> function from the kernel proper (or copy it - we could even simplify it:
> knowing that only large and page aligned buffers are going to be copied
> with it).
>
> See how tools/perf/bench/mem-mem* does that to be able to measure the
> kernel's memcpy() and memset() function performance.

Forgot about this suggestion as well. Added to the list for v3.

>
> 2)
>
> Yet another method would be to avoid the copies altogether via the splice
> system-call - see:
>
> 	git grep splice kernel/trace/
>
> To make splice low-overhead we'd have to introduce a mode to not mmap the
> data part of the perf ring-buffer and splice the data straight from the
> perf fd into a temporary pipe and over from the pipe into the target file
> (or socket).

I looked into splice and it was not clear it would be a good match. 
First, perf is set up to pull data from mmap's and there is not a 1:1 
association between mmap's and fd's (fd_in for splice). Second and more 
importantly, splice is also a system call and it would have to be 
invoked for each mmap each trip through the loop -- just like write() 
does today -- so it does not solve the feedback loop problem.

>
> OTOH non-temporal stores are incredibly simple and memory bandwidth is
> plenty on modern systems so I'd certainly try that route first.

I'll take a look.

David


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 14:04     ` David Ahern
@ 2013-10-15 14:32       ` Arnaldo Carvalho de Melo
  2013-10-15 14:38         ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Arnaldo Carvalho de Melo @ 2013-10-15 14:32 UTC (permalink / raw)
  To: David Ahern
  Cc: Ingo Molnar, linux-kernel, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

On Tue, Oct 15, 2013 at 08:04:15AM -0600, David Ahern wrote:
> On 10/8/13 11:59 PM, Ingo Molnar wrote:
> > 2)

> > Yet another method would be to avoid the copies altogether via the splice
> > system-call - see:

> >	git grep splice kernel/trace/

> > To make splice low-overhead we'd have to introduce a mode to not mmap
> > the data part of the perf ring-buffer and splice the data straight
> > from the perf fd into a temporary pipe and over from the pipe into
> > the target file (or socket).

> I looked into splice and it was not clear it would be a good match.
> First, perf is set up to pull data from mmap's and there is not a 1:1
> association between mmap's and fd's (fd_in for splice). Second and

Jiri and PeterZ probaby will have comments here... ;-) :-)

Jiri even have patches, IIRC.

- Arnaldo


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 14:32       ` Arnaldo Carvalho de Melo
@ 2013-10-15 14:38         ` Peter Zijlstra
  2013-10-15 15:27           ` Ingo Molnar
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2013-10-15 14:38 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Ahern, Ingo Molnar, linux-kernel, Frederic Weisbecker,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

On Tue, Oct 15, 2013 at 11:32:45AM -0300, Arnaldo Carvalho de Melo wrote:

> Jiri and PeterZ probaby will have comments here... ;-) :-)

The only complication with splice is the vmalloc support; other than
that it should be fairly straight fwd.


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 14:38         ` Peter Zijlstra
@ 2013-10-15 15:27           ` Ingo Molnar
  2013-10-15 15:29             ` Peter Zijlstra
  2013-10-15 15:30             ` David Ahern
  0 siblings, 2 replies; 19+ messages in thread
From: Ingo Molnar @ 2013-10-15 15:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, David Ahern, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Oct 15, 2013 at 11:32:45AM -0300, Arnaldo Carvalho de Melo wrote:
> 
> > Jiri and PeterZ probaby will have comments here... ;-) :-)
> 
> The only complication with splice is the vmalloc support; other than 
> that it should be fairly straight fwd.

In the initial version we could skip splice support for vmalloc 
ring-buffer, to keep things simple.

Wanna send a patch for people to try? Looks like there's real interest in 
speeding up perf record as much as possible!

Thanks,

	Ingo


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 15:27           ` Ingo Molnar
@ 2013-10-15 15:29             ` Peter Zijlstra
  2013-10-15 15:30             ` David Ahern
  1 sibling, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2013-10-15 15:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, David Ahern, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian

On Tue, Oct 15, 2013 at 05:27:47PM +0200, Ingo Molnar wrote:
> Wanna send a patch for people to try? Looks like there's real interest in 
> speeding up perf record as much as possible!

I've got a pile of pending bug reports to sort through before I can
start making more bugs ;-)




* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 15:27           ` Ingo Molnar
  2013-10-15 15:29             ` Peter Zijlstra
@ 2013-10-15 15:30             ` David Ahern
  2013-10-15 16:06               ` Ingo Molnar
  1 sibling, 1 reply; 19+ messages in thread
From: David Ahern @ 2013-10-15 15:30 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, linux-kernel, Frederic Weisbecker,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

On 10/15/13 9:27 AM, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
>> On Tue, Oct 15, 2013 at 11:32:45AM -0300, Arnaldo Carvalho de Melo wrote:
>>
>>> Jiri and PeterZ probaby will have comments here... ;-) :-)
>>
>> The only complication with splice is the vmalloc support; other than
>> that it should be fairly straight fwd.
>
> In the initial version we could skip splice support for vmalloc
> ring-buffer, to keep things simple.
>
> Wanna send a patch for people to try? Looks like there's real interest in
> speeding up perf record as much as possible!

My second -- and more important -- concern about splice has been dropped 
from this thread:

"splice is also a system call and it would have to be invoked for each 
mmap each trip through the loop -- just like write() does today -- so it 
does not solve the feedback loop problem. "

Am I missing something on how splice works?

David



* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 15:30             ` David Ahern
@ 2013-10-15 16:06               ` Ingo Molnar
  2013-10-15 16:16                 ` David Ahern
  2013-10-15 16:37                 ` Peter Zijlstra
  0 siblings, 2 replies; 19+ messages in thread
From: Ingo Molnar @ 2013-10-15 16:06 UTC (permalink / raw)
  To: David Ahern
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian


* David Ahern <dsahern@gmail.com> wrote:

> On 10/15/13 9:27 AM, Ingo Molnar wrote:
> >
> >* Peter Zijlstra <peterz@infradead.org> wrote:
> >
> >>On Tue, Oct 15, 2013 at 11:32:45AM -0300, Arnaldo Carvalho de Melo wrote:
> >>
> >>>Jiri and PeterZ probably will have comments here... ;-) :-)
> >>
> >>The only complication with splice is the vmalloc support; other than
> >>that it should be fairly straight fwd.
> >
> >In the initial version we could skip splice support for vmalloc
> >ring-buffer, to keep things simple.
> >
> > Wanna send a patch for people to try? Looks like there's real interest 
> > in speeding up perf record as much as possible!
> 
> My second -- and more important -- concern about splice has been dropped 
> from this thread:
> 
> "splice is also a system call and it would have to be invoked for each 
> mmap each trip through the loop -- just like write() does today -- so it 
> does not solve the feedback loop problem. "
> 
> Am I missing something on how splice works?

splice() is very fast and should be able to process a lot of pages in one 
go, so the feedback loop should be pretty weak. mmap() triggers kernel 
code as well, every time we run out of the 64 MB window we got to remap 
it, right?

But that's all just theory - I think non-temporal stores have a pretty 
good chance of being a very good solution as well.
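
For concreteness, the 64 MB window remapping mentioned above amounts to 
something like this (a sketch only; the window size and the helper name 
are illustrative, not taken from the posted patch):

#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

#define OUT_WIN_SZ	(64UL * 1024 * 1024)

/* Sketch: extend the output file and map the next output window. */
static void *map_next_window(int out_fd, off_t offset)
{
	void *buf;

	/* grow the file so the new window is backed by it */
	if (ftruncate(out_fd, offset + OUT_WIN_SZ) < 0)
		return NULL;

	buf = mmap(NULL, OUT_WIN_SZ, PROT_READ | PROT_WRITE,
		   MAP_SHARED, out_fd, offset);

	return buf == MAP_FAILED ? NULL : buf;
}

That is one mmap()/munmap() pair per 64 MB of output, instead of one 
write() per ring buffer per pass through the loop.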

Thanks,

	Ingo


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 16:06               ` Ingo Molnar
@ 2013-10-15 16:16                 ` David Ahern
  2013-10-16  7:11                   ` Ingo Molnar
  2013-10-15 16:37                 ` Peter Zijlstra
  1 sibling, 1 reply; 19+ messages in thread
From: David Ahern @ 2013-10-15 16:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian

On 10/15/13 10:06 AM, Ingo Molnar wrote:
> splice() is very fast and should be able to process a lot of pages in one
> go, so the feedback loop should be pretty weak. mmap() triggers kernel
> code as well, every time we run out of the 64 MB window we got to remap
> it, right?


Yes, 1 mmap and 1 munmap for every 64MB. Compare that to the write() case, 
which calls write() for each mmap on each pass through the mmap_read loop.

I am conjecturing that splice would follow the write model: a ring buffer 
has N bytes, so call splice to copy the data from the ring buffer to the 
file. So splice saves the memcpy, but not the syscalls.
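
For illustration, a splice-based drain of one ring buffer would look 
roughly like this (a sketch only; the helper name, flags and error 
handling are illustrative and nothing here is from a posted patch). It 
still costs a syscall pair per buffer per pass, it just drops the 
user-space memcpy:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Sketch: move one ring buffer's worth of data to the output file
 * through a pipe.  A real implementation would loop on short
 * vmsplice()/splice() results and respect the pipe capacity.
 */
static int drain_one_mmap(int pipe_w, int pipe_r, int out_fd,
			  void *data, size_t len)
{
	struct iovec iov = { .iov_base = data, .iov_len = len };
	ssize_t n;

	n = vmsplice(pipe_w, &iov, 1, 0);	/* ring buffer -> pipe */
	if (n < 0)
		return -1;

	/* pipe -> file, without copying through user space */
	return splice(pipe_r, NULL, out_fd, NULL, n,
		      SPLICE_F_MOVE) < 0 ? -1 : 0;
}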

David


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 16:06               ` Ingo Molnar
  2013-10-15 16:16                 ` David Ahern
@ 2013-10-15 16:37                 ` Peter Zijlstra
  1 sibling, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2013-10-15 16:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Ahern, Arnaldo Carvalho de Melo, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian

On Tue, Oct 15, 2013 at 06:06:46PM +0200, Ingo Molnar wrote:
> 
> splice() is very fast and should be able to process a lot of pages in one 
> go, so the feedback loop should be pretty weak. mmap() triggers kernel 
> code as well, every time we run out of the 64 MB window we got to remap 
> it, right?
> 
> But that's all just theory - I think non-temporal stores have a pretty 
> good chance of being a very good solution as well.

The first write to a fresh page will also cause kernel code to be run: we
fault and call page_mkwrite, which will end up doing block allocation etc.
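
(One hedged way around that, assuming the filesystem supports it, is to 
preallocate the blocks up front with fallocate() before mmap'ing the 
output file; the sketch below is only illustrative, not taken from the 
posted patch.)

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Sketch: preallocate the output file so first-touch faults do not
 * have to allocate blocks in the page_mkwrite path.  Falls back to
 * ftruncate() on filesystems without fallocate() support.
 */
static int prealloc_output(int fd, off_t size)
{
	if (fallocate(fd, 0, 0, size) == 0)
		return 0;

	return ftruncate(fd, size);	/* faults will allocate blocks */
}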


* Re: [PATCH 3/3] perf record: mmap output file
  2013-10-15 16:16                 ` David Ahern
@ 2013-10-16  7:11                   ` Ingo Molnar
  0 siblings, 0 replies; 19+ messages in thread
From: Ingo Molnar @ 2013-10-16  7:11 UTC (permalink / raw)
  To: David Ahern
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, linux-kernel,
	Frederic Weisbecker, Jiri Olsa, Namhyung Kim, Mike Galbraith,
	Stephane Eranian


* David Ahern <dsahern@gmail.com> wrote:

> On 10/15/13 10:06 AM, Ingo Molnar wrote:

> > splice() is very fast and should be able to process a lot of pages in 
> > one go, so the feedback loop should be pretty weak. mmap() triggers 
> > kernel code as well, every time we run out of the 64 MB window we got 
> > to remap it, right?
> 
> 
> Yes, 1 mmap and 1 munmap for every 64MB. Compare that to the write() 
> case, which calls write() for each mmap on each pass through the 
> mmap_read loop.
> 
> I am conjecturing that splice would follow the write model: a ring 
> buffer has N bytes, so call splice to copy the data from the ring 
> buffer to the file. So splice saves the memcpy, but not the syscalls.

Well, splice can be 64 MB granular as well - but it depends on how 
buffered the pipes are (splice always uses pipes to shuffle around pages).

It's possible to have pipes with larger buffering - see F_SETPIPE_SZ. The 
current limit (for unprivileged users?) is:

  fs/pipe.c:unsigned int pipe_max_size = 1048576;

[But I haven't actually used this to increase splice() size yet.]
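
For reference, requesting a bigger pipe looks like this (a minimal 
sketch; the 1 MB request matches the pipe_max_size value quoted above, 
and fcntl() returns the size actually granted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pfd[2], sz;

	if (pipe(pfd))
		return 1;

	/* unprivileged users are capped at /proc/sys/fs/pipe-max-size */
	sz = fcntl(pfd[0], F_SETPIPE_SZ, 1024 * 1024);
	printf("pipe buffer is now %d bytes\n", sz);

	return sz < 0;
}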

Thanks,

	Ingo


end of thread

Thread overview: 19+ messages
2013-10-09  3:26 [PATCH 0/3] perf trace enhancements David Ahern
2013-10-09  3:26 ` [PATCH 1/3] perf util: Add findnew method to intlist - v2 David Ahern
2013-10-15  5:32   ` [tip:perf/core] perf util: Add findnew method to intlist tip-bot for David Ahern
2013-10-09  3:26 ` [PATCH 2/3] perf trace: Add summary option to dump syscall statistics David Ahern
2013-10-09 13:16   ` Jiri Olsa
2013-10-15  5:32   ` [tip:perf/core] " tip-bot for David Ahern
2013-10-09  3:26 ` [PATCH 3/3] perf record: mmap output file David Ahern
2013-10-09  5:59   ` Ingo Molnar
2013-10-15 14:04     ` David Ahern
2013-10-15 14:32       ` Arnaldo Carvalho de Melo
2013-10-15 14:38         ` Peter Zijlstra
2013-10-15 15:27           ` Ingo Molnar
2013-10-15 15:29             ` Peter Zijlstra
2013-10-15 15:30             ` David Ahern
2013-10-15 16:06               ` Ingo Molnar
2013-10-15 16:16                 ` David Ahern
2013-10-16  7:11                   ` Ingo Molnar
2013-10-15 16:37                 ` Peter Zijlstra
2013-10-09  7:14   ` Mike Galbraith
