All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
To: rostedt@goodmis.org
Cc: linux-trace-devel@vger.kernel.org
Subject: [PATCH 4/4] trace-cmd: Add new subcomand "trace-cmd perf"
Date: Fri,  7 Aug 2020 15:06:55 +0300	[thread overview]
Message-ID: <20200807120655.797084-5-tz.stoyanov@gmail.com> (raw)
In-Reply-To: <20200807120655.797084-1-tz.stoyanov@gmail.com>

The new sub command uses perf to collect performance information of the
selected process. It has one mandatory argument:
trace-cmd perf --pid <PID>

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 tracecmd/Makefile              |   1 +
 tracecmd/include/trace-local.h |   2 +
 tracecmd/trace-cmd.c           |   1 +
 tracecmd/trace-perf.c          | 540 +++++++++++++++++++++++++++++++++
 4 files changed, 544 insertions(+)
 create mode 100644 tracecmd/trace-perf.c

diff --git a/tracecmd/Makefile b/tracecmd/Makefile
index f9435558..64cd7430 100644
--- a/tracecmd/Makefile
+++ b/tracecmd/Makefile
@@ -32,6 +32,7 @@ TRACE_CMD_OBJS += trace-list.o
 TRACE_CMD_OBJS += trace-usage.o
 TRACE_CMD_OBJS += trace-dump.o
 TRACE_CMD_OBJS += trace-obj-debug.o
+TRACE_CMD_OBJS += trace-perf.o
 ifeq ($(VSOCK_DEFINED), 1)
 TRACE_CMD_OBJS += trace-tsync.o
 endif
diff --git a/tracecmd/include/trace-local.h b/tracecmd/include/trace-local.h
index ccae61d4..2ab915ac 100644
--- a/tracecmd/include/trace-local.h
+++ b/tracecmd/include/trace-local.h
@@ -101,6 +101,8 @@ void trace_usage(int argc, char **argv);
 
 void trace_dump(int argc, char **argv);
 
+void trace_perf(int argc, char **argv);
+
 int trace_record_agent(struct tracecmd_msg_handle *msg_handle,
 		       int cpus, int *fds,
 		       int argc, char **argv, bool use_fifos,
diff --git a/tracecmd/trace-cmd.c b/tracecmd/trace-cmd.c
index 7376c5a5..0c5b324f 100644
--- a/tracecmd/trace-cmd.c
+++ b/tracecmd/trace-cmd.c
@@ -104,6 +104,7 @@ struct command commands[] = {
 	{"list", trace_list},
 	{"help", trace_usage},
 	{"dump", trace_dump},
+	{"perf", trace_perf},
 	{"-h", trace_usage},
 };
 
diff --git a/tracecmd/trace-perf.c b/tracecmd/trace-perf.c
new file mode 100644
index 00000000..0b100aa0
--- /dev/null
+++ b/tracecmd/trace-perf.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2019, VMware, Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
+ *
+ */
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+#include <sys/mman.h>
+
+
+#include "trace-local.h"
+#include "trace-cmd-local.h"
+
+#define BUF_SHIFT	8
+
+struct perf_stat {
+	__u64 count;
+	__u64 time_enabled;
+	__u64 time_running;
+	__u64 id;
+};
+
+struct perf_cpu {
+	int cpu;
+	int fd;
+	char *file;
+	pthread_t tid;
+	struct perf_event_mmap_page *mpage;
+};
+
+struct perf_session {
+	int pid;
+	int cpu_count;
+	struct perf_cpu *cpus;
+	struct trace_debug_object *fdebug;
+};
+
+static long perf_event_open(struct perf_event_attr *event, pid_t pid,
+			    int cpu, int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, event, pid, cpu, group_fd, flags);
+}
+
+/*
+ * struct {
+ *	struct perf_event_header	header;
+ *
+ *	#
+ *	# Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
+ *	# The advantage of PERF_SAMPLE_IDENTIFIER is that its position
+ *	# is fixed relative to header.
+ *	#
+ *
+ *	{ u64			id;	  } && PERF_SAMPLE_IDENTIFIER
+ *	{ u64			ip;	  } && PERF_SAMPLE_IP
+ *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+ *	{ u64			time;     } && PERF_SAMPLE_TIME
+ *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+ *	{ u64			id;	  } && PERF_SAMPLE_ID
+ *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+ *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+ *	{ u64			period;   } && PERF_SAMPLE_PERIOD
+ *
+ *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
+ *
+ *	{ u64			nr,
+ *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+ *
+ *	#
+ *	# The RAW record below is opaque data wrt the ABI
+ *	#
+ *	# That is, the ABI doesn't make any promises wrt to
+ *	# the stability of its content, it may vary depending
+ *	# on event, hardware, kernel version and phase of
+ *	# the moon.
+ *	#
+ *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+ *	#
+ *
+ *	{ u32			size;
+ *	  char                  data[size];}&& PERF_SAMPLE_RAW
+ *
+ *	{ u64                   nr;
+ *	  { u64	hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX
+ *        { u64 from, to, flags } lbr[nr];
+ *      } && PERF_SAMPLE_BRANCH_STACK
+ *
+ *	{ u64			abi; # enum perf_sample_regs_abi
+ *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+ *
+ *	{ u64			size;
+ *	  char			data[size];
+ *	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
+ *
+ *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
+ *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
+ *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
+ *	{ u64			abi; # enum perf_sample_regs_abi
+ *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ *	{ u64			size;
+ *	  char			data[size]; } && PERF_SAMPLE_AUX
+ * };
+ */
+
+struct perf_event_sample {
+	struct perf_event_header head;
+	uint64_t id; //PERF_SAMPLE_IDENTIFIER
+	uint32_t pid, tid; //PERF_SAMPLE_TID
+	uint64_t time; //PERF_SAMPLE_TIME
+	uint32_t cpu, res; // PERF_SAMPLE_CPU
+	uint64_t period; // PERF_SAMPLE_PERIOD
+	uint64_t nr;
+	uint64_t ips[]; // PERF_SAMPLE_CALLCHAIN
+} __attribute__((packed));
+
+static void handle_event(struct perf_event_sample *sample,
+			 struct perf_cpu *cpu, int fd)
+{
+	int i;
+#if 0
+	printf(" Sample %d/%ld %d %d\n", sample->head.size,
+					  sizeof(struct perf_event_sample),
+					  sample->head.type, sample->head.misc);
+	printf("  ID %ld, TID (%d, %d), TIME %ld, CPU (%d, %d), PERIOD %ld, STACK %ld\n    ",
+			sample->id, sample->pid, sample->tid, sample->time,
+			sample->cpu, sample->res, sample->period, sample->nr);
+#endif
+	write(fd, &sample->time, sizeof(uint64_t));
+	write(fd, &sample->nr, sizeof(uint64_t));
+	for (i = 0; i < sample->nr; i++) {
+		write(fd, &sample->ips[i], sizeof(uint64_t));
+//		printf("0x%lx ", sample->ips[i]);
+	}
+//	printf("\n");
+}
+
+static void perf_event_read(struct perf_cpu *cpu, int fd)
+{
+	struct perf_event_header *header;
+	int rd = 0, all = 0;
+	unsigned char *data;
+
+
+	data = (unsigned char *)cpu->mpage + getpagesize();
+
+	while (cpu->mpage->data_tail != cpu->mpage->data_head) {
+		while (cpu->mpage->data_tail >= BUF_SHIFT * getpagesize())
+			cpu->mpage->data_tail -= BUF_SHIFT * getpagesize();
+
+		header = (struct perf_event_header *)(data + cpu->mpage->data_tail);
+		if (header->size == 0)
+			break;
+
+		cpu->mpage->data_tail += header->size;
+
+		while (cpu->mpage->data_tail >= BUF_SHIFT * getpagesize())
+			cpu->mpage->data_tail -= BUF_SHIFT * getpagesize();
+		all++;
+		if (header->type == PERF_RECORD_SAMPLE) {
+			rd++;
+			handle_event((struct perf_event_sample *)header, cpu, fd);
+		}
+	}
+	cpu->mpage->data_tail = cpu->mpage->data_head;
+}
+
+static void perf_init_pe(struct perf_event_attr *pe)
+{
+	memset(pe, 0, sizeof(struct perf_event_attr));
+	pe->type = PERF_TYPE_HARDWARE;
+	pe->sample_type = PERF_SAMPLE_CALLCHAIN |
+			  PERF_SAMPLE_IDENTIFIER |
+			  PERF_SAMPLE_TID |
+			  PERF_SAMPLE_TIME |
+			  PERF_SAMPLE_CPU |
+			  PERF_SAMPLE_PERIOD;
+	pe->size = sizeof(struct perf_event_attr);
+	pe->config = PERF_COUNT_HW_CPU_CYCLES;
+	pe->disabled = 1;
+	pe->exclude_kernel = 1;
+	pe->freq = 1;
+	pe->sample_freq = 1000;
+	pe->inherit = 1;
+	pe->mmap = 1;
+	pe->comm = 1;
+	pe->task = 1;
+	pe->precise_ip = 1;
+	pe->sample_id_all = 1;
+//	pe->mmap2 = 1;
+	pe->comm_exec = 1;
+	pe->ksymbol = 1;
+	pe->bpf_event = 1;
+	pe->read_format = PERF_FORMAT_ID |
+			PERF_FORMAT_TOTAL_TIME_ENABLED|
+			PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+}
+
+static int perf_enable(struct perf_cpu *perf, bool enable)
+{
+	int ret;
+
+	if (enable) {
+		ret = ioctl(perf->fd, PERF_EVENT_IOC_RESET, 0);
+		ret = ioctl(perf->fd, PERF_EVENT_IOC_ENABLE, 0);
+	} else
+		ret = ioctl(perf->fd, PERF_EVENT_IOC_DISABLE, 0);
+
+	return ret;
+}
+
+static int perf_mmap(struct perf_cpu *perf)
+{
+	/* associate a buffer with the file */
+	perf->mpage = mmap(NULL, (BUF_SHIFT + 1)*getpagesize(),
+			PROT_READ | PROT_WRITE, MAP_SHARED, perf->fd, 0);
+	if (perf->mpage == MAP_FAILED)
+		return -1;
+	return 0;
+}
+
+static void perf_dump_stat(struct perf_cpu *perf)
+{
+	struct perf_stat stat;
+	int ret;
+
+	ret = read(perf->fd, &stat, sizeof(stat));
+	if (ret > 0)
+		printf("cpu %d: %lld %lld %lld %lld\n",
+			perf->cpu, stat.count,
+			stat.time_enabled, stat.time_running,
+			stat.id);
+}
+
+static void perf_session_destroy(struct perf_session *perf)
+{
+	int i;
+
+	if (perf->cpu_count && perf->cpus) {
+		for (i = 0; i < perf->cpu_count; i++) {
+			if (perf->cpus[i].fd >= 0)
+				close(perf->cpus[i].fd);
+			if (perf->cpus[i].mpage)
+				munmap(perf->cpus[i].mpage,
+				       (BUF_SHIFT + 1)*getpagesize());
+			if (perf->cpus[i].file) {
+				remove(perf->cpus[i].file);
+				free(perf->cpus[i].file);
+			}
+		}
+		free(perf->cpus);
+	}
+
+	if (perf->fdebug)
+		trace_debug_obj_destroy(perf->fdebug);
+
+	free(perf);
+}
+
+#define TMP_FILE "/tmp/perf_temp_data.XXXXXX"
+static struct perf_session *perf_session_new(int pid)
+{
+	int cpus = tracecmd_count_cpus();
+	struct perf_event_attr pe;
+	struct perf_session *perf;
+	char *template;
+	int i;
+
+	template = strdup(TMP_FILE);
+	if (!template)
+		return NULL;
+	perf = calloc(cpus, sizeof(struct perf_session));
+	if (!perf)
+		return NULL;
+	perf->fdebug = trace_debug_obj_create_pid(pid);
+	if (!perf->fdebug)
+		goto error;
+	perf->cpus = calloc(cpus, sizeof(struct perf_cpu));
+	if (!cpus)
+		goto error;
+	for (i = 0; i < cpus; i++)
+		perf->cpus[i].fd = -1;
+
+	perf_init_pe(&pe);
+	for (i = 0; i < cpus; i++) {
+		perf->cpus[i].fd = perf_event_open(&pe, pid, i, -1, 0);
+		if (perf->cpus[i].fd < 0)
+			goto error;
+		fcntl(perf->cpus[i].fd, F_SETFL, O_NONBLOCK);
+		perf->cpus[i].cpu = i;
+		if (perf_mmap(&perf->cpus[i]) < 0)
+			goto error;
+		strcpy(template, TMP_FILE);
+		mktemp(template);
+		perf->cpus[i].file = strdup(template);
+	}
+	perf->cpu_count = cpus;
+	perf->pid = pid;
+	free(template);
+	return perf;
+error:
+	free(template);
+	perf_session_destroy(perf);
+	return NULL;
+}
+
+void *perf_read_thread(void *context)
+{
+	struct perf_cpu *cpu = (struct perf_cpu *)context;
+	struct perf_stat stat;
+	fd_set rset;
+	int ret;
+	int fd;
+
+	fd = open(cpu->file, O_WRONLY|O_CREAT|O_TRUNC, 0600);
+	if (fd < 0)
+		return NULL;
+
+	perf_enable(cpu, true);
+
+	while (1) {
+//		FD_ZERO(&rset);
+//		FD_SET(cpu->fd, &rset);
+//		ret = select(cpu->fd + 1, &rset, NULL, NULL, NULL);
+		ret = read(cpu->fd, &stat, sizeof(stat));
+//		if ( ret > 0 && stat.count > 0) {
+//			printf("Got %lld on CPU %d\n", stat.count, cpu->cpu);
+			perf_event_read(cpu, fd);
+//		}
+	}
+	return NULL;
+}
+
+#define PERF_READER_SCHEDULER SCHED_RR
+static int perf_run_readers(struct perf_session *perf)
+{
+	struct sched_param sched;
+	pthread_attr_t attr;
+	int i;
+
+	sched.sched_priority = sched_get_priority_max(PERF_READER_SCHEDULER);
+	pthread_attr_init(&attr);
+	pthread_attr_setschedpolicy(&attr, PERF_READER_SCHEDULER);
+	pthread_attr_setschedparam(&attr, &sched);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+	for (i = 0; i < perf->cpu_count; i++) {
+		if (pthread_create(&perf->cpus[i].tid, &attr,
+				   perf_read_thread, &perf->cpus[i]))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int perf_stop_readers(struct perf_session *perf)
+{
+	int i;
+
+	for (i = 0; i < perf->cpu_count; i++)
+		pthread_cancel(perf->cpus[i].tid);
+
+	for (i = 0; i < perf->cpu_count; i++)
+		pthread_join(perf->cpus[i].tid, NULL);
+
+	return 0;
+}
+
+static void perf_collect_results(struct perf_session *perf, int cpu)
+{
+	unsigned long long time, count, ip;
+	int fd, i;
+	int ret;
+
+	fd = open(perf->cpus[cpu].file, O_RDONLY, 0600);
+	if (fd < 0)
+		return;
+	do {
+		ret = read(fd, &time, sizeof(time));
+		if (ret != sizeof(time))
+			break;
+
+		ret = read(fd, &count, sizeof(count));
+		if (ret != sizeof(count))
+			break;
+
+		printf(" %lld %lld: ", time, count);
+		for (i = 0; i < count; i++) {
+			ret = read(fd, &ip, sizeof(ip));
+			if (ret != sizeof(ip))
+				break;
+			ret = trace_debug_add_resolve_symbol(perf->fdebug, ip, NULL);
+			if (ret < 0)
+				printf("0x%llx(unknown) ", ip);
+			else
+				printf("0x%llx ", ip);
+		}
+		printf("\n");
+		if (i < count)
+			break;
+	} while (1);
+	close(fd);
+}
+
+static int perf_collect(struct perf_session *perf)
+{
+	int i;
+
+	if (perf_run_readers(perf))
+		return -1;
+
+	printf("Collecting callstack of pid %d\n", perf->pid);
+	sleep(2);
+
+	for (i = 0; i < perf->cpu_count; i++)
+		perf_enable(&perf->cpus[i], false);
+
+	perf_stop_readers(perf);
+
+	for (i = 0; i < perf->cpu_count; i++) {
+		perf_dump_stat(&perf->cpus[i]);
+		perf_collect_results(perf, i);
+	}
+
+	return 0;
+}
+
+struct symb_walk_context {
+	struct trace_seq *s;
+	int count;
+};
+
+static int perf_symbols_walk(struct tracecmd_debug_symbols *symb, void *data)
+{
+	struct symb_walk_context *context = (struct symb_walk_context *)data;
+
+	trace_seq_printf(context->s, "%s %s %llx %llx\n",
+			 symb->fname, symb->name,
+			 symb->vma_start, symb->vma_near);
+	context->count++;
+
+	return 0;
+}
+
+static void
+add_pid_symbols(struct tracecmd_output *handle, struct perf_session *perf)
+{
+	struct symb_walk_context context;
+	struct trace_seq body, head;
+
+	trace_seq_init(&body);
+	trace_seq_init(&head);
+	context.s = &body;
+	context.count = 0;
+	trace_debug_walk_resolved_symbols(perf->fdebug, perf_symbols_walk, &context);
+	trace_seq_printf(&head, "%x %d\n", perf->pid, context.count);
+	trace_seq_terminate(&head);
+	trace_seq_terminate(&body);
+	trace_seq_puts(&head, body.buffer);
+	if (handle && context.count)
+		tracecmd_add_option(handle, TRACECMD_OPTION_PID_SYMBOLS,
+				    head.len + 1, head.buffer);
+	else
+		trace_seq_do_printf(&head);
+	trace_seq_destroy(&body);
+}
+
+static int perf_resolve_trace(struct perf_session *perf)
+{
+	trace_debug_resolve_symbols(perf->fdebug);
+	add_pid_symbols(NULL, perf);
+
+	return 0;
+}
+
+int perf_run(int pid)
+{
+	struct perf_session *perf;
+
+	perf = perf_session_new(pid);
+	if (perf == NULL)
+		return -1;
+
+	perf_collect(perf);
+
+	perf_resolve_trace(perf);
+
+	perf_session_destroy(perf);
+	return 0;
+}
+
+enum {
+	OPT_pid	= 255,
+};
+
+void trace_perf(int argc, char **argv)
+{
+	int pid = 0;
+	int c;
+
+
+	if (strcmp(argv[1], "perf") != 0)
+		usage(argv);
+	for (;;) {
+		int option_index = 0;
+		static struct option long_options[] = {
+			{"pid", required_argument, NULL, OPT_pid},
+			{"help", no_argument, NULL, '?'},
+			{NULL, 0, NULL, 0}
+		};
+
+		c = getopt_long (argc-1, argv+1, "+hp:",
+				long_options, &option_index);
+		if (c == -1)
+			break;
+		switch (c) {
+
+		case OPT_pid:
+		case 'p':
+			pid = atoi(optarg);
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage(argv);
+		}
+	}
+	perf_run(pid);
+}
-- 
2.26.2


      parent reply	other threads:[~2020-08-07 12:07 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-07 12:06 [PATCH 0/4][WiP] trace-cmd: Add "perf" sub command Tzvetomir Stoyanov (VMware)
2020-08-07 12:06 ` [PATCH 1/4] trace-cmd: Internal refactoring of pid address map logic Tzvetomir Stoyanov (VMware)
2020-08-07 12:06 ` [PATCH 2/4] trace-cmd: New internal APIs for reading ELF header Tzvetomir Stoyanov (VMware)
2020-08-07 12:06 ` [PATCH 3/4] trace-cmd: Add a new option in trace.dat file for the address to function name mapping Tzvetomir Stoyanov (VMware)
2020-08-07 12:06 ` Tzvetomir Stoyanov (VMware) [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200807120655.797084-5-tz.stoyanov@gmail.com \
    --to=tz.stoyanov@gmail.com \
    --cc=linux-trace-devel@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.