linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Arnaldo Carvalho de Melo <acme@kernel.org>
To: Ingo Molnar <mingo@kernel.org>
Cc: "Clark Williams" <williams@redhat.com>,
	linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org,
	"Arnaldo Carvalho de Melo" <acme@redhat.com>,
	"Adrian Hunter" <adrian.hunter@intel.com>,
	"Jiri Olsa" <jolsa@kernel.org>,
	"Luis Cláudio Gonçalves" <lclaudio@redhat.com>,
	"Namhyung Kim" <namhyung@kernel.org>,
	"Wang Nan" <wangnan0@huawei.com>
Subject: [PATCH 04/29] perf trace: Do not hardcode the size of the tracepoint common_ fields
Date: Thu,  3 Jan 2019 09:45:44 -0300	[thread overview]
Message-ID: <20190103124609.29672-5-acme@kernel.org> (raw)
In-Reply-To: <20190103124609.29672-1-acme@kernel.org>

From: Arnaldo Carvalho de Melo <acme@redhat.com>

We shouldn't hardcode the size of the tracepoint common_ fields, use the
offset of the 'id'/'__syscallnr' field in the sys_enter event instead.

This caused the augmented syscalls code to fail on a particular build of a
PREEMPT_RT_FULL kernel where these extra 'common_migrate_disable' and
'common_padding' fields were before the syscall id one:

  # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/format
  name: sys_enter
  ID: 22
  format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;
	field:unsigned short common_migrate_disable;	offset:8;	size:2;	signed:0;
	field:unsigned short common_padding;	offset:10;	size:2;	signed:0;

	field:long id;	offset:16;	size:8;	signed:1;
	field:unsigned long args[6];	offset:24;	size:48;	signed:0;

  print fmt: "NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", REC->id, REC->args[0], REC->args[1], REC->args[2], REC->args[3], REC->args[4], REC->args[5]
  #

All those 'common_' prefixed fields are zeroed when they hit a BPF tracepoint
hook, we better just discard those, i.e. somehow pass an offset to the
BPF program from the start of the ctx and make adjustments in the 'perf trace'
handlers to adjust the offset of the syscall arg offsets obtained from tracefs.

Till then, fix it the quick way and add this to the augmented_raw_syscalls.c to
bet it to work in such kernels:

  diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
  index 53c233370fae..1f746f931e13 100644
  --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
  +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
  @@ -38,12 +38,14 @@ struct bpf_map SEC("maps") syscalls = {

   struct syscall_enter_args {
          unsigned long long common_tp_fields;
  +       long               rt_common_tp_fields;
          long               syscall_nr;
          unsigned long      args[6];
   };

   struct syscall_exit_args {
          unsigned long long common_tp_fields;
  +       long               rt_common_tp_fields;
          long               syscall_nr;
          long               ret;
   };

Just to check that this was the case. Fix it properly later, for now remove the
hardcoding of the offset in the 'perf trace' side and document the situation
with this patch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Luis Cláudio Gonçalves <lclaudio@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-2pqavrktqkliu5b9nzouio21@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c | 73 +++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6689c1a114fe..1e9e886b2811 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -112,8 +112,9 @@ struct trace {
 	} stats;
 	unsigned int		max_stack;
 	unsigned int		min_stack;
-	bool			sort_events;
+	int			raw_augmented_syscalls_args_size;
 	bool			raw_augmented_syscalls;
+	bool			sort_events;
 	bool			not_ev_qualifier;
 	bool			live;
 	bool			full_time;
@@ -283,12 +284,17 @@ static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
 	return -ENOENT;
 }
 
-static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
+static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
 {
 	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
 
-	if (evsel->priv != NULL) {       /* field, sizeof_field, offsetof_field */
-		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
+	if (evsel->priv != NULL) {
+		struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
+		if (syscall_id == NULL)
+			syscall_id = perf_evsel__field(tp, "__syscall_nr");
+		if (syscall_id == NULL)
+			goto out_delete;
+		if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
 			goto out_delete;
 
 		return 0;
@@ -1768,16 +1774,16 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
 	return printed;
 }
 
-static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented)
+static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
 {
 	void *augmented_args = NULL;
 	/*
 	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
-	 * and there we get all 6 syscall args plus the tracepoint common
-	 * fields (sizeof(long)) and the syscall_nr (another long). So we check
-	 * if that is the case and if so don't look after the sc->args_size,
-	 * but always after the full raw_syscalls:sys_enter payload, which is
-	 * fixed.
+	 * and there we get all 6 syscall args plus the tracepoint common fields
+	 * that gets calculated at the start and the syscall_nr (another long).
+	 * So we check if that is the case and if so don't look after the
+	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
+	 * which is fixed.
 	 *
 	 * We'll revisit this later to pass s->args_size to the BPF augmenter
 	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
@@ -1785,7 +1791,7 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam
 	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
 	 * traffic to just what is needed for each syscall.
 	 */
-	int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size;
+	int args_size = raw_augmented_args_size ?: sc->args_size;
 
 	*augmented_args_size = sample->raw_size - args_size;
 	if (*augmented_args_size > 0)
@@ -1839,7 +1845,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
 	 */
 	if (evsel != trace->syscalls.events.sys_enter)
-		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
+		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
 	ttrace->entry_time = sample->time;
 	msg = ttrace->entry_str;
 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
@@ -1897,7 +1903,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse
 		goto out_put;
 
 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
-	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
+	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
 	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
 	fprintf(trace->output, "%s", msg);
 	err = 0;
@@ -3814,13 +3820,6 @@ int cmd_trace(int argc, const char **argv)
 	 * syscall.
 	 */
 	if (trace.syscalls.events.augmented) {
-		evsel = trace.syscalls.events.augmented;
-
-		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
-		    perf_evsel__init_augmented_syscall_tp_args(evsel))
-			goto out;
-		evsel->handler = trace__sys_enter;
-
 		evlist__for_each_entry(trace.evlist, evsel) {
 			bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
 
@@ -3829,9 +3828,41 @@ int cmd_trace(int argc, const char **argv)
 				goto init_augmented_syscall_tp;
 			}
 
+			if (strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_enter") == 0) {
+				struct perf_evsel *augmented = trace.syscalls.events.augmented;
+				if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
+				    perf_evsel__init_augmented_syscall_tp_args(augmented))
+					goto out;
+				augmented->handler = trace__sys_enter;
+			}
+
 			if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
+				struct syscall_tp *sc;
 init_augmented_syscall_tp:
-				perf_evsel__init_augmented_syscall_tp(evsel);
+				if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
+					goto out;
+				sc = evsel->priv;
+				/*
+				 * For now with BPF raw_augmented we hook into
+				 * raw_syscalls:sys_enter and there we get all
+				 * 6 syscall args plus the tracepoint common
+				 * fields and the syscall_nr (another long).
+				 * So we check if that is the case and if so
+				 * don't look after the sc->args_size but
+				 * always after the full raw_syscalls:sys_enter
+				 * payload, which is fixed.
+				 *
+				 * We'll revisit this later to pass
+				 * s->args_size to the BPF augmenter (now
+				 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
+				 * so that it copies only what we need for each
+				 * syscall, like what happens when we use
+				 * syscalls:sys_enter_NAME, so that we reduce
+				 * the kernel/userspace traffic to just what is
+				 * needed for each syscall.
+				 */
+				if (trace.raw_augmented_syscalls)
+					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
 				perf_evsel__init_augmented_syscall_tp_ret(evsel);
 				evsel->handler = trace__sys_exit;
 			}
-- 
2.19.2


  parent reply	other threads:[~2019-01-03 12:46 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-03 12:45 [GIT PULL 00/29] perf/core improvements and fixes Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 01/29] perf trace: Check if the raw_syscalls:sys_{enter,exit} are setup before setting tp filter Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 02/29] perf beauty mmap: PROT_WRITE should come before PROT_EXEC Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 03/29] perf build: Don't unconditionally link the libbfd feature test to -liberty and -lz Arnaldo Carvalho de Melo
2019-01-03 12:45 ` Arnaldo Carvalho de Melo [this message]
2019-01-03 12:45 ` [PATCH 05/29] perf trace: Use correct SECCOMP prefix spelling, "SECOMP_*" -> "SECCOMP_*" Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 06/29] perf python: Do not force closing original perf descriptor in evlist.get_pollfd() Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 07/29] perf script: Fix LBR skid dump problems in brstackinsn Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 08/29] perf trace: Rename thread_thread->paths to thread_trace->files Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 09/29] perf trace: Move the files table resizing to outside set_pathname() Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 10/29] perf trace: Store the major number for a file when storing its pathname Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 11/29] tools headers uapi: Grab a copy of usbdevice_fs.h Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 12/29] perf beauty ioctl: Add generator for USBDEVFS_ ioctl commands Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 13/29] perf trace: Wire up ioctl's USBDEBFS_ cmd table generator Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 14/29] perf trace beauty: Export function to get the files for a thread Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 15/29] perf trace beauty ioctl: Beautify USBDEVFS_ commands Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 16/29] perf c2c: Change the default coalesce setup Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 17/29] perf c2c: Increase the HITM ratio limit for displayed cachelines Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 18/29] tools power x86_energy_perf_policy: Override CFLAGS assignments and add LDFLAGS to build command Arnaldo Carvalho de Melo
2019-01-03 12:45 ` [PATCH 19/29] tools thermal tmon: Allow overriding CFLAGS assignments Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 20/29] tools power turbostat: Override CFLAGS assignments and add LDFLAGS to build command Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 21/29] tools gpio: Allow overriding CFLAGS Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 22/29] perf thread-stack: Simplify some code in thread_stack__process() Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 23/29] perf thread-stack: Tidy thread_stack__bottom() usage Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 24/29] perf thread-stack: Avoid direct reference to the thread's stack Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 25/29] perf thread-stack: Allow for a thread stack array Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 26/29] perf thread-stack: Factor out thread_stack__init() Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 27/29] perf thread-stack: Allocate an array of thread stacks Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 28/29] perf thread-stack: Fix thread stack processing for the idle task Arnaldo Carvalho de Melo
2019-01-03 12:46 ` [PATCH 29/29] perf session: Add comment for perf_session__register_idle_thread() Arnaldo Carvalho de Melo
2019-01-03 13:07 ` [GIT PULL 00/29] perf/core improvements and fixes Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190103124609.29672-5-acme@kernel.org \
    --to=acme@kernel.org \
    --cc=acme@redhat.com \
    --cc=adrian.hunter@intel.com \
    --cc=jolsa@kernel.org \
    --cc=lclaudio@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=wangnan0@huawei.com \
    --cc=williams@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).