linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jiri Olsa <jolsa@kernel.org>
To: Arnaldo Carvalho de Melo <acme@kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: lkml <linux-kernel@vger.kernel.org>,
	Ingo Molnar <mingo@kernel.org>,
	Namhyung Kim <namhyung@kernel.org>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"Luis Claudio R. Goncalves" <lclaudio@uudg.org>,
	ldv@altlinux.org, esyr@redhat.com,
	Frederic Weisbecker <fweisbec@gmail.com>
Subject: [PATCH 1/8] perf: Allow to block process in syscall tracepoints
Date: Wed,  5 Dec 2018 17:05:02 +0100	[thread overview]
Message-ID: <20181205160509.1168-2-jolsa@kernel.org> (raw)
In-Reply-To: <20181205160509.1168-1-jolsa@kernel.org>

Add support for specifying a 'block' bool in struct perf_event_attr
for syscall tracepoints, allowing the event to block the process
if there's no space in the ring buffer.

The blocking code will poll/periodically check for the space and
continue if the event was successfully written.

It's allowed only for syscall tracepoint events attached to a
process. The following syscall events are supported:

  raw_syscalls:sys_enter
  raw_syscalls:sys_exit
  syscalls:sys_enter_accept
  syscalls:sys_enter_accept4
  syscalls:sys_enter_access
  syscalls:sys_enter_acct
  syscalls:sys_enter_add_key
  ...

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/n/tip-ocz7zwwkkx11v0mkxrtcddih@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/common.c         | 36 +++++++++++++++++++++++++++--
 include/linux/perf_event.h      |  2 ++
 include/linux/sched.h           |  2 ++
 include/linux/syscalls.h        |  2 ++
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 40 +++++++++++++++++++++++++++++++--
 kernel/events/ring_buffer.c     |  4 +++-
 kernel/trace/trace_event_perf.c |  4 ++++
 kernel/trace/trace_syscalls.c   | 28 +++++++++++++++++++----
 9 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b81918..e55cf9169a03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -60,6 +60,32 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
 	}
 }
 
+static void trace_block_syscall(struct pt_regs *regs, bool enter)
+{
+	current->perf_blocked = true;
+
+	do {
+		schedule_timeout(100 * HZ);
+		current->perf_blocked_cnt = 0;
+
+		if (enter) {
+			/* perf syscalls:* enter */
+			perf_trace_syscall_enter(regs);
+
+			/* perf raw_syscalls:* enter */
+			perf_trace_sys_enter(&event_sys_enter, regs, regs->orig_ax);
+		} else {
+			/* perf syscalls:* exit */
+			perf_trace_syscall_exit(regs);
+
+			/* perf raw_syscalls:* exit */
+			perf_trace_sys_exit(&event_sys_exit, regs, regs->ax);
+		}
+	} while (current->perf_blocked_cnt);
+
+	current->perf_blocked = false;
+}
+
 /*
  * Returns the syscall nr to run (which should match regs->orig_ax) or -1
  * to skip the syscall.
@@ -123,8 +149,11 @@ static long syscall_trace_enter(struct pt_regs *regs)
 	}
 #endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) {
 		trace_sys_enter(regs, regs->orig_ax);
+		if (current->perf_blocked_cnt)
+			trace_block_syscall(regs, true);
+	}
 
 	do_audit_syscall_entry(regs, arch);
 
@@ -224,8 +253,11 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
 
 	audit_syscall_exit(regs);
 
-	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+	if (cached_flags & _TIF_SYSCALL_TRACEPOINT) {
 		trace_sys_exit(regs, regs->ax);
+		if (current->perf_blocked_cnt)
+			trace_block_syscall(regs, false);
+	}
 
 	/*
 	 * If TIF_SYSCALL_EMU is set, we only get here because of
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 47a31d01df5a..904b7245357a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -695,6 +695,8 @@ struct perf_event {
 #endif
 
 	struct list_head		sb_list;
+
+	bool				blocked;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a51c13c2b1a0..aea741ef29ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1009,6 +1009,8 @@ struct task_struct {
 	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex			perf_event_mutex;
 	struct list_head		perf_event_list;
+	bool				perf_blocked;
+	unsigned int			perf_blocked_cnt;
 #endif
 #ifdef CONFIG_DEBUG_PREEMPT
 	unsigned long			preempt_disable_ip;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ac3d13a915b..3c8012ca9aa3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1296,4 +1296,6 @@ static inline unsigned int ksys_personality(unsigned int personality)
 	return old;
 }
 
+void perf_trace_syscall_enter(struct pt_regs *regs);
+void perf_trace_syscall_exit(struct pt_regs *regs);
 #endif
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..92bae4cf279c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@ struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				block          :  1, /* block process if there's no space in RB (syscall tracepoints only) */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7403a27363f8..8955c3ebbb58 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6489,6 +6489,23 @@ void perf_prepare_sample(struct perf_event_header *header,
 		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
+static bool perf_event_is_blocked(struct perf_event *event)
+{
+	bool blocked = event->attr.block && event->blocked;
+
+	if (blocked)
+		event->blocked = false;
+	return blocked;
+}
+
+static void perf_event_set_blocked(struct perf_event *event)
+{
+	if (event->attr.block) {
+		current->perf_blocked_cnt++;
+		event->blocked = true;
+	}
+}
+
 static __always_inline void
 __perf_event_output(struct perf_event *event,
 		    struct perf_sample_data *data,
@@ -6505,8 +6522,10 @@ __perf_event_output(struct perf_event *event,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (output_begin(&handle, event, header.size))
+	if (output_begin(&handle, event, header.size)) {
+		perf_event_set_blocked(event);
 		goto exit;
+	}
 
 	perf_output_sample(&handle, &header, data, event);
 
@@ -8264,7 +8283,7 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct pt_regs *regs, struct hlist_head *head,
 			       struct task_struct *task)
 {
-	if (bpf_prog_array_valid(call)) {
+	if (!current->perf_blocked && bpf_prog_array_valid(call)) {
 		*(struct pt_regs **)raw_data = regs;
 		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
 			perf_swevent_put_recursion_context(rctx);
@@ -8296,6 +8315,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	perf_trace_buf_update(record, event_type);
 
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
+		if (current->perf_blocked && !perf_event_is_blocked(event))
+			continue;
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
 	}
@@ -8314,6 +8335,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 			goto unlock;
 
 		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+			if (current->perf_blocked && !perf_event_is_blocked(event))
+				continue;
 			if (event->cpu != smp_processor_id())
 				continue;
 			if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -10461,6 +10484,19 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	if (attr.block) {
+		/*
+		 * Allow only syscall tracepoints, check for syscall class
+		 * is made in the tracepoint event_init callback.
+		 */
+		if (attr.type != PERF_TYPE_TRACEPOINT)
+			return -EINVAL;
+
+		/* Allow to block only if we attach to a process. */
+		if (pid == -1)
+			return -EINVAL;
+	}
+
 	/* Only privileged users can get physical addresses */
 	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
 	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4a9937076331..d28849365431 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -223,7 +223,9 @@ __perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	local_inc(&rb->lost);
+	/* Do not count lost if we are going to block and try again. */
+	if (!event->attr.block)
+		local_inc(&rb->lost);
 	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 76217bbef815..1efbb819539d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include "trace.h"
 #include "trace_probe.h"
 
@@ -85,6 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (p_event->attr.block && !is_syscall_trace_event(tp_event))
+		return -EINVAL;
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..a8fd7a81361e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -578,7 +578,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
 	return trace_call_bpf(call, &param);
 }
 
-static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void __perf_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
@@ -616,7 +616,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	if ((valid_prog_array &&
+	if ((!current->perf_blocked && valid_prog_array &&
 	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
@@ -628,6 +628,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 			      head, NULL);
 }
 
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+{
+	__perf_syscall_enter(regs, id);
+}
+
+void perf_trace_syscall_enter(struct pt_regs *regs)
+{
+	__perf_syscall_enter(regs, regs->orig_ax);
+}
+
 static int perf_sysenter_enable(struct trace_event_call *call)
 {
 	int ret = 0;
@@ -677,7 +687,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
 	return trace_call_bpf(call, &param);
 }
 
-static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void __perf_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
@@ -713,7 +723,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((valid_prog_array &&
+	if ((!current->perf_blocked && valid_prog_array &&
 	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
@@ -724,6 +734,16 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 			      1, regs, head, NULL);
 }
 
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+	__perf_syscall_exit(regs, ret);
+}
+
+void perf_trace_syscall_exit(struct pt_regs *regs)
+{
+	__perf_syscall_exit(regs, regs->ax);
+}
+
 static int perf_sysexit_enable(struct trace_event_call *call)
 {
 	int ret = 0;
-- 
2.17.2


  reply	other threads:[~2018-12-05 16:05 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-05 16:05 [RFC 1/8] perf: Block perf calls for system call tracepoints Jiri Olsa
2018-12-05 16:05 ` Jiri Olsa [this message]
2018-12-05 17:35   ` [PATCH 1/8] perf: Allow to block process in syscall tracepoints Steven Rostedt
2018-12-05 17:56     ` Jiri Olsa
2018-12-06  8:09   ` Peter Zijlstra
2018-12-06 10:30     ` Jiri Olsa
2018-12-06  8:10   ` Peter Zijlstra
2018-12-06  8:24     ` Jiri Olsa
2018-12-06 10:31       ` Peter Zijlstra
2018-12-06  8:34     ` Peter Zijlstra
2018-12-06 10:31       ` Jiri Olsa
2018-12-06 18:19       ` Steven Rostedt
2018-12-07  8:44         ` Jiri Olsa
2018-12-07  8:58         ` Peter Zijlstra
2018-12-07 13:41           ` Steven Rostedt
2018-12-07 15:11             ` Peter Zijlstra
2018-12-07 15:49               ` Arnaldo Carvalho de Melo
2018-12-08 10:41                 ` Peter Zijlstra
2018-12-08 17:34                   ` Steven Rostedt
2018-12-07 20:14               ` Steven Rostedt
2018-12-08 10:44                 ` Peter Zijlstra
2018-12-08 17:38                   ` Steven Rostedt
2018-12-10 10:18                     ` Peter Zijlstra
2018-12-13  0:39                       ` Dmitry V. Levin
2018-12-13  1:26                         ` Steven Rostedt
2018-12-13  1:49                           ` Dmitry V. Levin
2018-12-13 10:01                           ` Peter Zijlstra
2018-12-13 10:05                             ` Peter Zijlstra
2018-12-13 10:08                             ` Peter Zijlstra
2018-12-13 11:29                             ` Jiri Olsa
2018-12-06  8:17   ` Peter Zijlstra
2018-12-06 10:27     ` Jiri Olsa
2018-12-05 16:05 ` [PATCH 2/8] perf tools: Sync uapi perf_event.h Jiri Olsa
2018-12-05 16:05 ` [PATCH 3/8] perf record: Add --block option Jiri Olsa
2018-12-05 16:05 ` [PATCH 4/8] perf trace: " Jiri Olsa
2018-12-05 16:05 ` [PATCH 5/8] perf tools: Add block term support for tracepoints Jiri Olsa
2018-12-05 16:05 ` [PATCH 6/8] perf tools: Add ordered_events__flush_time interface Jiri Olsa
2018-12-14 21:00   ` [tip:perf/core] perf ordered_events: " tip-bot for Jiri Olsa
2018-12-18 14:27   ` tip-bot for Jiri Olsa
2018-12-05 16:05 ` [PATCH 7/8] perf trace: Move event delivery to deliver_event function Jiri Olsa
2018-12-14 21:01   ` [tip:perf/core] perf trace: Move event delivery to a new deliver_event() function tip-bot for Jiri Olsa
2018-12-18 14:28   ` tip-bot for Jiri Olsa
2018-12-05 16:05 ` [PATCH 8/8] perf trace: Add ordered processing for --block option Jiri Olsa
2018-12-14 21:02   ` [tip:perf/core] perf trace: Add ordered processing tip-bot for Jiri Olsa
2018-12-18 14:29   ` tip-bot for Jiri Olsa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181205160509.1168-2-jolsa@kernel.org \
    --to=jolsa@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=acme@kernel.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=esyr@redhat.com \
    --cc=fweisbec@gmail.com \
    --cc=lclaudio@uudg.org \
    --cc=ldv@altlinux.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).