[v3,06/11] perf: Add support for SIGTRAP on perf events
diff mbox series

Message ID 20210324112503.623833-7-elver@google.com
State New, archived
Headers show
Series
  • Add support for synchronous signals on perf events
Related show

Commit Message

Marco Elver March 24, 2021, 11:24 a.m. UTC
Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. To distinguish perf events and allow user space to decode
si_perf (if set), the event type is set in si_errno.

The primary motivation is to support synchronous signals on perf events
in the task where an event (such as breakpoints) triggered.

Link: https://lore.kernel.org/lkml/YBv3rAT566k+6zjg@hirez.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marco Elver <elver@google.com>
---
v2:
* Use atomic_set(&event_count, 1), since it must always be 0 in
  perf_pending_event_disable().
* Implicitly restrict inheriting events if sigtrap, but the child was
  cloned with CLONE_CLEAR_SIGHAND, because it is not generally safe if
  the child cleared all signal handlers to continue sending SIGTRAP.
---
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 28 +++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

Comments

Marco Elver March 25, 2021, 8:14 a.m. UTC | #1
On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote:
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b6434697c516..1e4c949bf75f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event)
>  	}
>  }
>  
> +static void perf_sigtrap(struct perf_event *event)
> +{
> +	struct kernel_siginfo info;
> +

I think we need to add something like this here:

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b82788fbaab..4fcd6b45ce66 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
 {
 	struct kernel_siginfo info;
 
+	/*
+	 * This irq_work can race with an exiting task; bail out if sighand has
+	 * already been released in release_task().
+	 */
+	if (!current->sighand)
+		return;
+
 	clear_siginfo(&info);
 	info.si_signo = SIGTRAP;
 	info.si_code = TRAP_PERF;


Because syzkaller was able to produce this:

| general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN
| KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f]
| CPU: 0 PID: 28393 Comm: kworker/u9:4 Not tainted 5.12.0-rc4+ #5
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:__lock_acquire+0x87/0x5e60 kernel/locking/lockdep.c:4770
| Code: 84 c0 48 89 7c 24 78 0f 85 10 26 00 00 83 3d 53 64 59 0c 00 0f 84 84 41 00 00 83 3d 72 8a 01 0b 00 74 32 48 89 f8 48 c1 e8 03 <80> 3c 30 00 74 19 48 8b 7c 24 78 e8 79 8b 60 00 48 8b 7c 24 78 48
| RSP: 0018:ffffc90000007c00 EFLAGS: 00010006
| RAX: 0000000000000003 RBX: ffff888048058000 RCX: 0000000000000000
| RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000018
| RBP: ffffc90000007da8 R08: 0000000000000001 R09: 0000000000000001
| R10: fffffbfff1b6b27e R11: 0000000000000000 R12: 0000000000000001
| R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
| FS:  0000000000000000(0000) GS:ffff88802ce00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000970004 CR3: 0000000040d91000 CR4: 0000000000750ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
| PKRU: 55555554
| Call Trace:
|  <IRQ>
|  lock_acquire+0x126/0x650 kernel/locking/lockdep.c:5510
|  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
|  _raw_spin_lock_irqsave+0x73/0xa0 kernel/locking/spinlock.c:159
|  force_sig_info_to_task+0x65/0x3f0 kernel/signal.c:1322
|  perf_sigtrap kernel/events/core.c:6418 [inline]
|  perf_pending_event_disable kernel/events/core.c:6433 [inline]
|  perf_pending_event+0x46f/0x620 kernel/events/core.c:6475
|  irq_work_single kernel/irq_work.c:153 [inline]
|  irq_work_run_list kernel/irq_work.c:175 [inline]
|  irq_work_run+0x1da/0x640 kernel/irq_work.c:184
|  __sysvec_irq_work+0x62/0x70 arch/x86/kernel/irq_work.c:22
|  sysvec_irq_work+0x8c/0xb0 arch/x86/kernel/irq_work.c:17
|  </IRQ>
|  asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:658
| RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:268 [inline]
| RIP: 0010:_raw_write_unlock_irq+0x25/0x40 kernel/locking/spinlock.c:343
| Code: aa fd ff 66 90 53 48 89 fb 48 83 c7 18 48 8b 74 24 08 e8 3e 34 04 f8 48 89 df e8 a6 1a 06 f8 e8 21 85 26 f8 fb bf 01 00 00 00 <e8> 56 19 fa f7 65 8b 05 77 65 a9 76 85 c0 74 02 5b c3 e8 2b c1 a7
| RSP: 0018:ffffc9000202fd68 EFLAGS: 00000286
| RAX: 2a7870700b93e400 RBX: ffffffff8c40a040 RCX: ffffffff8ff9cb03
| RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000001
| RBP: ffff888047b24790 R08: ffffffff817f0f50 R09: fffffbfff1b6b27e
| R10: fffffbfff1b6b27e R11: 0000000000000000 R12: ffff888048058000
| R13: dffffc0000000000 R14: ffff888047b24701 R15: ffff888048058000
|  release_task+0x10bf/0x1360 kernel/exit.c:220
|  exit_notify kernel/exit.c:699 [inline]
|  do_exit+0x19b0/0x2290 kernel/exit.c:845
|  call_usermodehelper_exec_async+0x39c/0x3a0 kernel/umh.c:123
|  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294


> +	clear_siginfo(&info);
> +	info.si_signo = SIGTRAP;
> +	info.si_code = TRAP_PERF;
> +	info.si_errno = event->attr.type;
> +	force_sig_info(&info);
> +}
> +
>  static void perf_pending_event_disable(struct perf_event *event)
>  {
>  	int cpu = READ_ONCE(event->pending_disable);
> @@ -6400,6 +6411,13 @@ static void perf_pending_event_disable(struct perf_event *event)
>  
>  	if (cpu == smp_processor_id()) {
>  		WRITE_ONCE(event->pending_disable, -1);
> +
> +		if (event->attr.sigtrap) {
> +			atomic_set(&event->event_limit, 1); /* rearm event */
> +			perf_sigtrap(event);
> +			return;
> +		}
> +
>  		perf_event_disable_local(event);
>  		return;
>  	}
[...]
Peter Zijlstra March 29, 2021, 12:07 p.m. UTC | #2
On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> On Wed, Mar 24, 2021 at 12:24PM +0100, Marco Elver wrote:
> [...]
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index b6434697c516..1e4c949bf75f 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6391,6 +6391,17 @@ void perf_event_wakeup(struct perf_event *event)
> >  	}
> >  }
> >  
> > +static void perf_sigtrap(struct perf_event *event)
> > +{
> > +	struct kernel_siginfo info;
> > +
> 
> I think we need to add something like this here:
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4b82788fbaab..4fcd6b45ce66 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
>  {
>  	struct kernel_siginfo info;
>  
> +	/*
> +	 * This irq_work can race with an exiting task; bail out if sighand has
> +	 * already been released in release_task().
> +	 */
> +	if (!current->sighand)
> +		return;
> +
>  	clear_siginfo(&info);
>  	info.si_signo = SIGTRAP;
>  	info.si_code = TRAP_PERF;
> 
> 

Urgh.. I'm not entirely sure that check is correct, but I always forget
the rules with signal. It could be we ought to be testing PF_EXISTING
instead.

But also, I think Jiri Olsa was going to poke around here because all of
this is broken on PREEMPT_RT. IIRC the plan was to add yet another stage
to the construct. So where today we have:


	<NMI>
		irq_work_queue()
	</NMI>
	...
	<IRQ>
		perf_pending_event()
	</IRQ>

(and we might already have a problem on some architectures where there
can be significant time between these due to not having
arch_irq_work_raise(), so ideally we ought to double check current in
your case)

The idea was, I think to add a task_work(), such that we get:

	<NMI>
		irq_work_queue()
	</NMI>
	...
	<IRQ>
		perf_pending_event()
		  task_work_add()
	</IRQ>

	<ret-to-user>
		run_task_work()
		  ...
		    kill_fasync();
Oleg Nesterov March 29, 2021, 2:27 p.m. UTC | #3
On 03/29, Peter Zijlstra wrote:
>
> On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> >  {
> >  	struct kernel_siginfo info;
> >
> > +	/*
> > +	 * This irq_work can race with an exiting task; bail out if sighand has
> > +	 * already been released in release_task().
> > +	 */
> > +	if (!current->sighand)
> > +		return;

This is racy. If "current" has already passed exit_notify(), current->parent
can do release_task() and destroy current->sighand right after the check.

> Urgh.. I'm not entirely sure that check is correct, but I always forget
> the rules with signal. It could be we ought to be testing PF_EXISTING
> instead.

Agreed, PF_EXISTING check makes more sense in any case, the exiting task
can't receive the signal anyway.

Oleg.
Marco Elver March 29, 2021, 2:32 p.m. UTC | #4
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote:
> On 03/29, Peter Zijlstra wrote:
> >
> > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> > >  {
> > >     struct kernel_siginfo info;
> > >
> > > +   /*
> > > +    * This irq_work can race with an exiting task; bail out if sighand has
> > > +    * already been released in release_task().
> > > +    */
> > > +   if (!current->sighand)
> > > +           return;
>
> This is racy. If "current" has already passed exit_notify(), current->parent
> can do release_task() and destroy current->sighand right after the check.
>
> > Urgh.. I'm not entirely sure that check is correct, but I always forget
> > the rules with signal. It could be we ought to be testing PF_EXISTING
> > instead.
>
> Agreed, PF_EXISTING check makes more sense in any case, the exiting task
> can't receive the signal anyway.

Thanks for confirming. I'll switch to just checking PF_EXITING
(PF_EXISTING does not exist :-)).

Thanks,
-- Marco
Marco Elver March 29, 2021, 6:22 p.m. UTC | #5
On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote:
> On 03/29, Peter Zijlstra wrote:
> >
> > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> > >  {
> > >     struct kernel_siginfo info;
> > >
> > > +   /*
> > > +    * This irq_work can race with an exiting task; bail out if sighand has
> > > +    * already been released in release_task().
> > > +    */
> > > +   if (!current->sighand)
> > > +           return;
>
> This is racy. If "current" has already passed exit_notify(), current->parent
> can do release_task() and destroy current->sighand right after the check.
>
> > Urgh.. I'm not entirely sure that check is correct, but I always forget
> > the rules with signal. It could be we ought to be testing PF_EXISTING
> > instead.
>
> Agreed, PF_EXISTING check makes more sense in any case, the exiting task
> can't receive the signal anyway.

So, per off-list discussion, it appears that I should ask to clarify:
PF_EXISTING or PF_EXITING?

It appears that PF_EXISTING is what's being suggested, whereas it has
not been mentioned anywhere, nor are its semantics clear. If it is not
simply the negation of PF_EXITING, what are its semantics? And why do
we need it in the case here (instead of something else that already
exists)?

Thanks,
-- Marco
Oleg Nesterov March 29, 2021, 6:33 p.m. UTC | #6
On 03/29, Marco Elver wrote:
>
> So, per off-list discussion, it appears that I should ask to clarify:
> PF_EXISTING or PF_EXITING?

Aaaaaaah, sorry Marco.

PF_EXITING, of course.

Oleg.
Peter Zijlstra March 30, 2021, 7:04 a.m. UTC | #7
On Mon, Mar 29, 2021 at 04:32:18PM +0200, Marco Elver wrote:
> On Mon, 29 Mar 2021 at 16:27, Oleg Nesterov <oleg@redhat.com> wrote:
> > On 03/29, Peter Zijlstra wrote:
> > >
> > > On Thu, Mar 25, 2021 at 09:14:39AM +0100, Marco Elver wrote:
> > > > @@ -6395,6 +6395,13 @@ static void perf_sigtrap(struct perf_event *event)
> > > >  {
> > > >     struct kernel_siginfo info;
> > > >
> > > > +   /*
> > > > +    * This irq_work can race with an exiting task; bail out if sighand has
> > > > +    * already been released in release_task().
> > > > +    */
> > > > +   if (!current->sighand)
> > > > +           return;
> >
> > This is racy. If "current" has already passed exit_notify(), current->parent
> > can do release_task() and destroy current->sighand right after the check.
> >
> > > Urgh.. I'm not entirely sure that check is correct, but I always forget
> > > the rules with signal. It could be we ought to be testing PF_EXISTING
> > > instead.
> >
> > Agreed, PF_EXISTING check makes more sense in any case, the exiting task
> > can't receive the signal anyway.
> 
> Thanks for confirming. I'll switch to just checking PF_EXITING
> (PF_EXISTING does not exist :-)).

Indeed! Typing be hard :-)
Marco Elver March 31, 2021, 12:32 p.m. UTC | #8
On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote:

> (and we might already have a problem on some architectures where there
> can be significant time between these due to not having
> arch_irq_work_raise(), so ideally we ought to double check current in
> your case)

I missed this bit -- just to verify: here we want to check that
event->ctx->task == current, in case the the irq_work runs when the
current task has already been replaced. Correct?

Thanks,
-- Marco
Peter Zijlstra March 31, 2021, 2:51 p.m. UTC | #9
On Wed, Mar 31, 2021 at 02:32:58PM +0200, Marco Elver wrote:
> On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > (and we might already have a problem on some architectures where there
> > can be significant time between these due to not having
> > arch_irq_work_raise(), so ideally we ought to double check current in
> > your case)
> 
> I missed this bit -- just to verify: here we want to check that
> event->ctx->task == current, in case the the irq_work runs when the
> current task has already been replaced. Correct?

Yeah, just not sure what a decent failure would be, silent ignore seems
undesired, maybe WARN and archs that can trigger it get to fix it ?
Marco Elver March 31, 2021, 4:50 p.m. UTC | #10
On Wed, 31 Mar 2021 at 16:51, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Mar 31, 2021 at 02:32:58PM +0200, Marco Elver wrote:
> > On Mon, 29 Mar 2021 at 14:07, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > (and we might already have a problem on some architectures where there
> > > can be significant time between these due to not having
> > > arch_irq_work_raise(), so ideally we ought to double check current in
> > > your case)
> >
> > I missed this bit -- just to verify: here we want to check that
> > event->ctx->task == current, in case the the irq_work runs when the
> > current task has already been replaced. Correct?
>
> Yeah, just not sure what a decent failure would be, silent ignore seems
> undesired, maybe WARN and archs that can trigger it get to fix it ?

I'll go with a WARN and add a comment.

This also revealed there should be a requirement that sigtrap events
must be associated with a task (syzkaller managed to trigger the
warning for cpu events).

Thanks,
-- Marco

Patch
diff mbox series

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5ad63f..3a4dbb1688f0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -391,7 +391,8 @@  struct perf_event_attr {
 				build_id       :  1, /* use build id in mmap2 events */
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
-				__reserved_1   : 27;
+				sigtrap        :  1, /* send synchronous SIGTRAP on event */
+				__reserved_1   : 26;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b6434697c516..1e4c949bf75f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6391,6 +6391,17 @@  void perf_event_wakeup(struct perf_event *event)
 	}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+	struct kernel_siginfo info;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGTRAP;
+	info.si_code = TRAP_PERF;
+	info.si_errno = event->attr.type;
+	force_sig_info(&info);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6400,6 +6411,13 @@  static void perf_pending_event_disable(struct perf_event *event)
 
 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
+
+		if (event->attr.sigtrap) {
+			atomic_set(&event->event_limit, 1); /* rearm event */
+			perf_sigtrap(event);
+			return;
+		}
+
 		perf_event_disable_local(event);
 		return;
 	}
@@ -11428,6 +11446,9 @@  perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (event->attr.sigtrap)
+		atomic_set(&event->event_limit, 1);
+
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
@@ -11706,6 +11727,9 @@  static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->remove_on_exec && attr->enable_on_exec)
 		return -EINVAL;
 
+	if (attr->sigtrap && !attr->remove_on_exec)
+		return -EINVAL;
+
 out:
 	return ret;
 
@@ -12932,7 +12956,9 @@  inherit_task_group(struct perf_event *event, struct task_struct *parent,
 	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit ||
-	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
+	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+	    /* Do not inherit if sigtrap and signal handlers were cleared. */
+	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
 		*inherited_all = 0;
 		return 0;
 	}