linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder
@ 2019-03-01  3:12 Jann Horn
  2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Jann Horn @ 2019-03-01  3:12 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, jannh
  Cc: Andrew Morton, Josh Poimboeuf, syzbot, H. Peter Anvin, x86,
	linux-kernel, Masahiro Yamada, Michal Marek, linux-kbuild

When the frame unwinder is invoked for an oops caused by a call to NULL,
it currently skips the parent function because BP still points to the
parent's stack frame; the (nonexistent) current function only has the first
half of a stack frame, and BP doesn't point to it yet.

Add a special case for IP==0 that calculates a fake BP from SP, then uses
the real BP for the next frame.

Note that this handles first_frame specially: We return information about
the parent function as long as the saved IP is >=first_frame, even if the
fake BP points below it.

With an artificially-added NULL call in prctl_set_seccomp(), before this
patch, the trace is:

Call Trace:
 ? prctl_set_seccomp+0x3a/0x50
 __x64_sys_prctl+0x457/0x6f0
 ? __ia32_sys_prctl+0x750/0x750
 do_syscall_64+0x72/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

After this patch, the trace is:

Call Trace:
 prctl_set_seccomp+0x3a/0x50
 __x64_sys_prctl+0x457/0x6f0
 ? __ia32_sys_prctl+0x750/0x750
 do_syscall_64+0x72/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Jann Horn <jannh@google.com>
---
 arch/x86/include/asm/unwind.h  |  6 ++++++
 arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 1f86e1b0a5cd..499578f7e6d7 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -23,6 +23,12 @@ struct unwind_state {
 #elif defined(CONFIG_UNWINDER_FRAME_POINTER)
 	bool got_irq;
 	unsigned long *bp, *orig_sp, ip;
+	/*
+	 * If non-NULL: The current frame is incomplete and doesn't contain a
+	 * valid BP. When looking for the next frame, use this instead of the
+	 * non-existent saved BP.
+	 */
+	unsigned long *next_bp;
 	struct pt_regs *regs;
 #else
 	unsigned long *sp;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 3dc26f95d46e..9b9fd4826e7a 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
 	}
 
 	/* Get the next frame pointer: */
-	if (state->regs)
+	if (state->next_bp) {
+		next_bp = state->next_bp;
+		state->next_bp = NULL;
+	} else if (state->regs) {
 		next_bp = (unsigned long *)state->regs->bp;
-	else
+	} else {
 		next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+	}
 
 	/* Move to the next frame if it's safe: */
 	if (!update_stack_state(state, next_bp))
@@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 
 	bp = get_frame_pointer(task, regs);
 
+	/*
+	 * If we crash with IP==0, the last successfully executed instruction
+	 * was probably an indirect function call with a NULL function pointer.
+	 * That means that SP points into the middle of an incomplete frame:
+	 * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+	 * would have written a frame pointer if we hadn't crashed.
+	 * Pretend that the frame is complete and that BP points to it, but save
+	 * the real BP so that we can use it when looking for the next frame.
+	 */
+	if (regs && regs->ip == 0 &&
+	    (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
+		state->next_bp = bp;
+		bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
+	}
+
 	/* Initialize stack info and make sure the frame data is accessible: */
 	get_stack_info(bp, state->task, &state->stack_info,
 		       &state->stack_mask);
@@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 	 */
 	while (!unwind_done(state) &&
 	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
-			state->bp < first_frame))
+			(state->next_bp == NULL && state->bp < first_frame)))
 		unwind_next_frame(state);
 }
 EXPORT_SYMBOL_GPL(__unwind_start);
-- 
2.21.0.352.gf09ad66450-goog


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL
  2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
@ 2019-03-01  3:12 ` Jann Horn
  2019-03-01 16:24   ` Josh Poimboeuf
  2019-03-06 22:31   ` [tip:x86/urgent] x86/unwind: Add " tip-bot for Jann Horn
  2019-03-01  4:22 ` [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Josh Poimboeuf
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 11+ messages in thread
From: Jann Horn @ 2019-03-01  3:12 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, jannh
  Cc: Andrew Morton, Josh Poimboeuf, syzbot, H. Peter Anvin, x86,
	linux-kernel, Masahiro Yamada, Michal Marek, linux-kbuild

When the ORC unwinder is invoked for an oops caused by IP==0,
it currently has no idea what to do because there is no debug information
for the stack frame of NULL.

But if RIP is NULL, it is very likely that the last successfully executed
instruction was an indirect CALL/JMP, and it is possible to unwind out in
the same way as for the first instruction of a normal function. Hardcode
a corresponding ORC entry.


With an artificially-added NULL call in prctl_set_seccomp(), before this
patch, the trace is:

Call Trace:
 ? __x64_sys_prctl+0x402/0x680
 ? __ia32_sys_prctl+0x6e0/0x6e0
 ? __do_page_fault+0x457/0x620
 ? do_syscall_64+0x6d/0x160
 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

After this patch, the trace looks like this:

Call Trace:
 __x64_sys_prctl+0x402/0x680
 ? __ia32_sys_prctl+0x6e0/0x6e0
 ? __do_page_fault+0x457/0x620
 do_syscall_64+0x6d/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

prctl_set_seccomp() still doesn't show up in the trace because for some
reason, tail call optimization is only disabled in builds that use the
frame pointer unwinder.

Signed-off-by: Jann Horn <jannh@google.com>
---
Is there a reason why the top-level Makefile only sets
-fno-optimize-sibling-calls if CONFIG_FRAME_POINTER is set?
I suspect that this is just a historical thing, because reliable
unwinding didn't work without frame pointers until ORC came along.
I'm not quite sure how best to express "don't do tail optimization if
either frame pointers are used or ORC is used" in a Makefile, and
whether we want an indirection through Kconfig for that, so I'm not
doing anything about it in this series.
Can someone send a patch to deal with it properly?


 arch/x86/kernel/unwind_orc.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 26038eacf74a..89be1be1790c 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
 }
 #endif
 
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+	.sp_offset = sizeof(long),
+	.sp_reg = ORC_REG_SP,
+	.bp_reg = ORC_REG_UNDEFINED,
+	.type = ORC_TYPE_CALL
+};
+
 static struct orc_entry *orc_find(unsigned long ip)
 {
 	static struct orc_entry *orc;
@@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip)
 	if (!orc_init)
 		return NULL;
 
+	if (ip == 0)
+		return &null_orc_entry;
+
 	/* For non-init vmlinux addresses, use the fast lookup table: */
 	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
 		unsigned int idx, start, stop;
-- 
2.21.0.352.gf09ad66450-goog


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder
  2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
  2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
@ 2019-03-01  4:22 ` Josh Poimboeuf
  2019-03-01 15:32 ` Sean Christopherson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Josh Poimboeuf @ 2019-03-01  4:22 UTC (permalink / raw)
  To: Jann Horn
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	syzbot, H. Peter Anvin, x86, linux-kernel, Masahiro Yamada,
	Michal Marek, linux-kbuild

On Fri, Mar 01, 2019 at 04:12:00AM +0100, Jann Horn wrote:
> When the frame unwinder is invoked for an oops caused by a call to NULL,
> it currently skips the parent function because BP still points to the
> parent's stack frame; the (nonexistent) current function only has the first
> half of a stack frame, and BP doesn't point to it yet.
> 
> Add a special case for IP==0 that calculates a fake BP from SP, then uses
> the real BP for the next frame.
> 
> Note that this handles first_frame specially: We return information about
> the parent function as long as the saved IP is >=first_frame, even if the
> fake BP points below it.
> 
> With an artificially-added NULL call in prctl_set_seccomp(), before this
> patch, the trace is:
> 
> Call Trace:
>  ? prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> After this patch, the trace is:
> 
> Call Trace:
>  prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Jann Horn <jannh@google.com>

The general approach looks good.  I'll try to give it a proper review
tomorrow or so.

-- 
Josh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder
  2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
  2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
  2019-03-01  4:22 ` [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Josh Poimboeuf
@ 2019-03-01 15:32 ` Sean Christopherson
  2019-03-01 15:59   ` Jann Horn
  2019-03-01 16:19 ` Josh Poimboeuf
  2019-03-06 22:31 ` [tip:x86/urgent] x86/unwind: Handle " tip-bot for Jann Horn
  4 siblings, 1 reply; 11+ messages in thread
From: Sean Christopherson @ 2019-03-01 15:32 UTC (permalink / raw)
  To: Jann Horn
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	Josh Poimboeuf, syzbot, H. Peter Anvin, x86, linux-kernel,
	Masahiro Yamada, Michal Marek, linux-kbuild

On Fri, Mar 01, 2019 at 04:12:00AM +0100, Jann Horn wrote:
> When the frame unwinder is invoked for an oops caused by a call to NULL,
> it currently skips the parent function because BP still points to the
> parent's stack frame; the (nonexistent) current function only has the first
> half of a stack frame, and BP doesn't point to it yet.
> 
> Add a special case for IP==0 that calculates a fake BP from SP, then uses
> the real BP for the next frame.
> 
> Note that this handles first_frame specially: We return information about
> the parent function as long as the saved IP is >=first_frame, even if the
> fake BP points below it.
> 
> With an artificially-added NULL call in prctl_set_seccomp(), before this
> patch, the trace is:
> 
> Call Trace:
>  ? prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> After this patch, the trace is:
> 
> Call Trace:
>  prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
>  arch/x86/include/asm/unwind.h  |  6 ++++++
>  arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++---
>  2 files changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
> index 1f86e1b0a5cd..499578f7e6d7 100644
> --- a/arch/x86/include/asm/unwind.h
> +++ b/arch/x86/include/asm/unwind.h
> @@ -23,6 +23,12 @@ struct unwind_state {
>  #elif defined(CONFIG_UNWINDER_FRAME_POINTER)
>  	bool got_irq;
>  	unsigned long *bp, *orig_sp, ip;
> +	/*
> +	 * If non-NULL: The current frame is incomplete and doesn't contain a
> +	 * valid BP. When looking for the next frame, use this instead of the
> +	 * non-existent saved BP.
> +	 */
> +	unsigned long *next_bp;
>  	struct pt_regs *regs;
>  #else
>  	unsigned long *sp;
> diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
> index 3dc26f95d46e..9b9fd4826e7a 100644
> --- a/arch/x86/kernel/unwind_frame.c
> +++ b/arch/x86/kernel/unwind_frame.c
> @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
>  	}
>  
>  	/* Get the next frame pointer: */
> -	if (state->regs)
> +	if (state->next_bp) {
> +		next_bp = state->next_bp;
> +		state->next_bp = NULL;
> +	} else if (state->regs) {
>  		next_bp = (unsigned long *)state->regs->bp;
> -	else
> +	} else {
>  		next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
> +	}
>  
>  	/* Move to the next frame if it's safe: */
>  	if (!update_stack_state(state, next_bp))
> @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
>  
>  	bp = get_frame_pointer(task, regs);
>  
> +	/*
> +	 * If we crash with IP==0, the last successfully executed instruction
> +	 * was probably an indirect function call with a NULL function pointer.
> +	 * That means that SP points into the middle of an incomplete frame:
> +	 * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
> +	 * would have written a frame pointer if we hadn't crashed.
> +	 * Pretend that the frame is complete and that BP points to it, but save
> +	 * the real BP so that we can use it when looking for the next frame.
> +	 */
> +	if (regs && regs->ip == 0 &&

Would it make sense to do 'regs->ip < PAGE_SIZE', a la show_fault_oops()?
E.g. to handle bugs where a function pointer gets loaded with NULL+offset.

> +	    (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
> +		state->next_bp = bp;
> +		bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
> +	}
> +
>  	/* Initialize stack info and make sure the frame data is accessible: */
>  	get_stack_info(bp, state->task, &state->stack_info,
>  		       &state->stack_mask);
> @@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
>  	 */
>  	while (!unwind_done(state) &&
>  	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
> -			state->bp < first_frame))
> +			(state->next_bp == NULL && state->bp < first_frame)))
>  		unwind_next_frame(state);
>  }
>  EXPORT_SYMBOL_GPL(__unwind_start);
> -- 
> 2.21.0.352.gf09ad66450-goog
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder
  2019-03-01 15:32 ` Sean Christopherson
@ 2019-03-01 15:59   ` Jann Horn
  0 siblings, 0 replies; 11+ messages in thread
From: Jann Horn @ 2019-03-01 15:59 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	Josh Poimboeuf, syzbot, H. Peter Anvin, the arch/x86 maintainers,
	kernel list, Masahiro Yamada, Michal Marek, linux-kbuild

On Fri, Mar 1, 2019 at 4:33 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
> On Fri, Mar 01, 2019 at 04:12:00AM +0100, Jann Horn wrote:
> > When the frame unwinder is invoked for an oops caused by a call to NULL,
> > it currently skips the parent function because BP still points to the
> > parent's stack frame; the (nonexistent) current function only has the first
> > half of a stack frame, and BP doesn't point to it yet.
> >
> > Add a special case for IP==0 that calculates a fake BP from SP, then uses
> > the real BP for the next frame.
> >
> > Note that this handles first_frame specially: We return information about
> > the parent function as long as the saved IP is >=first_frame, even if the
> > fake BP points below it.
> >
> > With an artificially-added NULL call in prctl_set_seccomp(), before this
> > patch, the trace is:
> >
> > Call Trace:
> >  ? prctl_set_seccomp+0x3a/0x50
> >  __x64_sys_prctl+0x457/0x6f0
> >  ? __ia32_sys_prctl+0x750/0x750
> >  do_syscall_64+0x72/0x160
> >  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >
> > After this patch, the trace is:
> >
> > Call Trace:
> >  prctl_set_seccomp+0x3a/0x50
> >  __x64_sys_prctl+0x457/0x6f0
> >  ? __ia32_sys_prctl+0x750/0x750
> >  do_syscall_64+0x72/0x160
> >  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >
> > Signed-off-by: Jann Horn <jannh@google.com>
> > ---
> >  arch/x86/include/asm/unwind.h  |  6 ++++++
> >  arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++---
> >  2 files changed, 28 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
> > index 1f86e1b0a5cd..499578f7e6d7 100644
> > --- a/arch/x86/include/asm/unwind.h
> > +++ b/arch/x86/include/asm/unwind.h
> > @@ -23,6 +23,12 @@ struct unwind_state {
> >  #elif defined(CONFIG_UNWINDER_FRAME_POINTER)
> >       bool got_irq;
> >       unsigned long *bp, *orig_sp, ip;
> > +     /*
> > +      * If non-NULL: The current frame is incomplete and doesn't contain a
> > +      * valid BP. When looking for the next frame, use this instead of the
> > +      * non-existent saved BP.
> > +      */
> > +     unsigned long *next_bp;
> >       struct pt_regs *regs;
> >  #else
> >       unsigned long *sp;
> > diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
> > index 3dc26f95d46e..9b9fd4826e7a 100644
> > --- a/arch/x86/kernel/unwind_frame.c
> > +++ b/arch/x86/kernel/unwind_frame.c
> > @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
> >       }
> >
> >       /* Get the next frame pointer: */
> > -     if (state->regs)
> > +     if (state->next_bp) {
> > +             next_bp = state->next_bp;
> > +             state->next_bp = NULL;
> > +     } else if (state->regs) {
> >               next_bp = (unsigned long *)state->regs->bp;
> > -     else
> > +     } else {
> >               next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
> > +     }
> >
> >       /* Move to the next frame if it's safe: */
> >       if (!update_stack_state(state, next_bp))
> > @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
> >
> >       bp = get_frame_pointer(task, regs);
> >
> > +     /*
> > +      * If we crash with IP==0, the last successfully executed instruction
> > +      * was probably an indirect function call with a NULL function pointer.
> > +      * That means that SP points into the middle of an incomplete frame:
> > +      * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
> > +      * would have written a frame pointer if we hadn't crashed.
> > +      * Pretend that the frame is complete and that BP points to it, but save
> > +      * the real BP so that we can use it when looking for the next frame.
> > +      */
> > +     if (regs && regs->ip == 0 &&
>
> Would it make sense to do 'regs->ip < PAGE_SIZE', a la show_fault_oops()?
> E.g. to handle bugs where a function pointer gets loaded with NULL+offset.

I don't think near-NULL function pointers make sense or are likely to
occur in practice. Near-NULL pointer dereferences happen when you add
a struct member offset to a NULL pointer, or something like that; but
functions are never inline in structs/arrays, so there isn't really a
reason to compute a function pointer by adding an offset to a (NULL)
pointer.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder
  2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
                   ` (2 preceding siblings ...)
  2019-03-01 15:32 ` Sean Christopherson
@ 2019-03-01 16:19 ` Josh Poimboeuf
  2019-03-06 22:31 ` [tip:x86/urgent] x86/unwind: Handle " tip-bot for Jann Horn
  4 siblings, 0 replies; 11+ messages in thread
From: Josh Poimboeuf @ 2019-03-01 16:19 UTC (permalink / raw)
  To: Jann Horn
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	syzbot, H. Peter Anvin, x86, linux-kernel, Masahiro Yamada,
	Michal Marek, linux-kbuild

On Fri, Mar 01, 2019 at 04:12:00AM +0100, Jann Horn wrote:
> When the frame unwinder is invoked for an oops caused by a call to NULL,
> it currently skips the parent function because BP still points to the
> parent's stack frame; the (nonexistent) current function only has the first
> half of a stack frame, and BP doesn't point to it yet.
> 
> Add a special case for IP==0 that calculates a fake BP from SP, then uses
> the real BP for the next frame.
> 
> Note that this handles first_frame specially: We return information about
> the parent function as long as the saved IP is >=first_frame, even if the
> fake BP points below it.
> 
> With an artificially-added NULL call in prctl_set_seccomp(), before this
> patch, the trace is:
> 
> Call Trace:
>  ? prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> After this patch, the trace is:
> 
> Call Trace:
>  prctl_set_seccomp+0x3a/0x50
>  __x64_sys_prctl+0x457/0x6f0
>  ? __ia32_sys_prctl+0x750/0x750
>  do_syscall_64+0x72/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Jann Horn <jannh@google.com>

Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>

-- 
Josh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL
  2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
@ 2019-03-01 16:24   ` Josh Poimboeuf
  2019-03-01 16:29     ` Josh Poimboeuf
  2019-03-06 22:31   ` [tip:x86/urgent] x86/unwind: Add " tip-bot for Jann Horn
  1 sibling, 1 reply; 11+ messages in thread
From: Josh Poimboeuf @ 2019-03-01 16:24 UTC (permalink / raw)
  To: Jann Horn
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	syzbot, H. Peter Anvin, x86, linux-kernel, Masahiro Yamada,
	Michal Marek, linux-kbuild

On Fri, Mar 01, 2019 at 04:12:01AM +0100, Jann Horn wrote:
> When the ORC unwinder is invoked for an oops caused by IP==0,
> it currently has no idea what to do because there is no debug information
> for the stack frame of NULL.
> 
> But if RIP is NULL, it is very likely that the last successfully executed
> instruction was an indirect CALL/JMP, and it is possible to unwind out in
> the same way as for the first instruction of a normal function. Hardcode
> a corresponding ORC entry.
> 
> 
> With an artificially-added NULL call in prctl_set_seccomp(), before this
> patch, the trace is:
> 
> Call Trace:
>  ? __x64_sys_prctl+0x402/0x680
>  ? __ia32_sys_prctl+0x6e0/0x6e0
>  ? __do_page_fault+0x457/0x620
>  ? do_syscall_64+0x6d/0x160
>  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> After this patch, the trace looks like this:
> 
> Call Trace:
>  __x64_sys_prctl+0x402/0x680
>  ? __ia32_sys_prctl+0x6e0/0x6e0
>  ? __do_page_fault+0x457/0x620
>  do_syscall_64+0x6d/0x160
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> prctl_set_seccomp() still doesn't show up in the trace because for some
> reason, tail call optimization is only disabled in builds that use the
> frame pointer unwinder.
> 
> Signed-off-by: Jann Horn <jannh@google.com>

Thanks for the patches!

Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>

> Is there a reason why the top-level Makefile only sets
> -fno-optimize-sibling-calls if CONFIG_FRAME_POINTER is set?
> I suspect that this is just a historical thing, because reliable
> unwinding didn't work without frame pointers until ORC came along.
> I'm not quite sure how best to express "don't do tail optimization if
> either frame pointers are used or ORC is used" in a Makefile, and
> whether we want an indirection through Kconfig for that, so I'm not
> doing anything about it in this series.
> Can someone send a patch to deal with it properly?

Why would sibling calls be a problem for ORC?  Once a function does a
sibling call, it has effectively returned and shouldn't show up on the
stack trace anyway.

-- 
Josh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL
  2019-03-01 16:24   ` Josh Poimboeuf
@ 2019-03-01 16:29     ` Josh Poimboeuf
  2019-03-01 16:55       ` Jann Horn
  0 siblings, 1 reply; 11+ messages in thread
From: Josh Poimboeuf @ 2019-03-01 16:29 UTC (permalink / raw)
  To: Jann Horn
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	syzbot, H. Peter Anvin, x86, linux-kernel, Masahiro Yamada,
	Michal Marek, linux-kbuild

On Fri, Mar 01, 2019 at 10:24:18AM -0600, Josh Poimboeuf wrote:
> > Is there a reason why the top-level Makefile only sets
> > -fno-optimize-sibling-calls if CONFIG_FRAME_POINTER is set?
> > I suspect that this is just a historical thing, because reliable
> > unwinding didn't work without frame pointers until ORC came along.
> > I'm not quite sure how best to express "don't do tail optimization if
> > either frame pointers are used or ORC is used" in a Makefile, and
> > whether we want an indirection through Kconfig for that, so I'm not
> > doing anything about it in this series.
> > Can someone send a patch to deal with it properly?
> 
> Why would sibling calls be a problem for ORC?  Once a function does a
> sibling call, it has effectively returned and shouldn't show up on the
> stack trace anyway.

Answering my own question, I guess some people might find it confusing
to have a caller skipped in the stack trace.  But nobody has ever
complained about it.

It's not a problem for livepatch since we only care about the return
path.

-- 
Josh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL
  2019-03-01 16:29     ` Josh Poimboeuf
@ 2019-03-01 16:55       ` Jann Horn
  0 siblings, 0 replies; 11+ messages in thread
From: Jann Horn @ 2019-03-01 16:55 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Andrew Morton,
	syzbot, H. Peter Anvin, the arch/x86 maintainers, kernel list,
	Masahiro Yamada, Michal Marek, linux-kbuild

On Fri, Mar 1, 2019 at 5:29 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Fri, Mar 01, 2019 at 10:24:18AM -0600, Josh Poimboeuf wrote:
> > > Is there a reason why the top-level Makefile only sets
> > > -fno-optimize-sibling-calls if CONFIG_FRAME_POINTER is set?
> > > I suspect that this is just a historical thing, because reliable
> > > unwinding didn't work without frame pointers until ORC came along.
> > > I'm not quite sure how best to express "don't do tail optimization if
> > > either frame pointers are used or ORC is used" in a Makefile, and
> > > whether we want an indirection through Kconfig for that, so I'm not
> > > doing anything about it in this series.
> > > Can someone send a patch to deal with it properly?
> >
> > Why would sibling calls be a problem for ORC?  Once a function does a
> > sibling call, it has effectively returned and shouldn't show up on the
> > stack trace anyway.
>
> Answering my own question, I guess some people might find it confusing
> to have a caller skipped in the stack trace.  But nobody has ever
> complained about it.
>
> It's not a problem for livepatch since we only care about the return
> path.

Yeah, that's my concern. I understand that it's irrelevant for tooling
that wants to understand what context a function is running in, but it
might matter to a human trying to understand how a function was
reached. In a theoretical worst case, a stack trace might skip
directly from do_syscall_64() into some random helper function that
received a bad pointer, and that might make debugging harder.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [tip:x86/urgent] x86/unwind: Handle NULL pointer calls better in frame unwinder
  2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
                   ` (3 preceding siblings ...)
  2019-03-01 16:19 ` Josh Poimboeuf
@ 2019-03-06 22:31 ` tip-bot for Jann Horn
  4 siblings, 0 replies; 11+ messages in thread
From: tip-bot for Jann Horn @ 2019-03-06 22:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: bp, yamada.masahiro, linux-kernel, jpoimboe, tglx, jannh, akpm,
	hpa, mingo, syzbot+ca95b2b7aef9e7cbd6ab, michal.lkml

Commit-ID:  f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5
Gitweb:     https://git.kernel.org/tip/f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5
Author:     Jann Horn <jannh@google.com>
AuthorDate: Fri, 1 Mar 2019 04:12:00 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Wed, 6 Mar 2019 23:03:26 +0100

x86/unwind: Handle NULL pointer calls better in frame unwinder

When the frame unwinder is invoked for an oops caused by a call to NULL, it
currently skips the parent function because BP still points to the parent's
stack frame; the (nonexistent) current function only has the first half of
a stack frame, and BP doesn't point to it yet.

Add a special case for IP==0 that calculates a fake BP from SP, then uses
the real BP for the next frame.

Note that this handles first_frame specially: Return information about the
parent function as long as the saved IP is >=first_frame, even if the fake
BP points below it.

With an artificially-added NULL call in prctl_set_seccomp(), before this
patch, the trace is:

Call Trace:
 ? prctl_set_seccomp+0x3a/0x50
 __x64_sys_prctl+0x457/0x6f0
 ? __ia32_sys_prctl+0x750/0x750
 do_syscall_64+0x72/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

After this patch, the trace is:

Call Trace:
 prctl_set_seccomp+0x3a/0x50
 __x64_sys_prctl+0x457/0x6f0
 ? __ia32_sys_prctl+0x750/0x750
 do_syscall_64+0x72/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: syzbot <syzbot+ca95b2b7aef9e7cbd6ab@syzkaller.appspotmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michal Marek <michal.lkml@markovi.net>
Cc: linux-kbuild@vger.kernel.org
Link: https://lkml.kernel.org/r/20190301031201.7416-1-jannh@google.com

---
 arch/x86/include/asm/unwind.h  |  6 ++++++
 arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 1f86e1b0a5cd..499578f7e6d7 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -23,6 +23,12 @@ struct unwind_state {
 #elif defined(CONFIG_UNWINDER_FRAME_POINTER)
 	bool got_irq;
 	unsigned long *bp, *orig_sp, ip;
+	/*
+	 * If non-NULL: The current frame is incomplete and doesn't contain a
+	 * valid BP. When looking for the next frame, use this instead of the
+	 * non-existent saved BP.
+	 */
+	unsigned long *next_bp;
 	struct pt_regs *regs;
 #else
 	unsigned long *sp;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 3dc26f95d46e..9b9fd4826e7a 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
 	}
 
 	/* Get the next frame pointer: */
-	if (state->regs)
+	if (state->next_bp) {
+		next_bp = state->next_bp;
+		state->next_bp = NULL;
+	} else if (state->regs) {
 		next_bp = (unsigned long *)state->regs->bp;
-	else
+	} else {
 		next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+	}
 
 	/* Move to the next frame if it's safe: */
 	if (!update_stack_state(state, next_bp))
@@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 
 	bp = get_frame_pointer(task, regs);
 
+	/*
+	 * If we crash with IP==0, the last successfully executed instruction
+	 * was probably an indirect function call with a NULL function pointer.
+	 * That means that SP points into the middle of an incomplete frame:
+	 * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+	 * would have written a frame pointer if we hadn't crashed.
+	 * Pretend that the frame is complete and that BP points to it, but save
+	 * the real BP so that we can use it when looking for the next frame.
+	 */
+	if (regs && regs->ip == 0 &&
+	    (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
+		state->next_bp = bp;
+		bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
+	}
+
 	/* Initialize stack info and make sure the frame data is accessible: */
 	get_stack_info(bp, state->task, &state->stack_info,
 		       &state->stack_mask);
@@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 	 */
 	while (!unwind_done(state) &&
 	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
-			state->bp < first_frame))
+			(state->next_bp == NULL && state->bp < first_frame)))
 		unwind_next_frame(state);
 }
 EXPORT_SYMBOL_GPL(__unwind_start);

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [tip:x86/urgent] x86/unwind: Add hardcoded ORC entry for NULL
  2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
  2019-03-01 16:24   ` Josh Poimboeuf
@ 2019-03-06 22:31   ` tip-bot for Jann Horn
  1 sibling, 0 replies; 11+ messages in thread
From: tip-bot for Jann Horn @ 2019-03-06 22:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: bp, mingo, jpoimboe, michal.lkml, hpa, jannh,
	syzbot+ca95b2b7aef9e7cbd6ab, akpm, linux-kernel, tglx,
	yamada.masahiro

Commit-ID:  ac5ceccce5501e43d217c596e4ee859f2a3fef79
Gitweb:     https://git.kernel.org/tip/ac5ceccce5501e43d217c596e4ee859f2a3fef79
Author:     Jann Horn <jannh@google.com>
AuthorDate: Fri, 1 Mar 2019 04:12:01 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Wed, 6 Mar 2019 23:03:26 +0100

x86/unwind: Add hardcoded ORC entry for NULL

When the ORC unwinder is invoked for an oops caused by IP==0,
it currently has no idea what to do because there is no debug information
for the stack frame of NULL.

But if RIP is NULL, it is very likely that the last successfully executed
instruction was an indirect CALL/JMP, and it is possible to unwind out in
the same way as for the first instruction of a normal function. Hardcode
a corresponding ORC entry.

With an artificially-added NULL call in prctl_set_seccomp(), before this
patch, the trace is:

Call Trace:
 ? __x64_sys_prctl+0x402/0x680
 ? __ia32_sys_prctl+0x6e0/0x6e0
 ? __do_page_fault+0x457/0x620
 ? do_syscall_64+0x6d/0x160
 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

After this patch, the trace looks like this:

Call Trace:
 __x64_sys_prctl+0x402/0x680
 ? __ia32_sys_prctl+0x6e0/0x6e0
 ? __do_page_fault+0x457/0x620
 do_syscall_64+0x6d/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

prctl_set_seccomp() still doesn't show up in the trace because for some
reason, tail call optimization is only disabled in builds that use the
frame pointer unwinder.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: syzbot <syzbot+ca95b2b7aef9e7cbd6ab@syzkaller.appspotmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michal Marek <michal.lkml@markovi.net>
Cc: linux-kbuild@vger.kernel.org
Link: https://lkml.kernel.org/r/20190301031201.7416-2-jannh@google.com

---
 arch/x86/kernel/unwind_orc.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 26038eacf74a..89be1be1790c 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
 }
 #endif
 
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+	.sp_offset = sizeof(long),
+	.sp_reg = ORC_REG_SP,
+	.bp_reg = ORC_REG_UNDEFINED,
+	.type = ORC_TYPE_CALL
+};
+
 static struct orc_entry *orc_find(unsigned long ip)
 {
 	static struct orc_entry *orc;
@@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip)
 	if (!orc_init)
 		return NULL;
 
+	if (ip == 0)
+		return &null_orc_entry;
+
 	/* For non-init vmlinux addresses, use the fast lookup table: */
 	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
 		unsigned int idx, start, stop;

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-03-06 22:32 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-01  3:12 [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Jann Horn
2019-03-01  3:12 ` [PATCH 2/2] x86/unwind: add hardcoded ORC entry for NULL Jann Horn
2019-03-01 16:24   ` Josh Poimboeuf
2019-03-01 16:29     ` Josh Poimboeuf
2019-03-01 16:55       ` Jann Horn
2019-03-06 22:31   ` [tip:x86/urgent] x86/unwind: Add " tip-bot for Jann Horn
2019-03-01  4:22 ` [PATCH 1/2] x86/unwind: handle NULL pointer calls better in frame unwinder Josh Poimboeuf
2019-03-01 15:32 ` Sean Christopherson
2019-03-01 15:59   ` Jann Horn
2019-03-01 16:19 ` Josh Poimboeuf
2019-03-06 22:31 ` [tip:x86/urgent] x86/unwind: Handle " tip-bot for Jann Horn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).