[v1] powerpc: Include running function as first entry in save_stack_trace() and friends
diff mbox series

Message ID e2e8728c4c4553bbac75a64b148e402183699c0c.1614780567.git.christophe.leroy@csgroup.eu
State New, archived
Headers show
Series
  • [v1] powerpc: Include running function as first entry in save_stack_trace() and friends
Related show

Commit Message

Christophe Leroy March 3, 2021, 2:09 p.m. UTC
It seems like all other sane architectures, namely x86 and arm64
at least, include the running function as top entry when saving
stack trace.

Functionnalities like KFENCE expect it.

Do the same on powerpc, it allows KFENCE to properly identify the faulting
function as depicted below. Before the patch KFENCE was identifying
finish_task_switch.isra as the faulting function.

[   14.937370] ==================================================================
[   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
[   14.948692]
[   14.956814] Invalid read at 0xdf98800a:
[   14.960664]  test_invalid_access+0x54/0x108
[   14.964876]  finish_task_switch.isra.0+0x54/0x23c
[   14.969606]  kunit_try_run_case+0x5c/0xd0
[   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
[   14.979079]  kthread+0x15c/0x174
[   14.982342]  ret_from_kernel_thread+0x14/0x1c
[   14.986731]
[   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
[   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
[   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
[   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
[   15.022043] DAR: df98800a DSISR: 20000000
[   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
[   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
[   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
[   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
[   15.051181] Call Trace:
[   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
[   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
[   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
[   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
[   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
[   15.085798] Instruction dump:
[   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
[   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
[   15.104612] ==================================================================

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/stacktrace.c | 42 +++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 14 deletions(-)

Comments

Marco Elver March 3, 2021, 2:38 p.m. UTC | #1
On Wed, 3 Mar 2021 at 15:09, Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
> It seems like all other sane architectures, namely x86 and arm64
> at least, include the running function as top entry when saving
> stack trace.
>
> Functionnalities like KFENCE expect it.
>
> Do the same on powerpc, it allows KFENCE to properly identify the faulting
> function as depicted below. Before the patch KFENCE was identifying
> finish_task_switch.isra as the faulting function.
>
> [   14.937370] ==================================================================
> [   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
> [   14.948692]
> [   14.956814] Invalid read at 0xdf98800a:
> [   14.960664]  test_invalid_access+0x54/0x108
> [   14.964876]  finish_task_switch.isra.0+0x54/0x23c
> [   14.969606]  kunit_try_run_case+0x5c/0xd0
> [   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
> [   14.979079]  kthread+0x15c/0x174
> [   14.982342]  ret_from_kernel_thread+0x14/0x1c
> [   14.986731]
> [   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
> [   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
> [   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
> [   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
> [   15.022043] DAR: df98800a DSISR: 20000000
> [   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
> [   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
> [   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
> [   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
> [   15.051181] Call Trace:
> [   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
> [   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
> [   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
> [   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
> [   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
> [   15.085798] Instruction dump:
> [   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
> [   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
> [   15.104612] ==================================================================
>
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>

Acked-by: Marco Elver <elver@google.com>

Thank you, I think this looks like the right solution. Just a question below:

> ---
>  arch/powerpc/kernel/stacktrace.c | 42 +++++++++++++++++++++-----------
>  1 file changed, 28 insertions(+), 14 deletions(-)
>
> diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
> index b6440657ef92..67c2b8488035 100644
> --- a/arch/powerpc/kernel/stacktrace.c
> +++ b/arch/powerpc/kernel/stacktrace.c
> @@ -22,16 +22,32 @@
>  #include <asm/kprobes.h>
>
>  #include <asm/paca.h>
> +#include <asm/switch_to.h>
>
>  /*
>   * Save stack-backtrace addresses into a stack_trace buffer.
>   */
> +static void save_entry(struct stack_trace *trace, unsigned long ip, int savesched)
> +{
> +       if (savesched || !in_sched_functions(ip)) {
> +               if (!trace->skip)
> +                       trace->entries[trace->nr_entries++] = ip;
> +               else
> +                       trace->skip--;
> +       }
> +}
> +
>  static void save_context_stack(struct stack_trace *trace, unsigned long sp,
> -                       struct task_struct *tsk, int savesched)
> +                              unsigned long ip, struct task_struct *tsk, int savesched)
>  {
> +       save_entry(trace, ip, savesched);
> +
> +       if (trace->nr_entries >= trace->max_entries)
> +               return;
> +
>         for (;;) {
>                 unsigned long *stack = (unsigned long *) sp;
> -               unsigned long newsp, ip;
> +               unsigned long newsp;
>
>                 if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
>                         return;
> @@ -39,12 +55,7 @@ static void save_context_stack(struct stack_trace *trace, unsigned long sp,
>                 newsp = stack[0];
>                 ip = stack[STACK_FRAME_LR_SAVE];
>
> -               if (savesched || !in_sched_functions(ip)) {
> -                       if (!trace->skip)
> -                               trace->entries[trace->nr_entries++] = ip;
> -                       else
> -                               trace->skip--;
> -               }
> +               save_entry(trace, ip, savesched);
>
>                 if (trace->nr_entries >= trace->max_entries)
>                         return;
> @@ -59,23 +70,26 @@ void save_stack_trace(struct stack_trace *trace)
>
>         sp = current_stack_frame();
>
> -       save_context_stack(trace, sp, current, 1);
> +       save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);

This causes ip == save_stack_trace and also below for
save_stack_trace_tsk. Does this mean save_stack_trace() is included in
the trace? Looking at kernel/stacktrace.c, I think the library wants
to exclude itself from the trace, as it does '.skip = skipnr + 1' (and
'.skip   = skipnr + (current == tsk)' for the _tsk variant).

If the arch-helper here is included, should this use _RET_IP_ instead?

Thanks,
-- Marco

>  }
>  EXPORT_SYMBOL_GPL(save_stack_trace);
>
>  void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
>  {
> -       unsigned long sp;
> +       unsigned long sp, ip;
>
>         if (!try_get_task_stack(tsk))
>                 return;
>
> -       if (tsk == current)
> +       if (tsk == current) {
> +               ip = (unsigned long)save_stack_trace_tsk;
>                 sp = current_stack_frame();
> -       else
> +       } else {
> +               ip = (unsigned long)_switch;
>                 sp = tsk->thread.ksp;
> +       }
>
> -       save_context_stack(trace, sp, tsk, 0);
> +       save_context_stack(trace, sp, ip, tsk, 0);
>
>         put_task_stack(tsk);
>  }
> @@ -84,7 +98,7 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
>  void
>  save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
>  {
> -       save_context_stack(trace, regs->gpr[1], current, 0);
> +       save_context_stack(trace, regs->gpr[1], regs->nip, current, 0);
>  }
>  EXPORT_SYMBOL_GPL(save_stack_trace_regs);
>
> --
> 2.25.0
>
Christophe Leroy March 3, 2021, 2:52 p.m. UTC | #2
Le 03/03/2021 à 15:38, Marco Elver a écrit :
> On Wed, 3 Mar 2021 at 15:09, Christophe Leroy
> <christophe.leroy@csgroup.eu> wrote:
>>
>> It seems like all other sane architectures, namely x86 and arm64
>> at least, include the running function as top entry when saving
>> stack trace.
>>
>> Functionnalities like KFENCE expect it.
>>
>> Do the same on powerpc, it allows KFENCE to properly identify the faulting
>> function as depicted below. Before the patch KFENCE was identifying
>> finish_task_switch.isra as the faulting function.
>>
>> [   14.937370] ==================================================================
>> [   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
>> [   14.948692]
>> [   14.956814] Invalid read at 0xdf98800a:
>> [   14.960664]  test_invalid_access+0x54/0x108
>> [   14.964876]  finish_task_switch.isra.0+0x54/0x23c
>> [   14.969606]  kunit_try_run_case+0x5c/0xd0
>> [   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
>> [   14.979079]  kthread+0x15c/0x174
>> [   14.982342]  ret_from_kernel_thread+0x14/0x1c
>> [   14.986731]
>> [   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
>> [   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
>> [   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
>> [   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
>> [   15.022043] DAR: df98800a DSISR: 20000000
>> [   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
>> [   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
>> [   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
>> [   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
>> [   15.051181] Call Trace:
>> [   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
>> [   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
>> [   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
>> [   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
>> [   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
>> [   15.085798] Instruction dump:
>> [   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
>> [   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
>> [   15.104612] ==================================================================
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> 
> Acked-by: Marco Elver <elver@google.com>
> 
> Thank you, I think this looks like the right solution. Just a question below:
> 
...

>> @@ -59,23 +70,26 @@ void save_stack_trace(struct stack_trace *trace)
>>
>>          sp = current_stack_frame();
>>
>> -       save_context_stack(trace, sp, current, 1);
>> +       save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);
> 
> This causes ip == save_stack_trace and also below for
> save_stack_trace_tsk. Does this mean save_stack_trace() is included in
> the trace? Looking at kernel/stacktrace.c, I think the library wants
> to exclude itself from the trace, as it does '.skip = skipnr + 1' (and
> '.skip   = skipnr + (current == tsk)' for the _tsk variant).
> 
> If the arch-helper here is included, should this use _RET_IP_ instead?
> 

Don't really know, I was inspired by arm64 which has:

void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
		     struct task_struct *task, struct pt_regs *regs)
{
	struct stackframe frame;

	if (regs)
		start_backtrace(&frame, regs->regs[29], regs->pc);
	else if (task == current)
		start_backtrace(&frame,
				(unsigned long)__builtin_frame_address(0),
				(unsigned long)arch_stack_walk);
	else
		start_backtrace(&frame, thread_saved_fp(task),
				thread_saved_pc(task));

	walk_stackframe(task, &frame, consume_entry, cookie);
}


But looking at x86 you may be right, so what should be done really ?

Thanks
Christophe
Marco Elver March 3, 2021, 3:20 p.m. UTC | #3
On Wed, Mar 03, 2021 at 03:52PM +0100, Christophe Leroy wrote:
> Le 03/03/2021 à 15:38, Marco Elver a écrit :
> > On Wed, 3 Mar 2021 at 15:09, Christophe Leroy
> > <christophe.leroy@csgroup.eu> wrote:
> > > 
> > > It seems like all other sane architectures, namely x86 and arm64
> > > at least, include the running function as top entry when saving
> > > stack trace.
> > > 
> > > Functionnalities like KFENCE expect it.
> > > 
> > > Do the same on powerpc, it allows KFENCE to properly identify the faulting
> > > function as depicted below. Before the patch KFENCE was identifying
> > > finish_task_switch.isra as the faulting function.
> > > 
> > > [   14.937370] ==================================================================
> > > [   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
> > > [   14.948692]
> > > [   14.956814] Invalid read at 0xdf98800a:
> > > [   14.960664]  test_invalid_access+0x54/0x108
> > > [   14.964876]  finish_task_switch.isra.0+0x54/0x23c
> > > [   14.969606]  kunit_try_run_case+0x5c/0xd0
> > > [   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
> > > [   14.979079]  kthread+0x15c/0x174
> > > [   14.982342]  ret_from_kernel_thread+0x14/0x1c
> > > [   14.986731]
> > > [   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
> > > [   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
> > > [   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
> > > [   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
> > > [   15.022043] DAR: df98800a DSISR: 20000000
> > > [   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
> > > [   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
> > > [   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
> > > [   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > [   15.051181] Call Trace:
> > > [   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
> > > [   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > [   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
> > > [   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
> > > [   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
> > > [   15.085798] Instruction dump:
> > > [   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
> > > [   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
> > > [   15.104612] ==================================================================
> > > 
> > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > 
> > Acked-by: Marco Elver <elver@google.com>
> > 
> > Thank you, I think this looks like the right solution. Just a question below:
> > 
> ...
> 
> > > @@ -59,23 +70,26 @@ void save_stack_trace(struct stack_trace *trace)
> > > 
> > >          sp = current_stack_frame();
> > > 
> > > -       save_context_stack(trace, sp, current, 1);
> > > +       save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);
> > 
> > This causes ip == save_stack_trace and also below for
> > save_stack_trace_tsk. Does this mean save_stack_trace() is included in
> > the trace? Looking at kernel/stacktrace.c, I think the library wants
> > to exclude itself from the trace, as it does '.skip = skipnr + 1' (and
> > '.skip   = skipnr + (current == tsk)' for the _tsk variant).
> > 
> > If the arch-helper here is included, should this use _RET_IP_ instead?
> > 
> 
> Don't really know, I was inspired by arm64 which has:
> 
> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> 		     struct task_struct *task, struct pt_regs *regs)
> {
> 	struct stackframe frame;
> 
> 	if (regs)
> 		start_backtrace(&frame, regs->regs[29], regs->pc);
> 	else if (task == current)
> 		start_backtrace(&frame,
> 				(unsigned long)__builtin_frame_address(0),
> 				(unsigned long)arch_stack_walk);
> 	else
> 		start_backtrace(&frame, thread_saved_fp(task),
> 				thread_saved_pc(task));
> 
> 	walk_stackframe(task, &frame, consume_entry, cookie);
> }
> 
> But looking at x86 you may be right, so what should be done really ?

x86:

[    2.843292] calling stack_trace_save:
[    2.843705]  test_func+0x6c/0x118
[    2.844184]  do_one_initcall+0x58/0x270
[    2.844618]  kernel_init_freeable+0x1da/0x23a
[    2.845110]  kernel_init+0xc/0x166
[    2.845494]  ret_from_fork+0x22/0x30

[    2.867525] calling stack_trace_save_tsk:
[    2.868017]  test_func+0xa9/0x118
[    2.868530]  do_one_initcall+0x58/0x270
[    2.869003]  kernel_init_freeable+0x1da/0x23a
[    2.869535]  kernel_init+0xc/0x166
[    2.869957]  ret_from_fork+0x22/0x30

arm64:

[    3.786911] calling stack_trace_save:
[    3.787147]  stack_trace_save+0x50/0x78
[    3.787443]  test_func+0x84/0x13c
[    3.787738]  do_one_initcall+0x5c/0x310
[    3.788099]  kernel_init_freeable+0x214/0x294
[    3.788363]  kernel_init+0x18/0x164
[    3.788585]  ret_from_fork+0x10/0x30

[    3.803615] calling stack_trace_save_tsk:
[    3.804266]  stack_trace_save_tsk+0x9c/0x100
[    3.804541]  test_func+0xc4/0x13c
[    3.804803]  do_one_initcall+0x5c/0x310
[    3.805031]  kernel_init_freeable+0x214/0x294
[    3.805284]  kernel_init+0x18/0x164
[    3.805505]  ret_from_fork+0x10/0x30

+Cc arm64 folks.

So I think the arm64 version also has a bug, because I think a user of
<linux/stacktrace.h> really doesn't care about the library function
itself. And from reading kernel/stacktrace.c I think it wants to exclude
itself entirely.

It's a shame that <linux/stacktrace.h> isn't better documented, but I'm
pretty sure that including the library functions in the trace is not
useful.

For the ppc version, let's do what x86 does and start with the caller.

Thanks,
-- Marco
Mark Rutland March 4, 2021, 2:57 p.m. UTC | #4
[adding Mark Brown]

On Wed, Mar 03, 2021 at 04:20:43PM +0100, Marco Elver wrote:
> On Wed, Mar 03, 2021 at 03:52PM +0100, Christophe Leroy wrote:
> > Le 03/03/2021 � 15:38, Marco Elver a �crit�:
> > > On Wed, 3 Mar 2021 at 15:09, Christophe Leroy
> > > <christophe.leroy@csgroup.eu> wrote:
> > > > 
> > > > It seems like all other sane architectures, namely x86 and arm64
> > > > at least, include the running function as top entry when saving
> > > > stack trace.
> > > > 
> > > > Functionnalities like KFENCE expect it.
> > > > 
> > > > Do the same on powerpc, it allows KFENCE to properly identify the faulting
> > > > function as depicted below. Before the patch KFENCE was identifying
> > > > finish_task_switch.isra as the faulting function.
> > > > 
> > > > [   14.937370] ==================================================================
> > > > [   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
> > > > [   14.948692]
> > > > [   14.956814] Invalid read at 0xdf98800a:
> > > > [   14.960664]  test_invalid_access+0x54/0x108
> > > > [   14.964876]  finish_task_switch.isra.0+0x54/0x23c
> > > > [   14.969606]  kunit_try_run_case+0x5c/0xd0
> > > > [   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
> > > > [   14.979079]  kthread+0x15c/0x174
> > > > [   14.982342]  ret_from_kernel_thread+0x14/0x1c
> > > > [   14.986731]
> > > > [   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
> > > > [   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
> > > > [   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
> > > > [   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
> > > > [   15.022043] DAR: df98800a DSISR: 20000000
> > > > [   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
> > > > [   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
> > > > [   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
> > > > [   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > > [   15.051181] Call Trace:
> > > > [   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
> > > > [   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > > [   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
> > > > [   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
> > > > [   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
> > > > [   15.085798] Instruction dump:
> > > > [   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
> > > > [   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
> > > > [   15.104612] ==================================================================
> > > > 
> > > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > > 
> > > Acked-by: Marco Elver <elver@google.com>
> > > 
> > > Thank you, I think this looks like the right solution. Just a question below:
> > > 
> > ...
> > 
> > > > @@ -59,23 +70,26 @@ void save_stack_trace(struct stack_trace *trace)
> > > > 
> > > >          sp = current_stack_frame();
> > > > 
> > > > -       save_context_stack(trace, sp, current, 1);
> > > > +       save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);
> > > 
> > > This causes ip == save_stack_trace and also below for
> > > save_stack_trace_tsk. Does this mean save_stack_trace() is included in
> > > the trace? Looking at kernel/stacktrace.c, I think the library wants
> > > to exclude itself from the trace, as it does '.skip = skipnr + 1' (and
> > > '.skip   = skipnr + (current == tsk)' for the _tsk variant).
> > > 
> > > If the arch-helper here is included, should this use _RET_IP_ instead?
> > > 
> > 
> > Don't really know, I was inspired by arm64 which has:
> > 
> > void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> > 		     struct task_struct *task, struct pt_regs *regs)
> > {
> > 	struct stackframe frame;
> > 
> > 	if (regs)
> > 		start_backtrace(&frame, regs->regs[29], regs->pc);
> > 	else if (task == current)
> > 		start_backtrace(&frame,
> > 				(unsigned long)__builtin_frame_address(0),
> > 				(unsigned long)arch_stack_walk);
> > 	else
> > 		start_backtrace(&frame, thread_saved_fp(task),
> > 				thread_saved_pc(task));
> > 
> > 	walk_stackframe(task, &frame, consume_entry, cookie);
> > }
> > 
> > But looking at x86 you may be right, so what should be done really ?
> 
> x86:
> 
> [    2.843292] calling stack_trace_save:
> [    2.843705]  test_func+0x6c/0x118
> [    2.844184]  do_one_initcall+0x58/0x270
> [    2.844618]  kernel_init_freeable+0x1da/0x23a
> [    2.845110]  kernel_init+0xc/0x166
> [    2.845494]  ret_from_fork+0x22/0x30
> 
> [    2.867525] calling stack_trace_save_tsk:
> [    2.868017]  test_func+0xa9/0x118
> [    2.868530]  do_one_initcall+0x58/0x270
> [    2.869003]  kernel_init_freeable+0x1da/0x23a
> [    2.869535]  kernel_init+0xc/0x166
> [    2.869957]  ret_from_fork+0x22/0x30
> 
> arm64:
> 
> [    3.786911] calling stack_trace_save:
> [    3.787147]  stack_trace_save+0x50/0x78
> [    3.787443]  test_func+0x84/0x13c
> [    3.787738]  do_one_initcall+0x5c/0x310
> [    3.788099]  kernel_init_freeable+0x214/0x294
> [    3.788363]  kernel_init+0x18/0x164
> [    3.788585]  ret_from_fork+0x10/0x30
> 
> [    3.803615] calling stack_trace_save_tsk:
> [    3.804266]  stack_trace_save_tsk+0x9c/0x100
> [    3.804541]  test_func+0xc4/0x13c
> [    3.804803]  do_one_initcall+0x5c/0x310
> [    3.805031]  kernel_init_freeable+0x214/0x294
> [    3.805284]  kernel_init+0x18/0x164
> [    3.805505]  ret_from_fork+0x10/0x30
> 
> +Cc arm64 folks.
> 
> So I think the arm64 version also has a bug, because I think a user of
> <linux/stacktrace.h> really doesn't care about the library function
> itself. And from reading kernel/stacktrace.c I think it wants to exclude
> itself entirely.
>
> It's a shame that <linux/stacktrace.h> isn't better documented, but I'm
> pretty sure that including the library functions in the trace is not
> useful.

I agree this behaviour isn't desireable, and that the lack of
documentation is unfortunate.

It looks like GCC is happy to give us the function-entry-time FP if we use
__builtin_frame_address(1), and assuming clang is similarly happy we can do:

| diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
| index ad20981dfda4..5dfbf915eb7f 100644
| --- a/arch/arm64/kernel/stacktrace.c
| +++ b/arch/arm64/kernel/stacktrace.c
| @@ -203,8 +203,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
|                 start_backtrace(&frame, regs->regs[29], regs->pc);
|         else if (task == current)
|                 start_backtrace(&frame,
| -                               (unsigned long)__builtin_frame_address(0),
| -                               (unsigned long)arch_stack_walk);
| +                               (unsigned long)__builtin_frame_address(1),
| +                               (unsigned long)__builtin_return_address(0));
|         else
|                 start_backtrace(&frame, thread_saved_fp(task),
|                                 thread_saved_pc(task));

... such that arch_stack_walk() will try to avoid including itself in a
trace, and so the existing skipping should (w/ caveats below) skip
stack_trace_save() or stack_trace_save_tsk().

If that works for you, I can spin that as a patch, though we'll need to
check that doesn't introduce a new fencepost error elsewhere.

The bigger problem here is that skipping is dodgy to begin with, and
this is still liable to break in some cases. One big concern is that
(especially with LTO) we cannot guarantee the compiler will not inline
or outline functions, causing the skipp value to be too large or too
small. That's liable to happen to callers, and in theory (though
unlikely in practice), portions of arch_stack_walk() or
stack_trace_save() could get outlined too.

Unless we can get some strong guarantees from compiler folk such that we
can guarantee a specific function acts boundary for unwinding (and
doesn't itself get split, etc), the only reliable way I can think to
solve this requires an assembly trampoline. Whatever we do is liable to
need some invasive rework.

Thanks,
Mark.
Marco Elver March 4, 2021, 3:30 p.m. UTC | #5
On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> [adding Mark Brown]
>
> On Wed, Mar 03, 2021 at 04:20:43PM +0100, Marco Elver wrote:
> > On Wed, Mar 03, 2021 at 03:52PM +0100, Christophe Leroy wrote:
> > > Le 03/03/2021 � 15:38, Marco Elver a �crit�:
> > > > On Wed, 3 Mar 2021 at 15:09, Christophe Leroy
> > > > <christophe.leroy@csgroup.eu> wrote:
> > > > >
> > > > > It seems like all other sane architectures, namely x86 and arm64
> > > > > at least, include the running function as top entry when saving
> > > > > stack trace.
> > > > >
> > > > > Functionnalities like KFENCE expect it.
> > > > >
> > > > > Do the same on powerpc, it allows KFENCE to properly identify the faulting
> > > > > function as depicted below. Before the patch KFENCE was identifying
> > > > > finish_task_switch.isra as the faulting function.
> > > > >
> > > > > [   14.937370] ==================================================================
> > > > > [   14.948692] BUG: KFENCE: invalid read in test_invalid_access+0x54/0x108
> > > > > [   14.948692]
> > > > > [   14.956814] Invalid read at 0xdf98800a:
> > > > > [   14.960664]  test_invalid_access+0x54/0x108
> > > > > [   14.964876]  finish_task_switch.isra.0+0x54/0x23c
> > > > > [   14.969606]  kunit_try_run_case+0x5c/0xd0
> > > > > [   14.973658]  kunit_generic_run_threadfn_adapter+0x24/0x30
> > > > > [   14.979079]  kthread+0x15c/0x174
> > > > > [   14.982342]  ret_from_kernel_thread+0x14/0x1c
> > > > > [   14.986731]
> > > > > [   14.988236] CPU: 0 PID: 111 Comm: kunit_try_catch Tainted: G    B             5.12.0-rc1-01537-g95f6e2088d7e-dirty #4682
> > > > > [   14.999795] NIP:  c016ec2c LR: c02f517c CTR: c016ebd8
> > > > > [   15.004851] REGS: e2449d90 TRAP: 0301   Tainted: G    B              (5.12.0-rc1-01537-g95f6e2088d7e-dirty)
> > > > > [   15.015274] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 22000004  XER: 00000000
> > > > > [   15.022043] DAR: df98800a DSISR: 20000000
> > > > > [   15.022043] GPR00: c02f517c e2449e50 c1142080 e100dd24 c084b13c 00000008 c084b32b c016ebd8
> > > > > [   15.022043] GPR08: c0850000 df988000 c0d10000 e2449eb0 22000288
> > > > > [   15.040581] NIP [c016ec2c] test_invalid_access+0x54/0x108
> > > > > [   15.046010] LR [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > > > [   15.051181] Call Trace:
> > > > > [   15.053637] [e2449e50] [c005a68c] finish_task_switch.isra.0+0x54/0x23c (unreliable)
> > > > > [   15.061338] [e2449eb0] [c02f517c] kunit_try_run_case+0x5c/0xd0
> > > > > [   15.067215] [e2449ed0] [c02f648c] kunit_generic_run_threadfn_adapter+0x24/0x30
> > > > > [   15.074472] [e2449ef0] [c004e7b0] kthread+0x15c/0x174
> > > > > [   15.079571] [e2449f30] [c001317c] ret_from_kernel_thread+0x14/0x1c
> > > > > [   15.085798] Instruction dump:
> > > > > [   15.088784] 8129d608 38e7ebd8 81020280 911f004c 39000000 995f0024 907f0028 90ff001c
> > > > > [   15.096613] 3949000a 915f0020 3d40c0d1 3d00c085 <8929000a> 3908adb0 812a4b98 3d40c02f
> > > > > [   15.104612] ==================================================================
> > > > >
> > > > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> > > >
> > > > Acked-by: Marco Elver <elver@google.com>
> > > >
> > > > Thank you, I think this looks like the right solution. Just a question below:
> > > >
> > > ...
> > >
> > > > > @@ -59,23 +70,26 @@ void save_stack_trace(struct stack_trace *trace)
> > > > >
> > > > >          sp = current_stack_frame();
> > > > >
> > > > > -       save_context_stack(trace, sp, current, 1);
> > > > > +       save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);
> > > >
> > > > This causes ip == save_stack_trace and also below for
> > > > save_stack_trace_tsk. Does this mean save_stack_trace() is included in
> > > > the trace? Looking at kernel/stacktrace.c, I think the library wants
> > > > to exclude itself from the trace, as it does '.skip = skipnr + 1' (and
> > > > '.skip   = skipnr + (current == tsk)' for the _tsk variant).
> > > >
> > > > If the arch-helper here is included, should this use _RET_IP_ instead?
> > > >
> > >
> > > Don't really know, I was inspired by arm64 which has:
> > >
> > > void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> > >                  struct task_struct *task, struct pt_regs *regs)
> > > {
> > >     struct stackframe frame;
> > >
> > >     if (regs)
> > >             start_backtrace(&frame, regs->regs[29], regs->pc);
> > >     else if (task == current)
> > >             start_backtrace(&frame,
> > >                             (unsigned long)__builtin_frame_address(0),
> > >                             (unsigned long)arch_stack_walk);
> > >     else
> > >             start_backtrace(&frame, thread_saved_fp(task),
> > >                             thread_saved_pc(task));
> > >
> > >     walk_stackframe(task, &frame, consume_entry, cookie);
> > > }
> > >
> > > But looking at x86 you may be right, so what should be done really ?
> >
> > x86:
> >
> > [    2.843292] calling stack_trace_save:
> > [    2.843705]  test_func+0x6c/0x118
> > [    2.844184]  do_one_initcall+0x58/0x270
> > [    2.844618]  kernel_init_freeable+0x1da/0x23a
> > [    2.845110]  kernel_init+0xc/0x166
> > [    2.845494]  ret_from_fork+0x22/0x30
> >
> > [    2.867525] calling stack_trace_save_tsk:
> > [    2.868017]  test_func+0xa9/0x118
> > [    2.868530]  do_one_initcall+0x58/0x270
> > [    2.869003]  kernel_init_freeable+0x1da/0x23a
> > [    2.869535]  kernel_init+0xc/0x166
> > [    2.869957]  ret_from_fork+0x22/0x30
> >
> > arm64:
> >
> > [    3.786911] calling stack_trace_save:
> > [    3.787147]  stack_trace_save+0x50/0x78
> > [    3.787443]  test_func+0x84/0x13c
> > [    3.787738]  do_one_initcall+0x5c/0x310
> > [    3.788099]  kernel_init_freeable+0x214/0x294
> > [    3.788363]  kernel_init+0x18/0x164
> > [    3.788585]  ret_from_fork+0x10/0x30
> >
> > [    3.803615] calling stack_trace_save_tsk:
> > [    3.804266]  stack_trace_save_tsk+0x9c/0x100
> > [    3.804541]  test_func+0xc4/0x13c
> > [    3.804803]  do_one_initcall+0x5c/0x310
> > [    3.805031]  kernel_init_freeable+0x214/0x294
> > [    3.805284]  kernel_init+0x18/0x164
> > [    3.805505]  ret_from_fork+0x10/0x30
> >
> > +Cc arm64 folks.
> >
> > So I think the arm64 version also has a bug, because I think a user of
> > <linux/stacktrace.h> really doesn't care about the library function
> > itself. And from reading kernel/stacktrace.c I think it wants to exclude
> > itself entirely.
> >
> > It's a shame that <linux/stacktrace.h> isn't better documented, but I'm
> > pretty sure that including the library functions in the trace is not
> > useful.
>
> I agree this behaviour isn't desireable, and that the lack of
> documentation is unfortunate.
>
> It looks like GCC is happy to give us the function-entry-time FP if we use
> __builtin_frame_address(1), and assuming clang is similarly happy we can do:
>
> | diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> | index ad20981dfda4..5dfbf915eb7f 100644
> | --- a/arch/arm64/kernel/stacktrace.c
> | +++ b/arch/arm64/kernel/stacktrace.c
> | @@ -203,8 +203,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> |                 start_backtrace(&frame, regs->regs[29], regs->pc);
> |         else if (task == current)
> |                 start_backtrace(&frame,
> | -                               (unsigned long)__builtin_frame_address(0),
> | -                               (unsigned long)arch_stack_walk);
> | +                               (unsigned long)__builtin_frame_address(1),
> | +                               (unsigned long)__builtin_return_address(0));
> |         else
> |                 start_backtrace(&frame, thread_saved_fp(task),
> |                                 thread_saved_pc(task));
>
> ... such that arch_stack_walk() will try to avoid including itself in a
> trace, and so the existing skipping should (w/ caveats below) skip
> stack_trace_save() or stack_trace_save_tsk().

Thank you! Yes, that works.

> If that works for you, I can spin that as a patch, though we'll need to
> check that doesn't introduce a new fencepost error elsewhere.
>
> The bigger problem here is that skipping is dodgy to begin with, and
> this is still liable to break in some cases. One big concern is that
> (especially with LTO) we cannot guarantee the compiler will not inline
> or outline functions, causing the skipp value to be too large or too
> small. That's liable to happen to callers, and in theory (though
> unlikely in practice), portions of arch_stack_walk() or
> stack_trace_save() could get outlined too.
>
> Unless we can get some strong guarantees from compiler folk such that we
> can guarantee a specific function acts boundary for unwinding (and
> doesn't itself get split, etc), the only reliable way I can think to
> solve this requires an assembly trampoline. Whatever we do is liable to
> need some invasive rework.

Will LTO and friends respect 'noinline'? One thing I also noticed is
that tail calls would also cause the stack trace to appear somewhat
incomplete (for some of my tests I've disabled tail call
optimizations). Is there a way to also mark a function
non-tail-callable? But I'm also not sure if with all that we'd be
guaranteed the code we want, even though in practice it might.

Thanks,
-- Marco
Mark Rutland March 4, 2021, 4:59 p.m. UTC | #6
On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > [adding Mark Brown]
> >
> > The bigger problem here is that skipping is dodgy to begin with, and
> > this is still liable to break in some cases. One big concern is that
> > (especially with LTO) we cannot guarantee the compiler will not inline
> > or outline functions, causing the skipp value to be too large or too
> > small. That's liable to happen to callers, and in theory (though
> > unlikely in practice), portions of arch_stack_walk() or
> > stack_trace_save() could get outlined too.
> >
> > Unless we can get some strong guarantees from compiler folk such that we
> > can guarantee a specific function acts boundary for unwinding (and
> > doesn't itself get split, etc), the only reliable way I can think to
> > solve this requires an assembly trampoline. Whatever we do is liable to
> > need some invasive rework.
> 
> Will LTO and friends respect 'noinline'?

I hope so (and suspect we'd have more problems otherwise), but I don't
know whether they actually so.

I suspect even with 'noinline' the compiler is permitted to outline
portions of a function if it wanted to (and IIUC it could still make
specialized copies in the absence of 'noclone').

> One thing I also noticed is that tail calls would also cause the stack
> trace to appear somewhat incomplete (for some of my tests I've
> disabled tail call optimizations).

I assume you mean for a chain A->B->C where B tail-calls C, you get a
trace A->C? ... or is A going missing too?

> Is there a way to also mark a function non-tail-callable?

I think this can be bodged using __attribute__((optimize("$OPTIONS")))
on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
function-local optimization options), but I don't expect there's any way
to mark a callee as not being tail-callable.

Accoding to the GCC documentation, GCC won't TCO noreturn functions, but
obviously that's not something we can use generally.

https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes

> But I'm also not sure if with all that we'd be guaranteed the code we
> want, even though in practice it might.

True! I'd just like to be on the least dodgy ground we can be.

Thanks,
Mark.
Marco Elver March 4, 2021, 5:25 p.m. UTC | #7
On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > [adding Mark Brown]
> > >
> > > The bigger problem here is that skipping is dodgy to begin with, and
> > > this is still liable to break in some cases. One big concern is that
> > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > or outline functions, causing the skipp value to be too large or too
> > > small. That's liable to happen to callers, and in theory (though
> > > unlikely in practice), portions of arch_stack_walk() or
> > > stack_trace_save() could get outlined too.
> > >
> > > Unless we can get some strong guarantees from compiler folk such that we
> > > can guarantee a specific function acts boundary for unwinding (and
> > > doesn't itself get split, etc), the only reliable way I can think to
> > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > need some invasive rework.
> > 
> > Will LTO and friends respect 'noinline'?
> 
> I hope so (and suspect we'd have more problems otherwise), but I don't
> know whether they actually so.
> 
> I suspect even with 'noinline' the compiler is permitted to outline
> portions of a function if it wanted to (and IIUC it could still make
> specialized copies in the absence of 'noclone').
> 
> > One thing I also noticed is that tail calls would also cause the stack
> > trace to appear somewhat incomplete (for some of my tests I've
> > disabled tail call optimizations).
> 
> I assume you mean for a chain A->B->C where B tail-calls C, you get a
> trace A->C? ... or is A going missing too?

Correct, it's just the A->C outcome.

> > Is there a way to also mark a function non-tail-callable?
> 
> I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> function-local optimization options), but I don't expect there's any way
> to mark a callee as not being tail-callable.

I don't think this is reliable. It'd be
__attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
work if applied to the function we do not want to tail-call-optimize,
but would have to be applied to the function that does the tail-calling.
So it's a bit backwards, even if it worked.

> Accoding to the GCC documentation, GCC won't TCO noreturn functions, but
> obviously that's not something we can use generally.
> 
> https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes

Perhaps we can ask the toolchain folks to help add such an attribute. Or
maybe the feature already exists somewhere, but hidden.

+Cc linux-toolchains@vger.kernel.org

> > But I'm also not sure if with all that we'd be guaranteed the code we
> > want, even though in practice it might.
> 
> True! I'd just like to be on the least dodgy ground we can be.

It's been dodgy for a while, and I'd welcome any low-cost fixes to make
it less dodgy in the short-term at least. :-)

Thanks,
-- Marco
Nick Desaulniers March 4, 2021, 5:54 p.m. UTC | #8
On Thu, Mar 4, 2021 at 9:42 AM Marco Elver <elver@google.com> wrote:
>
> On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> > On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > > [adding Mark Brown]
> > > >
> > > > The bigger problem here is that skipping is dodgy to begin with, and
> > > > this is still liable to break in some cases. One big concern is that
> > > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > > or outline functions, causing the skipp value to be too large or too
> > > > small. That's liable to happen to callers, and in theory (though
> > > > unlikely in practice), portions of arch_stack_walk() or
> > > > stack_trace_save() could get outlined too.
> > > >
> > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > can guarantee a specific function acts boundary for unwinding (and
> > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > need some invasive rework.
> > >
> > > Will LTO and friends respect 'noinline'?
> >
> > I hope so (and suspect we'd have more problems otherwise), but I don't
> > know whether they actually so.
> >
> > I suspect even with 'noinline' the compiler is permitted to outline
> > portions of a function if it wanted to (and IIUC it could still make
> > specialized copies in the absence of 'noclone').
> >
> > > One thing I also noticed is that tail calls would also cause the stack
> > > trace to appear somewhat incomplete (for some of my tests I've
> > > disabled tail call optimizations).
> >
> > I assume you mean for a chain A->B->C where B tail-calls C, you get a
> > trace A->C? ... or is A going missing too?
>
> Correct, it's just the A->C outcome.
>
> > > Is there a way to also mark a function non-tail-callable?
> >
> > I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> > on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> > function-local optimization options), but I don't expect there's any way
> > to mark a callee as not being tail-callable.
>
> I don't think this is reliable. It'd be
> __attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
> work if applied to the function we do not want to tail-call-optimize,
> but would have to be applied to the function that does the tail-calling.
> So it's a bit backwards, even if it worked.
>
> > Accoding to the GCC documentation, GCC won't TCO noreturn functions, but
> > obviously that's not something we can use generally.
> >
> > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes

include/linux/compiler.h:246:
prevent_tail_call_optimization

commit a9a3ed1eff36 ("x86: Fix early boot crash on gcc-10, third try")

>
> Perhaps we can ask the toolchain folks to help add such an attribute. Or
> maybe the feature already exists somewhere, but hidden.
>
> +Cc linux-toolchains@vger.kernel.org
>
> > > But I'm also not sure if with all that we'd be guaranteed the code we
> > > want, even though in practice it might.
> >
> > True! I'd just like to be on the least dodgy ground we can be.
>
> It's been dodgy for a while, and I'd welcome any low-cost fixes to make
> it less dodgy in the short-term at least. :-)
>
> Thanks,
> -- Marco
Mark Rutland March 4, 2021, 6:01 p.m. UTC | #9
On Thu, Mar 04, 2021 at 06:25:33PM +0100, Marco Elver wrote:
> On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> > On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > > [adding Mark Brown]
> > > >
> > > > The bigger problem here is that skipping is dodgy to begin with, and
> > > > this is still liable to break in some cases. One big concern is that
> > > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > > or outline functions, causing the skipp value to be too large or too
> > > > small. That's liable to happen to callers, and in theory (though
> > > > unlikely in practice), portions of arch_stack_walk() or
> > > > stack_trace_save() could get outlined too.
> > > >
> > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > can guarantee a specific function acts boundary for unwinding (and
> > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > need some invasive rework.
> > > 
> > > Will LTO and friends respect 'noinline'?
> > 
> > I hope so (and suspect we'd have more problems otherwise), but I don't
> > know whether they actually so.
> > 
> > I suspect even with 'noinline' the compiler is permitted to outline
> > portions of a function if it wanted to (and IIUC it could still make
> > specialized copies in the absence of 'noclone').
> > 
> > > One thing I also noticed is that tail calls would also cause the stack
> > > trace to appear somewhat incomplete (for some of my tests I've
> > > disabled tail call optimizations).
> > 
> > I assume you mean for a chain A->B->C where B tail-calls C, you get a
> > trace A->C? ... or is A going missing too?
> 
> Correct, it's just the A->C outcome.

I'd assumed that those cases were benign, e.g. for livepatching what
matters is what can be returned to, so B disappearing from the trace
isn't a problem there.

Is the concern debugability, or is there a functional issue you have in
mind?

> > > Is there a way to also mark a function non-tail-callable?
> > 
> > I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> > on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> > function-local optimization options), but I don't expect there's any way
> > to mark a callee as not being tail-callable.
> 
> I don't think this is reliable. It'd be
> __attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
> work if applied to the function we do not want to tail-call-optimize,
> but would have to be applied to the function that does the tail-calling.

Yup; that's what I meant then I said you could do that on the caller but
not the callee.

I don't follow why you'd want to put this on the callee, though, so I
think I'm missing something. Considering a set of functions in different
compilation units:

  A->B->C->D->E->F->G->H->I->J->K

... if K were marked in this way, and J was compiled with visibility of
this, J would stick around, but J's callers might not, and so the a
trace might see:

  A->J->K

... do you just care about the final caller, i.e. you just need
certainty that J will be in the trace?

If so, we can somewhat bodge that by having K have an __always_inline
wrapper which has a barrier() or similar after the real call to K, so
the call couldn't be TCO'd.

Otherwise I'd expect we'd probably need to disable TCO generally.

> So it's a bit backwards, even if it worked.
> 
> > Accoding to the GCC documentation, GCC won't TCO noreturn functions, but
> > obviously that's not something we can use generally.
> > 
> > https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes
> 
> Perhaps we can ask the toolchain folks to help add such an attribute. Or
> maybe the feature already exists somewhere, but hidden.
> 
> +Cc linux-toolchains@vger.kernel.org
> 
> > > But I'm also not sure if with all that we'd be guaranteed the code we
> > > want, even though in practice it might.
> > 
> > True! I'd just like to be on the least dodgy ground we can be.
> 
> It's been dodgy for a while, and I'd welcome any low-cost fixes to make
> it less dodgy in the short-term at least. :-)

:)

Thanks,
Mark.
Marco Elver March 4, 2021, 6:22 p.m. UTC | #10
On Thu, 4 Mar 2021 at 19:02, Mark Rutland <mark.rutland@arm.com> wrote:
> On Thu, Mar 04, 2021 at 06:25:33PM +0100, Marco Elver wrote:
> > On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> > > On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > > > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > > > [adding Mark Brown]
> > > > >
> > > > > The bigger problem here is that skipping is dodgy to begin with, and
> > > > > this is still liable to break in some cases. One big concern is that
> > > > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > > > or outline functions, causing the skipp value to be too large or too
> > > > > small. That's liable to happen to callers, and in theory (though
> > > > > unlikely in practice), portions of arch_stack_walk() or
> > > > > stack_trace_save() could get outlined too.
> > > > >
> > > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > > can guarantee a specific function acts boundary for unwinding (and
> > > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > > need some invasive rework.
> > > >
> > > > Will LTO and friends respect 'noinline'?
> > >
> > > I hope so (and suspect we'd have more problems otherwise), but I don't
> > > know whether they actually so.
> > >
> > > I suspect even with 'noinline' the compiler is permitted to outline
> > > portions of a function if it wanted to (and IIUC it could still make
> > > specialized copies in the absence of 'noclone').
> > >
> > > > One thing I also noticed is that tail calls would also cause the stack
> > > > trace to appear somewhat incomplete (for some of my tests I've
> > > > disabled tail call optimizations).
> > >
> > > I assume you mean for a chain A->B->C where B tail-calls C, you get a
> > > trace A->C? ... or is A going missing too?
> >
> > Correct, it's just the A->C outcome.
>
> I'd assumed that those cases were benign, e.g. for livepatching what
> matters is what can be returned to, so B disappearing from the trace
> isn't a problem there.
>
> Is the concern debugability, or is there a functional issue you have in
> mind?

For me, it's just been debuggability, and reliable test cases.

> > > > Is there a way to also mark a function non-tail-callable?
> > >
> > > I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> > > on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> > > function-local optimization options), but I don't expect there's any way
> > > to mark a callee as not being tail-callable.
> >
> > I don't think this is reliable. It'd be
> > __attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
> > work if applied to the function we do not want to tail-call-optimize,
> > but would have to be applied to the function that does the tail-calling.
>
> Yup; that's what I meant then I said you could do that on the caller but
> not the callee.
>
> I don't follow why you'd want to put this on the callee, though, so I
> think I'm missing something. Considering a set of functions in different
> compilation units:
>
>   A->B->C->D->E->F->G->H->I->J->K

I was having this problem with KCSAN, where the compiler would
tail-call-optimize __tsan_X instrumentation. This would mean that
KCSAN runtime functions ended up in the trace, but the function where
the access happened would not. However, I don't care about the runtime
functions, and instead want to see the function where the access
happened. In that case, I'd like to just mark __tsan_X and any other
kcsan instrumentation functions as do-not-tail-call-optimize, which
would solve the problem.

The solution today is that when you compile a kernel with KCSAN, every
instrumented TU is compiled with -fno-optimize-sibling-calls. The
better solution would be to just mark KCSAN runtime functions somehow,
but permit tail calling other things. Although, I probably still want
to see the full trace, and would decide that having
-fno-optimize-sibling-calls is a small price to pay in a
debug-only-kernel to get complete traces.

> ... if K were marked in this way, and J was compiled with visibility of
> this, J would stick around, but J's callers might not, and so the a
> trace might see:
>
>   A->J->K
>
> ... do you just care about the final caller, i.e. you just need
> certainty that J will be in the trace?

Yes. But maybe it's a special problem that only sanitizers have.

> If so, we can somewhat bodge that by having K have an __always_inline
> wrapper which has a barrier() or similar after the real call to K, so
> the call couldn't be TCO'd.
>
> Otherwise I'd expect we'd probably need to disable TCO generally.

Thanks,
-- Marco
Mark Rutland March 4, 2021, 6:51 p.m. UTC | #11
On Thu, Mar 04, 2021 at 07:22:53PM +0100, Marco Elver wrote:
> On Thu, 4 Mar 2021 at 19:02, Mark Rutland <mark.rutland@arm.com> wrote:
> > On Thu, Mar 04, 2021 at 06:25:33PM +0100, Marco Elver wrote:
> > > On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> > > > On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > > > > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > > > > [adding Mark Brown]
> > > > > >
> > > > > > The bigger problem here is that skipping is dodgy to begin with, and
> > > > > > this is still liable to break in some cases. One big concern is that
> > > > > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > > > > or outline functions, causing the skipp value to be too large or too
> > > > > > small. That's liable to happen to callers, and in theory (though
> > > > > > unlikely in practice), portions of arch_stack_walk() or
> > > > > > stack_trace_save() could get outlined too.
> > > > > >
> > > > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > > > can guarantee a specific function acts boundary for unwinding (and
> > > > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > > > need some invasive rework.
> > > > >
> > > > > Will LTO and friends respect 'noinline'?
> > > >
> > > > I hope so (and suspect we'd have more problems otherwise), but I don't
> > > > know whether they actually so.
> > > >
> > > > I suspect even with 'noinline' the compiler is permitted to outline
> > > > portions of a function if it wanted to (and IIUC it could still make
> > > > specialized copies in the absence of 'noclone').
> > > >
> > > > > One thing I also noticed is that tail calls would also cause the stack
> > > > > trace to appear somewhat incomplete (for some of my tests I've
> > > > > disabled tail call optimizations).
> > > >
> > > > I assume you mean for a chain A->B->C where B tail-calls C, you get a
> > > > trace A->C? ... or is A going missing too?
> > >
> > > Correct, it's just the A->C outcome.
> >
> > I'd assumed that those cases were benign, e.g. for livepatching what
> > matters is what can be returned to, so B disappearing from the trace
> > isn't a problem there.
> >
> > Is the concern debugability, or is there a functional issue you have in
> > mind?
> 
> For me, it's just been debuggability, and reliable test cases.
> 
> > > > > Is there a way to also mark a function non-tail-callable?
> > > >
> > > > I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> > > > on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> > > > function-local optimization options), but I don't expect there's any way
> > > > to mark a callee as not being tail-callable.
> > >
> > > I don't think this is reliable. It'd be
> > > __attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
> > > work if applied to the function we do not want to tail-call-optimize,
> > > but would have to be applied to the function that does the tail-calling.
> >
> > Yup; that's what I meant then I said you could do that on the caller but
> > not the callee.
> >
> > I don't follow why you'd want to put this on the callee, though, so I
> > think I'm missing something. Considering a set of functions in different
> > compilation units:
> >
> >   A->B->C->D->E->F->G->H->I->J->K
> 
> I was having this problem with KCSAN, where the compiler would
> tail-call-optimize __tsan_X instrumentation.

Those are compiler-generated calls, right? When those are generated the
compilation unit (and whatever it has included) might not have provided
a prototype anyway, and the compiler has special knowledge of the
functions, so it feels like the compiler would need to inhibit TCO here
for this to be robust. For their intended usage subjecting them to TCO
doesn't seem to make sense AFAICT.

I suspect that compilers have some way of handling that; otherwise I'd
expect to have heard stories of mcount/fentry calls getting TCO'd and
causing problems. So maybe there's an easy fix there?

> This would mean that KCSAN runtime functions ended up in the trace,
> but the function where the access happened would not. However, I don't
> care about the runtime functions, and instead want to see the function
> where the access happened. In that case, I'd like to just mark
> __tsan_X and any other kcsan instrumentation functions as
> do-not-tail-call-optimize, which would solve the problem.

I understand why we don't want to TCO these calls, but given the calls
are implicitly generated, I strongly suspect it's better to fix the
implicit call generation to not be TCO'd to begin with.

> The solution today is that when you compile a kernel with KCSAN, every
> instrumented TU is compiled with -fno-optimize-sibling-calls. The
> better solution would be to just mark KCSAN runtime functions somehow,
> but permit tail calling other things. Although, I probably still want
> to see the full trace, and would decide that having
> -fno-optimize-sibling-calls is a small price to pay in a
> debug-only-kernel to get complete traces.
> 
> > ... if K were marked in this way, and J was compiled with visibility of
> > this, J would stick around, but J's callers might not, and so the a
> > trace might see:
> >
> >   A->J->K
> >
> > ... do you just care about the final caller, i.e. you just need
> > certainty that J will be in the trace?
> 
> Yes. But maybe it's a special problem that only sanitizers have.

I reckon for basically any instrumentation we don't want calls to be
TCO'd, though I'm not immediately sure of cases beyond sanitizers and
mcount/fentry.

Thanks,
Mark.
Marco Elver March 4, 2021, 7:01 p.m. UTC | #12
On Thu, 4 Mar 2021 at 19:51, Mark Rutland <mark.rutland@arm.com> wrote:
> On Thu, Mar 04, 2021 at 07:22:53PM +0100, Marco Elver wrote:
> > On Thu, 4 Mar 2021 at 19:02, Mark Rutland <mark.rutland@arm.com> wrote:
> > > On Thu, Mar 04, 2021 at 06:25:33PM +0100, Marco Elver wrote:
> > > > On Thu, Mar 04, 2021 at 04:59PM +0000, Mark Rutland wrote:
> > > > > On Thu, Mar 04, 2021 at 04:30:34PM +0100, Marco Elver wrote:
> > > > > > On Thu, 4 Mar 2021 at 15:57, Mark Rutland <mark.rutland@arm.com> wrote:
> > > > > > > [adding Mark Brown]
> > > > > > >
> > > > > > > The bigger problem here is that skipping is dodgy to begin with, and
> > > > > > > this is still liable to break in some cases. One big concern is that
> > > > > > > (especially with LTO) we cannot guarantee the compiler will not inline
> > > > > > > or outline functions, causing the skipp value to be too large or too
> > > > > > > small. That's liable to happen to callers, and in theory (though
> > > > > > > unlikely in practice), portions of arch_stack_walk() or
> > > > > > > stack_trace_save() could get outlined too.
> > > > > > >
> > > > > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > > > > can guarantee a specific function acts boundary for unwinding (and
> > > > > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > > > > need some invasive rework.
> > > > > >
> > > > > > Will LTO and friends respect 'noinline'?
> > > > >
> > > > > I hope so (and suspect we'd have more problems otherwise), but I don't
> > > > > know whether they actually so.
> > > > >
> > > > > I suspect even with 'noinline' the compiler is permitted to outline
> > > > > portions of a function if it wanted to (and IIUC it could still make
> > > > > specialized copies in the absence of 'noclone').
> > > > >
> > > > > > One thing I also noticed is that tail calls would also cause the stack
> > > > > > trace to appear somewhat incomplete (for some of my tests I've
> > > > > > disabled tail call optimizations).
> > > > >
> > > > > I assume you mean for a chain A->B->C where B tail-calls C, you get a
> > > > > trace A->C? ... or is A going missing too?
> > > >
> > > > Correct, it's just the A->C outcome.
> > >
> > > I'd assumed that those cases were benign, e.g. for livepatching what
> > > matters is what can be returned to, so B disappearing from the trace
> > > isn't a problem there.
> > >
> > > Is the concern debugability, or is there a functional issue you have in
> > > mind?
> >
> > For me, it's just been debuggability, and reliable test cases.
> >
> > > > > > Is there a way to also mark a function non-tail-callable?
> > > > >
> > > > > I think this can be bodged using __attribute__((optimize("$OPTIONS")))
> > > > > on a caller to inhibit TCO (though IIRC GCC doesn't reliably support
> > > > > function-local optimization options), but I don't expect there's any way
> > > > > to mark a callee as not being tail-callable.
> > > >
> > > > I don't think this is reliable. It'd be
> > > > __attribute__((optimize("-fno-optimize-sibling-calls"))), but doesn't
> > > > work if applied to the function we do not want to tail-call-optimize,
> > > > but would have to be applied to the function that does the tail-calling.
> > >
> > > Yup; that's what I meant then I said you could do that on the caller but
> > > not the callee.
> > >
> > > I don't follow why you'd want to put this on the callee, though, so I
> > > think I'm missing something. Considering a set of functions in different
> > > compilation units:
> > >
> > >   A->B->C->D->E->F->G->H->I->J->K
> >
> > I was having this problem with KCSAN, where the compiler would
> > tail-call-optimize __tsan_X instrumentation.
>
> Those are compiler-generated calls, right? When those are generated the
> compilation unit (and whatever it has included) might not have provided
> a prototype anyway, and the compiler has special knowledge of the
> functions, so it feels like the compiler would need to inhibit TCO here
> for this to be robust. For their intended usage subjecting them to TCO
> doesn't seem to make sense AFAICT.
>
> I suspect that compilers have some way of handling that; otherwise I'd
> expect to have heard stories of mcount/fentry calls getting TCO'd and
> causing problems. So maybe there's an easy fix there?

I agree, the compiler builtins should be handled by the compiler
directly, perhaps that was a bad example. But we also have "explicit
instrumentation", e.g. everything that's in <linux/instrumented.h>.

> > This would mean that KCSAN runtime functions ended up in the trace,
> > but the function where the access happened would not. However, I don't
> > care about the runtime functions, and instead want to see the function
> > where the access happened. In that case, I'd like to just mark
> > __tsan_X and any other kcsan instrumentation functions as
> > do-not-tail-call-optimize, which would solve the problem.
>
> I understand why we don't want to TCO these calls, but given the calls
> are implicitly generated, I strongly suspect it's better to fix the
> implicit call generation to not be TCO'd to begin with.
>
> > The solution today is that when you compile a kernel with KCSAN, every
> > instrumented TU is compiled with -fno-optimize-sibling-calls. The
> > better solution would be to just mark KCSAN runtime functions somehow,
> > but permit tail calling other things. Although, I probably still want
> > to see the full trace, and would decide that having
> > -fno-optimize-sibling-calls is a small price to pay in a
> > debug-only-kernel to get complete traces.
> >
> > > ... if K were marked in this way, and J was compiled with visibility of
> > > this, J would stick around, but J's callers might not, and so the a
> > > trace might see:
> > >
> > >   A->J->K
> > >
> > > ... do you just care about the final caller, i.e. you just need
> > > certainty that J will be in the trace?
> >
> > Yes. But maybe it's a special problem that only sanitizers have.
>
> I reckon for basically any instrumentation we don't want calls to be
> TCO'd, though I'm not immediately sure of cases beyond sanitizers and
> mcount/fentry.

Thinking about this more, I think it's all debugging tools. E.g.
lockdep, if you lock/unlock at the end of a function, you might tail
call into lockdep. If the compiler applies TCO, and lockdep determines
there's a bug and then shows a trace, you'll have no idea where the
actual bug is. The kernel has lots of debugging facilities that add
instrumentation in this way. So perhaps it's a general debugging-tool
problem (rather than just sanitizers).

Thanks,
-- Marco
Segher Boessenkool March 4, 2021, 7:24 p.m. UTC | #13
On Thu, Mar 04, 2021 at 09:54:44AM -0800, Nick Desaulniers wrote:
> On Thu, Mar 4, 2021 at 9:42 AM Marco Elver <elver@google.com> wrote:
> include/linux/compiler.h:246:
> prevent_tail_call_optimization
> 
> commit a9a3ed1eff36 ("x86: Fix early boot crash on gcc-10, third try")

That is much heavier than needed (an mb()).  You can just put an empty
inline asm after a call before a return, and that call cannot be
optimised to a sibling call: (the end of a function is an implicit
return:)

Instead of:

void g(void);
void f(int x)
	if (x)
		g();
}

Do:

void g(void);
void f(int x)
	if (x)
		g();
	asm("");
}

This costs no extra instructions, and certainly not something as heavy
as an mb()!  It works without the "if" as well, of course, but with it
it is a more interesting example of a tail call.


Segher
Segher Boessenkool March 4, 2021, 9:54 p.m. UTC | #14
Hi!

On Thu, Mar 04, 2021 at 02:57:30PM +0000, Mark Rutland wrote:
> It looks like GCC is happy to give us the function-entry-time FP if we use
> __builtin_frame_address(1),

From the GCC manual:
     Calling this function with a nonzero argument can have
     unpredictable effects, including crashing the calling program.  As
     a result, calls that are considered unsafe are diagnosed when the
     '-Wframe-address' option is in effect.  Such calls should only be
     made in debugging situations.

It *does* warn (the warning is in -Wall btw), on both powerpc and
aarch64.  Furthermore, using this builtin causes lousy code (it forces
the use of a frame pointer, which we normally try very hard to optimise
away, for good reason).

And, that warning is not an idle warning.  Non-zero arguments to
__builtin_frame_address can crash the program.  It won't on simpler
functions, but there is no real definition of what a simpler function
*is*.  It is meant for debugging, not for production use (this is also
why no one has bothered to make it faster).

On Power it should work, but on pretty much any other arch it won't.

> Unless we can get some strong guarantees from compiler folk such that we
> can guarantee a specific function acts boundary for unwinding (and
> doesn't itself get split, etc), the only reliable way I can think to
> solve this requires an assembly trampoline. Whatever we do is liable to
> need some invasive rework.

You cannot get such a guarantee, other than not letting the compiler
see into the routine at all, like with assembler code (not inline asm,
real assembler code).

The real way forward is to bite the bullet and to no longer pretend you
can do a full backtrace from just the stack contents.  You cannot.


Segher
Christophe Leroy March 5, 2021, 6:38 a.m. UTC | #15
Le 04/03/2021 à 20:24, Segher Boessenkool a écrit :
> On Thu, Mar 04, 2021 at 09:54:44AM -0800, Nick Desaulniers wrote:
>> On Thu, Mar 4, 2021 at 9:42 AM Marco Elver <elver@google.com> wrote:
>> include/linux/compiler.h:246:
>> prevent_tail_call_optimization
>>
>> commit a9a3ed1eff36 ("x86: Fix early boot crash on gcc-10, third try")

https://github.com/linuxppc/linux/commit/a9a3ed1eff36

> 
> That is much heavier than needed (an mb()).  You can just put an empty
> inline asm after a call before a return, and that call cannot be
> optimised to a sibling call: (the end of a function is an implicit
> return:)
> 
> Instead of:
> 
> void g(void);
> void f(int x)
> 	if (x)
> 		g();
> }
> 
> Do:
> 
> void g(void);
> void f(int x)
> 	if (x)
> 		g();
> 	asm("");
> }
> 
> This costs no extra instructions, and certainly not something as heavy
> as an mb()!  It works without the "if" as well, of course, but with it
> it is a more interesting example of a tail call.

In the commit mentionned at the top, it is said:

The next attempt to prevent compilers from tail-call optimizing
the last function call cpu_startup_entry(), ... , was to add an empty asm("").

This current solution was short and sweet, and reportedly, is supported
by both compilers but we didn't get very far this time: future (LTO?)
optimization passes could potentially eliminate this, which leads us
to the third attempt: having an actual memory barrier there which the
compiler cannot ignore or move around etc.

Christophe
Mark Rutland March 5, 2021, 12:04 p.m. UTC | #16
On Thu, Mar 04, 2021 at 08:01:29PM +0100, Marco Elver wrote:
> On Thu, 4 Mar 2021 at 19:51, Mark Rutland <mark.rutland@arm.com> wrote:
> > On Thu, Mar 04, 2021 at 07:22:53PM +0100, Marco Elver wrote:

> > > I was having this problem with KCSAN, where the compiler would
> > > tail-call-optimize __tsan_X instrumentation.
> >
> > Those are compiler-generated calls, right? When those are generated the
> > compilation unit (and whatever it has included) might not have provided
> > a prototype anyway, and the compiler has special knowledge of the
> > functions, so it feels like the compiler would need to inhibit TCO here
> > for this to be robust. For their intended usage subjecting them to TCO
> > doesn't seem to make sense AFAICT.
> >
> > I suspect that compilers have some way of handling that; otherwise I'd
> > expect to have heard stories of mcount/fentry calls getting TCO'd and
> > causing problems. So maybe there's an easy fix there?
> 
> I agree, the compiler builtins should be handled by the compiler
> directly, perhaps that was a bad example. But we also have "explicit
> instrumentation", e.g. everything that's in <linux/instrumented.h>.

True -- I agree for those we want similar, and can see a case for a
no-tco-calls-to-me attribute on functions as with noreturn.

Maybe for now it's worth adding prevent_tail_call_optimization() to the
instrument_*() call wrappers in <linux/instrumented.h>? As those are
__always_inline, that should keep the function they get inlined in
around. Though we probably want to see if we can replace the mb() in
prevent_tail_call_optimization() with something that doesn't require a
real CPU barrier.

[...]

> > I reckon for basically any instrumentation we don't want calls to be
> > TCO'd, though I'm not immediately sure of cases beyond sanitizers and
> > mcount/fentry.
> 
> Thinking about this more, I think it's all debugging tools. E.g.
> lockdep, if you lock/unlock at the end of a function, you might tail
> call into lockdep. If the compiler applies TCO, and lockdep determines
> there's a bug and then shows a trace, you'll have no idea where the
> actual bug is. The kernel has lots of debugging facilities that add
> instrumentation in this way. So perhaps it's a general debugging-tool
> problem (rather than just sanitizers).

This makes sense to me.

Thanks,
Mark.
Segher Boessenkool March 5, 2021, 6:16 p.m. UTC | #17
On Fri, Mar 05, 2021 at 07:38:25AM +0100, Christophe Leroy wrote:
> Le 04/03/2021 à 20:24, Segher Boessenkool a écrit :
> https://github.com/linuxppc/linux/commit/a9a3ed1eff36
> 
> >
> >That is much heavier than needed (an mb()).  You can just put an empty
> >inline asm after a call before a return, and that call cannot be
> >optimised to a sibling call: (the end of a function is an implicit
> >return:)

> In the commit mentionned at the top, it is said:
> 
> The next attempt to prevent compilers from tail-call optimizing
> the last function call cpu_startup_entry(), ... , was to add an empty 
> asm("").
> 
> This current solution was short and sweet, and reportedly, is supported
> by both compilers but we didn't get very far this time: future (LTO?)
> optimization passes could potentially eliminate this,

This is simply not true.  A volatile inline asm (like this is, all
asm without outputs are) is always run on the reel machine exactly like
on the abstract machine.  LTO can not eliminate it, not more than any
other optimisation can.  The compiler makes no assumption about the
constents of the template of an asm, empty or not.

If you are really scared the compiler violates the rules of GCC inline
asm and thinks it knows what "" means, you can write
  asm(";#");
(that is a comment on all supported archs).

> which leads us
> to the third attempt: having an actual memory barrier there which the
> compiler cannot ignore or move around etc.

Why would it not be allowed to delete this, and delete some other asm?

And the compiler *can* move around asm like this.  But the point is,
it has to stay in order with other side effects, so there cannot be a
sibling call here, the call has to remain: any call contains a sequence
point, so side effects cannot be reordered over it, so the call (being
before the asm) cannot be transformed to a tail call.


Segher
Mark Rutland March 9, 2021, 4:05 p.m. UTC | #18
On Thu, Mar 04, 2021 at 03:54:48PM -0600, Segher Boessenkool wrote:
> Hi!

Hi Segher,

> On Thu, Mar 04, 2021 at 02:57:30PM +0000, Mark Rutland wrote:
> > It looks like GCC is happy to give us the function-entry-time FP if we use
> > __builtin_frame_address(1),
> 
> From the GCC manual:
>      Calling this function with a nonzero argument can have
>      unpredictable effects, including crashing the calling program.  As
>      a result, calls that are considered unsafe are diagnosed when the
>      '-Wframe-address' option is in effect.  Such calls should only be
>      made in debugging situations.
> 
> It *does* warn (the warning is in -Wall btw), on both powerpc and
> aarch64.  Furthermore, using this builtin causes lousy code (it forces
> the use of a frame pointer, which we normally try very hard to optimise
> away, for good reason).
> 
> And, that warning is not an idle warning.  Non-zero arguments to
> __builtin_frame_address can crash the program.  It won't on simpler
> functions, but there is no real definition of what a simpler function
> *is*.  It is meant for debugging, not for production use (this is also
> why no one has bothered to make it faster).
>
> On Power it should work, but on pretty much any other arch it won't.

I understand this is true generally, and cannot be relied upon in
portable code. However as you hint here for Power, I believe that on
arm64 __builtin_frame_address(1) shouldn't crash the program due to the
way frame records work on arm64, but I'll go check with some local
compiler folk. I agree that __builtin_frame_address(2) and beyond
certainly can, e.g.  by NULL dereference and similar.

For context, why do you think this would work on power specifically? I
wonder if our rationale is similar.

Are you aware of anything in particular that breaks using
__builtin_frame_address(1) in non-portable code, or is this just a
general sentiment of this not being a supported use-case?

> > Unless we can get some strong guarantees from compiler folk such that we
> > can guarantee a specific function acts boundary for unwinding (and
> > doesn't itself get split, etc), the only reliable way I can think to
> > solve this requires an assembly trampoline. Whatever we do is liable to
> > need some invasive rework.
> 
> You cannot get such a guarantee, other than not letting the compiler
> see into the routine at all, like with assembler code (not inline asm,
> real assembler code).

If we cannot reliably ensure this then I'm happy to go write an assembly
trampoline to snapshot the state at a function call boundary (where our
procedure call standard mandates the state of the LR, FP, and frame
records pointed to by the FP). This'll require reworking a reasonable
amount of code cross-architecture, so I'll need to get some more
concrete justification (e.g. examples of things that can go wrong in
practice).

> The real way forward is to bite the bullet and to no longer pretend you
> can do a full backtrace from just the stack contents.  You cannot.

I think what you mean here is that there's no reliable way to handle the
current/leaf function, right? If so I do agree.

Beyond that I believe that arm64's frame records should be sufficient.

Thanks,
Mark.
Segher Boessenkool March 9, 2021, 10:05 p.m. UTC | #19
Hi!

On Tue, Mar 09, 2021 at 04:05:23PM +0000, Mark Rutland wrote:
> On Thu, Mar 04, 2021 at 03:54:48PM -0600, Segher Boessenkool wrote:
> > On Thu, Mar 04, 2021 at 02:57:30PM +0000, Mark Rutland wrote:
> > > It looks like GCC is happy to give us the function-entry-time FP if we use
> > > __builtin_frame_address(1),
> > 
> > From the GCC manual:
> >      Calling this function with a nonzero argument can have
> >      unpredictable effects, including crashing the calling program.  As
> >      a result, calls that are considered unsafe are diagnosed when the
> >      '-Wframe-address' option is in effect.  Such calls should only be
> >      made in debugging situations.
> > 
> > It *does* warn (the warning is in -Wall btw), on both powerpc and
> > aarch64.  Furthermore, using this builtin causes lousy code (it forces
> > the use of a frame pointer, which we normally try very hard to optimise
> > away, for good reason).
> > 
> > And, that warning is not an idle warning.  Non-zero arguments to
> > __builtin_frame_address can crash the program.  It won't on simpler
> > functions, but there is no real definition of what a simpler function
> > *is*.  It is meant for debugging, not for production use (this is also
> > why no one has bothered to make it faster).
> >
> > On Power it should work, but on pretty much any other arch it won't.
> 
> I understand this is true generally, and cannot be relied upon in
> portable code. However as you hint here for Power, I believe that on
> arm64 __builtin_frame_address(1) shouldn't crash the program due to the
> way frame records work on arm64, but I'll go check with some local
> compiler folk. I agree that __builtin_frame_address(2) and beyond
> certainly can, e.g.  by NULL dereference and similar.

I still do not know the aarch64 ABI well enough.  If only I had time!

> For context, why do you think this would work on power specifically? I
> wonder if our rationale is similar.

On most 64-bit Power ABIs all stack frames are connected together as a
linked list (which is updated atomically, importantly).  This makes it
possible to always find all previous stack frames.

> Are you aware of anything in particular that breaks using
> __builtin_frame_address(1) in non-portable code, or is this just a
> general sentiment of this not being a supported use-case?

It is not supported, and trying to do it anyway can crash: it can use
random stack contents as pointer!  Not really "random" of course, but
where it thinks to find a pointer into the previous frame, which is not
something it can rely on (unless the ABI guarantees it somehow).

See gcc.gnu.org/PR60109 for example.

> > > Unless we can get some strong guarantees from compiler folk such that we
> > > can guarantee a specific function acts boundary for unwinding (and
> > > doesn't itself get split, etc), the only reliable way I can think to
> > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > need some invasive rework.
> > 
> > You cannot get such a guarantee, other than not letting the compiler
> > see into the routine at all, like with assembler code (not inline asm,
> > real assembler code).
> 
> If we cannot reliably ensure this then I'm happy to go write an assembly
> trampoline to snapshot the state at a function call boundary (where our
> procedure call standard mandates the state of the LR, FP, and frame
> records pointed to by the FP).

Is the frame pointer required?!

> This'll require reworking a reasonable
> amount of code cross-architecture, so I'll need to get some more
> concrete justification (e.g. examples of things that can go wrong in
> practice).

Say you have a function that does dynamic stack allocation, then there
is usually no way to find the previous stack frame (without function-
specific knowledge).  So __builtin_frame_address cannot work (it knows
nothing about frames further up).

Dynamic stack allocation (alloca, or variable length automatic arrays)
is just the most common and most convenient example; it is not the only
case you have problems here.

> > The real way forward is to bite the bullet and to no longer pretend you
> > can do a full backtrace from just the stack contents.  You cannot.
> 
> I think what you mean here is that there's no reliable way to handle the
> current/leaf function, right? If so I do agree.

No, I meant what I said.

There is the separate issue that you do not know where the return
address (etc.) is stored in a function that has not yet done a call
itself, sure.  You cannot assume anything the ABI does not tell you you
can depend on.

> Beyond that I believe that arm64's frame records should be sufficient.

Do you have a simple linked list connecting all frames?  The aarch64 GCC
port does not define anything special here (DYNAMIC_CHAIN_ADDRESS), so
the default will be used: every frame pointer has to point to the
previous one, no exceptions whatsoever.


Segher
Mark Rutland March 10, 2021, 11:32 a.m. UTC | #20
On Tue, Mar 09, 2021 at 04:05:32PM -0600, Segher Boessenkool wrote:
> Hi!
> 
> On Tue, Mar 09, 2021 at 04:05:23PM +0000, Mark Rutland wrote:
> > On Thu, Mar 04, 2021 at 03:54:48PM -0600, Segher Boessenkool wrote:
> > > On Thu, Mar 04, 2021 at 02:57:30PM +0000, Mark Rutland wrote:
> > > > It looks like GCC is happy to give us the function-entry-time FP if we use
> > > > __builtin_frame_address(1),
> > > 
> > > From the GCC manual:
> > >      Calling this function with a nonzero argument can have
> > >      unpredictable effects, including crashing the calling program.  As
> > >      a result, calls that are considered unsafe are diagnosed when the
> > >      '-Wframe-address' option is in effect.  Such calls should only be
> > >      made in debugging situations.
> > > 
> > > It *does* warn (the warning is in -Wall btw), on both powerpc and
> > > aarch64.  Furthermore, using this builtin causes lousy code (it forces
> > > the use of a frame pointer, which we normally try very hard to optimise
> > > away, for good reason).
> > > 
> > > And, that warning is not an idle warning.  Non-zero arguments to
> > > __builtin_frame_address can crash the program.  It won't on simpler
> > > functions, but there is no real definition of what a simpler function
> > > *is*.  It is meant for debugging, not for production use (this is also
> > > why no one has bothered to make it faster).
> > >
> > > On Power it should work, but on pretty much any other arch it won't.
> > 
> > I understand this is true generally, and cannot be relied upon in
> > portable code. However as you hint here for Power, I believe that on
> > arm64 __builtin_frame_address(1) shouldn't crash the program due to the
> > way frame records work on arm64, but I'll go check with some local
> > compiler folk. I agree that __builtin_frame_address(2) and beyond
> > certainly can, e.g.  by NULL dereference and similar.
> 
> I still do not know the aarch64 ABI well enough.  If only I had time!
> 
> > For context, why do you think this would work on power specifically? I
> > wonder if our rationale is similar.
> 
> On most 64-bit Power ABIs all stack frames are connected together as a
> linked list (which is updated atomically, importantly).  This makes it
> possible to always find all previous stack frames.

We have something similar on arm64, where the kernel depends on being
built with a frame pointer following the AAPCS frame pointer rules.

Every stack frame contains a "frame record" *somewhere* within that
stack frame, and the frame records are chained together as a linked
list. The frame pointer points at the most recent frame record (and this
is what __builtin_frame_address(0) returns).

The records themselves are basically:

struct record {
	struct record *next;
	unsigned long ret_addr;
};

At function call boundaries, we know that the FP is the caller's record
(or NULL for the first function), and the LR is the address the current
function should return to. Within a function with a stack frame, we can
access that function's record and the `next` field (equivalent to the FP
at the time of entry to the function) is what __builtin_frame_address(1)
should return.

> > Are you aware of anything in particular that breaks using
> > __builtin_frame_address(1) in non-portable code, or is this just a
> > general sentiment of this not being a supported use-case?
> 
> It is not supported, and trying to do it anyway can crash: it can use
> random stack contents as pointer!  Not really "random" of course, but
> where it thinks to find a pointer into the previous frame, which is not
> something it can rely on (unless the ABI guarantees it somehow).
> 
> See gcc.gnu.org/PR60109 for example.

Sure; I see that being true generally (and Ramana noted that on 32-bit
arm a frame pointer wasn't mandated), but I think in this case we have a
stronger target (and configuration) specific guarantee.

> > > > Unless we can get some strong guarantees from compiler folk such that we
> > > > can guarantee a specific function acts boundary for unwinding (and
> > > > doesn't itself get split, etc), the only reliable way I can think to
> > > > solve this requires an assembly trampoline. Whatever we do is liable to
> > > > need some invasive rework.
> > > 
> > > You cannot get such a guarantee, other than not letting the compiler
> > > see into the routine at all, like with assembler code (not inline asm,
> > > real assembler code).
> > 
> > If we cannot reliably ensure this then I'm happy to go write an assembly
> > trampoline to snapshot the state at a function call boundary (where our
> > procedure call standard mandates the state of the LR, FP, and frame
> > records pointed to by the FP).
> 
> Is the frame pointer required?!

The arm64 Linux port mandates frame pointers for kernel code. It is
generally possible to build code without frame pointers (e.g. userspace),
but doing that for kernel code would be a bug.

> > This'll require reworking a reasonable
> > amount of code cross-architecture, so I'll need to get some more
> > concrete justification (e.g. examples of things that can go wrong in
> > practice).
> 
> Say you have a function that does dynamic stack allocation, then there
> is usually no way to find the previous stack frame (without function-
> specific knowledge).  So __builtin_frame_address cannot work (it knows
> nothing about frames further up).
> 
> Dynamic stack allocation (alloca, or variable length automatic arrays)
> is just the most common and most convenient example; it is not the only
> case you have problems here.

I agree with those as general concerns, but I don't think that affects
arm64's frame records, since their location within a stack frame is
immaterial given the chaining.

> > > The real way forward is to bite the bullet and to no longer pretend you
> > > can do a full backtrace from just the stack contents.  You cannot.
> > 
> > I think what you mean here is that there's no reliable way to handle the
> > current/leaf function, right? If so I do agree.
> 
> No, I meant what I said.
> 
> There is the separate issue that you do not know where the return
> address (etc.) is stored in a function that has not yet done a call
> itself, sure.  You cannot assume anything the ABI does not tell you you
> can depend on.

This is in the frame record per the AAPCS.

> > Beyond that I believe that arm64's frame records should be sufficient.
> 
> Do you have a simple linked list connecting all frames?

Yes.

Thanks,
Mark.
Segher Boessenkool March 10, 2021, 5:37 p.m. UTC | #21
Hi!

On Wed, Mar 10, 2021 at 11:32:20AM +0000, Mark Rutland wrote:
> On Tue, Mar 09, 2021 at 04:05:32PM -0600, Segher Boessenkool wrote:
> > On Tue, Mar 09, 2021 at 04:05:23PM +0000, Mark Rutland wrote:
> > > On Thu, Mar 04, 2021 at 03:54:48PM -0600, Segher Boessenkool wrote:
> > > > On Thu, Mar 04, 2021 at 02:57:30PM +0000, Mark Rutland wrote:
> > > > > It looks like GCC is happy to give us the function-entry-time FP if we use
> > > > > __builtin_frame_address(1),
> > > > 
> > > > From the GCC manual:
> > > >      Calling this function with a nonzero argument can have
> > > >      unpredictable effects, including crashing the calling program.  As
> > > >      a result, calls that are considered unsafe are diagnosed when the
> > > >      '-Wframe-address' option is in effect.  Such calls should only be
> > > >      made in debugging situations.
> > > > 
> > > > It *does* warn (the warning is in -Wall btw), on both powerpc and
> > > > aarch64.  Furthermore, using this builtin causes lousy code (it forces
> > > > the use of a frame pointer, which we normally try very hard to optimise
> > > > away, for good reason).
> > > > 
> > > > And, that warning is not an idle warning.  Non-zero arguments to
> > > > __builtin_frame_address can crash the program.  It won't on simpler
> > > > functions, but there is no real definition of what a simpler function
> > > > *is*.  It is meant for debugging, not for production use (this is also
> > > > why no one has bothered to make it faster).
> > > >
> > > > On Power it should work, but on pretty much any other arch it won't.
> > > 
> > > I understand this is true generally, and cannot be relied upon in
> > > portable code. However as you hint here for Power, I believe that on
> > > arm64 __builtin_frame_address(1) shouldn't crash the program due to the
> > > way frame records work on arm64, but I'll go check with some local
> > > compiler folk. I agree that __builtin_frame_address(2) and beyond
> > > certainly can, e.g.  by NULL dereference and similar.
> > 
> > I still do not know the aarch64 ABI well enough.  If only I had time!
> > 
> > > For context, why do you think this would work on power specifically? I
> > > wonder if our rationale is similar.
> > 
> > On most 64-bit Power ABIs all stack frames are connected together as a
> > linked list (which is updated atomically, importantly).  This makes it
> > possible to always find all previous stack frames.
> 
> We have something similar on arm64, where the kernel depends on being
> built with a frame pointer following the AAPCS frame pointer rules.

The huge difference is on Power this is about the stack itself: you do
not need a frame pointer at all for it (there is no specific register
named as frame pointer, even).

> Every stack frame contains a "frame record" *somewhere* within that
> stack frame, and the frame records are chained together as a linked
> list. The frame pointer points at the most recent frame record (and this
> is what __builtin_frame_address(0) returns).

> > See gcc.gnu.org/PR60109 for example.
> 
> Sure; I see that being true generally (and Ramana noted that on 32-bit
> arm a frame pointer wasn't mandated), but I think in this case we have a
> stronger target (and configuration) specific guarantee.

It sounds like it, yes.  You need to have a frame pointer in the ABI,
with pretty strong rules, and have everything follow those rules.

> > Is the frame pointer required?!
> 
> The arm64 Linux port mandates frame pointers for kernel code. It is
> generally possible to build code without frame pointers (e.g. userspace),
> but doing that for kernel code would be a bug.

I see.  And it even is less expensive to do this than on most machines,
because of register pair load/store instructions :-)

> > > > The real way forward is to bite the bullet and to no longer pretend you
> > > > can do a full backtrace from just the stack contents.  You cannot.
> > > 
> > > I think what you mean here is that there's no reliable way to handle the
> > > current/leaf function, right? If so I do agree.
> > 
> > No, I meant what I said.
> > 
> > There is the separate issue that you do not know where the return
> > address (etc.) is stored in a function that has not yet done a call
> > itself, sure.  You cannot assume anything the ABI does not tell you you
> > can depend on.
> 
> This is in the frame record per the AAPCS.

But you do not know where in the function it will store that.  It often
can be optimised by the compiler to only store the LR and FP on paths
where a call will happen later, and there is no way (without DWARF info
or similar) to know whether that has happened yet or not.

This is a well-known problem of course.  For the current function you
cannot know in general if there is an activation frame yet or not.


Segher

Patch
diff mbox series

diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index b6440657ef92..67c2b8488035 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -22,16 +22,32 @@ 
 #include <asm/kprobes.h>
 
 #include <asm/paca.h>
+#include <asm/switch_to.h>
 
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
  */
+static void save_entry(struct stack_trace *trace, unsigned long ip, int savesched)
+{
+	if (savesched || !in_sched_functions(ip)) {
+		if (!trace->skip)
+			trace->entries[trace->nr_entries++] = ip;
+		else
+			trace->skip--;
+	}
+}
+
 static void save_context_stack(struct stack_trace *trace, unsigned long sp,
-			struct task_struct *tsk, int savesched)
+			       unsigned long ip, struct task_struct *tsk, int savesched)
 {
+	save_entry(trace, ip, savesched);
+
+	if (trace->nr_entries >= trace->max_entries)
+		return;
+
 	for (;;) {
 		unsigned long *stack = (unsigned long *) sp;
-		unsigned long newsp, ip;
+		unsigned long newsp;
 
 		if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
 			return;
@@ -39,12 +55,7 @@  static void save_context_stack(struct stack_trace *trace, unsigned long sp,
 		newsp = stack[0];
 		ip = stack[STACK_FRAME_LR_SAVE];
 
-		if (savesched || !in_sched_functions(ip)) {
-			if (!trace->skip)
-				trace->entries[trace->nr_entries++] = ip;
-			else
-				trace->skip--;
-		}
+		save_entry(trace, ip, savesched);
 
 		if (trace->nr_entries >= trace->max_entries)
 			return;
@@ -59,23 +70,26 @@  void save_stack_trace(struct stack_trace *trace)
 
 	sp = current_stack_frame();
 
-	save_context_stack(trace, sp, current, 1);
+	save_context_stack(trace, sp, (unsigned long)save_stack_trace, current, 1);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-	unsigned long sp;
+	unsigned long sp, ip;
 
 	if (!try_get_task_stack(tsk))
 		return;
 
-	if (tsk == current)
+	if (tsk == current) {
+		ip = (unsigned long)save_stack_trace_tsk;
 		sp = current_stack_frame();
-	else
+	} else {
+		ip = (unsigned long)_switch;
 		sp = tsk->thread.ksp;
+	}
 
-	save_context_stack(trace, sp, tsk, 0);
+	save_context_stack(trace, sp, ip, tsk, 0);
 
 	put_task_stack(tsk);
 }
@@ -84,7 +98,7 @@  EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 void
 save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
-	save_context_stack(trace, regs->gpr[1], current, 0);
+	save_context_stack(trace, regs->gpr[1], regs->nip, current, 0);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_regs);