[PATCH] x86/stacktrace: Do not access user space memory unnecessarily

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
@ 2019-07-02  5:31 Eiichi Tsukata
  2019-07-02  7:28 ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Eiichi Tsukata @ 2019-07-02  5:31 UTC (permalink / raw)
  To: rostedt, edwintorok, tglx, mingo, bp, hpa, x86, linux-kernel
  Cc: Eiichi Tsukata

Put the boundary check before it accesses user space to prevent unnecessary
access which might crash the machine.

Especially, ftrace preemptirq/irq_disable event with user stack trace
option can trigger SEGV in pid 1 which leads to panic.

Reproducer:

  CONFIG_PREEMPTIRQ_TRACEPOINTS=y
  # echo 1 > events/preemptirq/enable
  # echo userstacktrace > trace_options

Output:

  Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
  CPU: 1 PID: 1 Comm: systemd Not tainted 5.2.0-rc7+ #10
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Call Trace:
   dump_stack+0x67/0x90
   panic+0x100/0x2c6
   do_exit.cold+0x4e/0x101
   do_group_exit+0x3a/0xa0
   get_signal+0x14a/0x8e0
   do_signal+0x36/0x650
   exit_to_usermode_loop+0x92/0xb0
   prepare_exit_to_usermode+0x6f/0xb0
   retint_user+0x8/0x18
  RIP: 0033:0x55be7ad1c89f
  Code: Bad RIP value.
  RSP: 002b:00007ffe329a4b00 EFLAGS: 00010202
  RAX: 0000000000000768 RBX: 00007ffe329a4ba0 RCX: 00007ff0063aa469
  RDX: 00007ff0066761de RSI: 00007ffe329a4b20 RDI: 0000000000000768
  RBP: 000000000000000b R08: 0000000000000000 R09: 00007ffe329a4e2f
  R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000768
  R13: 0000000000000000 R14: 0000000000000004 R15: 000055be7b3d3560
  Kernel Offset: 0x2a000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Fixes: 02b67518e2b1 ("tracing: add support for userspace stacktraces in tracing/iter_ctrl")
Signed-off-by: Eiichi Tsukata <devel@etsukata.com>
---
 arch/x86/kernel/stacktrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 2abf27d7df6b..6d0c608ffe34 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -123,12 +123,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 	while (1) {
 		struct stack_frame_user frame;
 
+		if ((unsigned long)fp < regs->sp)
+			break;
 		frame.next_fp = NULL;
 		frame.ret_addr = 0;
 		if (!copy_stack_frame(fp, &frame))
 			break;
-		if ((unsigned long)fp < regs->sp)
-			break;
 		if (frame.ret_addr) {
 			if (!consume_entry(cookie, frame.ret_addr, false))
 				return;
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02  5:31 [PATCH] x86/stacktrace: Do not access user space memory unnecessarily Eiichi Tsukata
@ 2019-07-02  7:28 ` Peter Zijlstra
  2019-07-02 14:14   ` Thomas Gleixner
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2019-07-02  7:28 UTC (permalink / raw)
  To: Eiichi Tsukata
  Cc: rostedt, edwintorok, tglx, mingo, bp, hpa, x86, linux-kernel,
	Josh Poimboeuf

On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:
> Put the boundary check before it accesses user space to prevent unnecessary
> access which might crash the machine.
> 
> Especially, ftrace preemptirq/irq_disable event with user stack trace
> option can trigger SEGV in pid 1 which leads to panic.
> 
> Reproducer:
> 
>   CONFIG_PREEMPTIRQ_TRACEPOINTS=y
>   # echo 1 > events/preemptirq/enable
>   # echo userstacktrace > trace_options
> 
> Output:
> 
>   Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
>   CPU: 1 PID: 1 Comm: systemd Not tainted 5.2.0-rc7+ #10

Killing systemd is a feature :-)

>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
>   Call Trace:
>    dump_stack+0x67/0x90
>    panic+0x100/0x2c6
>    do_exit.cold+0x4e/0x101
>    do_group_exit+0x3a/0xa0
>    get_signal+0x14a/0x8e0
>    do_signal+0x36/0x650
>    exit_to_usermode_loop+0x92/0xb0
>    prepare_exit_to_usermode+0x6f/0xb0
>    retint_user+0x8/0x18
>   RIP: 0033:0x55be7ad1c89f
>   Code: Bad RIP value.

^^^ that's weird, no amount of unwinding should affect regs->ip.

>   RSP: 002b:00007ffe329a4b00 EFLAGS: 00010202
>   RAX: 0000000000000768 RBX: 00007ffe329a4ba0 RCX: 00007ff0063aa469
>   RDX: 00007ff0066761de RSI: 00007ffe329a4b20 RDI: 0000000000000768
>   RBP: 000000000000000b R08: 0000000000000000 R09: 00007ffe329a4e2f
>   R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000768
>   R13: 0000000000000000 R14: 0000000000000004 R15: 000055be7b3d3560
>   Kernel Offset: 0x2a000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> 
> Fixes: 02b67518e2b1 ("tracing: add support for userspace stacktraces in tracing/iter_ctrl")
> Signed-off-by: Eiichi Tsukata <devel@etsukata.com>
> ---
>  arch/x86/kernel/stacktrace.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
> index 2abf27d7df6b..6d0c608ffe34 100644
> --- a/arch/x86/kernel/stacktrace.c
> +++ b/arch/x86/kernel/stacktrace.c
> @@ -123,12 +123,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
>  	while (1) {
>  		struct stack_frame_user frame;
>  
> +		if ((unsigned long)fp < regs->sp)
> +			break;
>  		frame.next_fp = NULL;
>  		frame.ret_addr = 0;
>  		if (!copy_stack_frame(fp, &frame))
>  			break;
> -		if ((unsigned long)fp < regs->sp)
> -			break;

Aside of which, that doesn't make sense, even if copy_stack_frame() was
fed utter garbage it should never result in the user process being
affected.

It does: "pagefault_disable(); __copy_from_user_inatomic()", which
should take the fault and catch it in an extable and have it return
-EFAULT.

Something is really fishy here, maybe Josh has an idea?

>  		if (frame.ret_addr) {
>  			if (!consume_entry(cookie, frame.ret_addr, false))
>  				return;
> -- 
> 2.21.0
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02  7:28 ` Peter Zijlstra
@ 2019-07-02 14:14   ` Thomas Gleixner
  2019-07-02 15:33     ` Steven Rostedt
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2019-07-02 14:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eiichi Tsukata, rostedt, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf

On Tue, 2 Jul 2019, Peter Zijlstra wrote:

> On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:
> > Put the boundary check before it accesses user space to prevent unnecessary
> > access which might crash the machine.
> > 
> > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > option can trigger SEGV in pid 1 which leads to panic.

It triggers segfaults in random user processes which is bad enough.

And even with that 'fix' applied I can see random segfaults just less
frequent.

> >   RIP: 0033:0x55be7ad1c89f
> >   Code: Bad RIP value.
> 
> ^^^ that's weird, no amount of unwinding should affect regs->ip.

True.

I've gathered a trace from a crash. It's available here:

     https://tglx.de/~tglx/log.1.xz

The interesting part is:

[  352.756926]  systemd-1       1d..2 346277977us : <user stack trace>0000000004
[  352.756926]  =>  <00007f785ae26289>
[  352.758084]  systemd-1       1...1 346277978us : sys_clone -> 0x495
[  352.758846]  systemd-1       1...1 346277978us : <stack trace>    5
[  352.758846]  => do_syscall_64
[  352.758846]  => entry_SYSCALL_64_after_hwframe
[  352.760399]  systemd-1       1...1 346277979us : <user stack trace>
[  352.760399]  =>  <00007f785ae26289>TS]
[  352.761556]  systemd-1       1d... 346277979us : irq_disable: caller=do_syscall_64+0x87/0x110 parent=entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  352.763048]  systemd-1       1d... 346277979us : <stack trace>
[  352.763048]  => trace_hardirqs_off
[  352.763048]  => do_syscall_64
[  352.763048]  => entry_SYSCALL_64_after_hwframe
[  352.765015]  systemd-1       1d... 346277979us : <user 
[  352.765015]  =>  <00007f785ae26289>
[  352.766173]  systemd-1       1d... 346277980us : irq_enable: caller=trace_hardirqs_on_thunk+0x1a/0x1c parent=entry_SYSCALL_64_after_hwframe+0x59/0xbe
[  352.767745]  systemd-1       1d... 346277980us : <stack trace>
[  352.767745]  => trace_hardirqs_on_caller
[  352.767745]  => trace_hardirqs_on_thunk
[  352.767745]  => entry_SYSCALL_64_after_hwframe
[  352.769831]  systemd-1       1d... 346277981us : <user stack trace>
[  352.769831]  =>  <00007f785ae26289>
[  352.770989]  systemd-1       1d... 346277982us : irq_disable: caller=trace_hardirqs_off_thunk+0x1a/0x1c parent=error_entry+0x80/0x100
[  352.772408]  systemd-1       1d... 346277983us : <stack trace>
[  352.772408]  => trace_hardirqs_off_caller
[  352.772408]  => trace_hardirqs_off_thunk
[  352.772408]  => error_entry
[  352.774334]  systemd-1       1d... 346277983us : <user stack trace>
[  352.774334]  =>  <00005614ef9dde48>ter_hwframe
[  352.775505]  systemd-1       1d... 346277984us : page_fault_user: address=0x7ffd52fd0038 ip=0x5614ef9dde48 error_code=0x7
[  352.776811]  systemd-1       1d... 346277984us : <stack trace>
-UU-52.776811]  => do_page_fault
    52.776811]  => do_async_page_fault

....

[  353.078313]  =>  <00005614ef9dde48>
[  353.079486]  systemd-1       1d... 346278040us : irq_disable: caller=trace_hardirqs_off_thunk+0x1a/0x1c parent=error_entry+0x80/0x100
[  353.080952]  systemd-1       1d... 346278041us : <stack trace>
[  353.080952]  => trace_hardirqs_off_caller
[  353.080952]  => trace_hardirqs_off_thunk6278021us : <user stack trace>
[  353.080952]  => error_entry
[  353.082890]  systemd-1       1d... 346278041us : <user stack trace>rcu_irq_exit_irqson+0x2b/0x30 parent=trace_preempt_off+0xa1/0xd0
[  353.082890]  =>  <00007f785ab9ba85>
[  353.084059]  systemd-1       1d... 346278042us : page_fault_user: address=0x495 ip=0x7f785ab9ba85 error_code=0x7p_page_copy+0x344/0x790
[  353.085277]  systemd-1       1d... 346278042us : <stack trace>
[  353.085277]  => do1       1...1 346278034us : <user 
[  353.085277]  => do_async_page_fault
[  353.085277]  => async_page_fault
[  353.087114]  systemd-1       1d... 346278043us : <user stack trace>
[  353.087114]  =>  <00007f785ab9ba85>
[  353.088391]  systemd-1       1d... 346278043us : irq_enable: caller=__do_page_fault+0x2a7/0x4b0 parent=do_page_fault+0x28/0xf0
[  353.089761]  systemd-1       1d... 346278044us : <stack trace>

That last #PF kills it. What's weird is the PF address 0x495 which is the
return value of sys_clone() above. Might be coincidence, but I don't think
so.

Haven't had time to dig deeper.

Thanks,

	tglx



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 14:14   ` Thomas Gleixner
@ 2019-07-02 15:33     ` Steven Rostedt
  2019-07-02 17:39       ` Steven Rostedt
  0 siblings, 1 reply; 16+ messages in thread
From: Steven Rostedt @ 2019-07-02 15:33 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf

On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
Thomas Gleixner <tglx@linutronix.de> wrote:

> On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> 
> > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:  
> > > Put the boundary check before it accesses user space to prevent unnecessary
> > > access which might crash the machine.
> > > 
> > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > option can trigger SEGV in pid 1 which leads to panic.  

Note, I'm only able to trigger this crash with the irq_disable event.
The irq_enable and preempt_disable/enable events work just fine. This
leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
trampoline) may have some issues and is probably the place to look at.

-- Steve

> 
> It triggers segfaults in random user processes which is bad enough.
> 
> And even with that 'fix' applied I can see random segfaults just less
> frequent.
> 
> > >   RIP: 0033:0x55be7ad1c89f
> > >   Code: Bad RIP value.  
> > 
> > ^^^ that's weird, no amount of unwinding should affect regs->ip.  
> 
>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 15:33     ` Steven Rostedt
@ 2019-07-02 17:39       ` Steven Rostedt
  2019-07-02 17:47         ` Steven Rostedt
                           ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Steven Rostedt @ 2019-07-02 17:39 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf, Joel Fernandes

On Tue, 2 Jul 2019 11:33:55 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
> Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> >   
> > > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:    
> > > > Put the boundary check before it accesses user space to prevent unnecessary
> > > > access which might crash the machine.
> > > > 
> > > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > > option can trigger SEGV in pid 1 which leads to panic.    
> 
> Note, I'm only able to trigger this crash with the irq_disable event.
> The irq_enable and preempt_disable/enable events work just fine. This
> leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
> trampoline) may have some issues and is probably the place to look at.

I figured it out.

It's another "corruption of the cr2" register issue. The following
patch makes the issue go away. I'm not suggesting that we use this
patch, but it shows where the bug lies.

IIRC, there was patches posted before that fixed this issue. I'll go
look to see if I can dig them up. Was it Joel that sent them?

-- Steve

diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index be36bf4e0957..dd79256badb2 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -40,7 +40,7 @@
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
-	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
+	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller_cr2,1
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6aae46..b42ca3fc569d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1555,3 +1555,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_page_fault);
+
+void trace_hardirqs_off_caller(unsigned long addr);
+
+void notrace trace_hardirqs_off_caller_cr2(unsigned long addr)
+{
+	unsigned long address = read_cr2(); /* Get the faulting address */
+
+	trace_hardirqs_off_caller(addr);
+	write_cr2(address);
+}


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 17:39       ` Steven Rostedt
@ 2019-07-02 17:47         ` Steven Rostedt
  2019-07-02 20:18         ` Peter Zijlstra
  2019-07-19 20:28         ` Sean Christopherson
  2 siblings, 0 replies; 16+ messages in thread
From: Steven Rostedt @ 2019-07-02 17:47 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf, Joel Fernandes

On Tue, 2 Jul 2019 13:39:05 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 2 Jul 2019 11:33:55 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
> > Thomas Gleixner <tglx@linutronix.de> wrote:
> >   
> > > On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> > >     
> > > > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:      
> > > > > Put the boundary check before it accesses user space to prevent unnecessary
> > > > > access which might crash the machine.
> > > > > 
> > > > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > > > option can trigger SEGV in pid 1 which leads to panic.      
> > 
> > Note, I'm only able to trigger this crash with the irq_disable event.
> > The irq_enable and preempt_disable/enable events work just fine. This
> > leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
> > trampoline) may have some issues and is probably the place to look at.  
> 
> I figured it out.
> 
> It's another "corruption of the cr2" register issue. The following
> patch makes the issue go away. I'm not suggesting that we use this
> patch, but it shows where the bug lies.
> 
> IIRC, there was patches posted before that fixed this issue. I'll go
> look to see if I can dig them up. Was it Joel that sent them?

Although with this patch, I just triggered this:

[ 1331.706273] WARNING: can't dereference registers at 00000000cb0cab6d for ip interrupt_entry+0xc1/0xf0
[ 1439.850235] ------------[ cut here ]------------
[ 1439.854848] General protection fault in user access. Non-canonical address?
[ 1439.854861] WARNING: CPU: 6 PID: 1620 at arch/x86/mm/extable.c:125 ex_handler_uaccess+0x62/0x70
[ 1439.870455] Modules linked in: ip6t_REJECT nf_reject_ipv6 xt_state nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6table_filter ip6_tables snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel i915 snd_hda_codec hp_wmi snd_hwdep wmi_bmof sparse_keymap snd_hda_core rfkill i2c_algo_bit snd_seq iosf_mbi snd_seq_device snd_pcm drm_kms_helper x86_pkg_temp_thermal syscopyarea sysfillrect coretemp sysimgblt snd_timer fb_sys_fops snd drm crc32c_intel tpm_infineon soundcore e1000e ptp tpm_tis pps_core lpc_ich i2c_i801 mfd_core i2c_core tpm_tis_core tpm pcspkr serio_raw wmi video
[ 1439.922501] CPU: 6 PID: 1620 Comm: dhclient Not tainted 5.2.0-rc1-test+ #8
[ 1439.929350] Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
[ 1439.938277] RIP: 0010:ex_handler_uaccess+0x62/0x70
[ 1439.943055] Code: 80 00 00 00 b8 01 00 00 00 5b 5d 41 5c c3 80 3d 09 f3 ed 01 00 75 c5 48 c7 c7 80 bc 24 82 c6 05 f9 f2 ed 01 01 e8 ea ce 06 00 <0f> 0b eb ae 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 80 3d d9
[ 1439.961724] RSP: 0018:ffff8880c7fff4f0 EFLAGS: 00010082
[ 1439.966934] RAX: 0000000000000000 RBX: ffffffff820024b8 RCX: 0000000000000000
[ 1439.974045] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffffffff83e12fc0
[ 1439.981156] RBP: ffff8880c7fff578 R08: fffffbfff0561631 R09: fffffbfff0561630
[ 1439.988265] R10: fffffbfff0561630 R11: ffffffff82b0b183 R12: ffffffff820024b8
[ 1439.995368] R13: 000000000000000d R14: 0000000000000000 R15: 0000000000000000
[ 1440.002479] FS:  00007fdc180a2e80(0000) GS:ffff8880d4780000(0000) knlGS:0000000000000000
[ 1440.010537] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1440.016268] CR2: 0000000000001010 CR3: 00000000adad0001 CR4: 00000000001606e0
[ 1440.023379] Call Trace:
[ 1440.025827]  ? ex_handler_refcount+0xb0/0xb0
[ 1440.030089]  fixup_exception+0x60/0x7b
[ 1440.033837]  do_general_protection+0x68/0x1f0
[ 1440.038189]  general_protection+0x1e/0x30
[ 1440.042193] RIP: 0010:arch_stack_walk_user+0x6c/0x110
[ 1440.047231] Code: d8 22 00 00 48 83 e8 10 49 39 c7 77 44 4d 89 fd 31 db 65 48 8b 04 25 80 ee 01 00 83 80 e8 21 00 00 01 0f 1f 00 0f ae e8 89 d8 <4d> 8b 75 00 85 c0 0f 85 82 00 00 00 49 8b 75 08 0f 1f 00 85 c0 74
[ 1440.065903] RSP: 0018:ffff8880c7fff620 EFLAGS: 00010002
[ 1440.071117] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff811c5258
[ 1440.078226] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8880ce9c99c4
[ 1440.085341] RBP: ffffffff811c5200 R08: 1ffff11019d39338 R09: ffffed1019bf2924
[ 1440.092449] R10: ffffed1019bf2924 R11: ffff8880cdf94927 R12: ffff8880c7ffff58
[ 1440.099562] R13: 62696c2f7273752f R14: 62696c2f7273752f R15: 62696c2f7273752f
[ 1440.106679]  ? profile_setup.cold.11+0xa1/0xa1
[ 1440.111115]  ? stack_trace_consume_entry+0x58/0x80
[ 1440.115914]  stack_trace_save_user+0xb6/0xe8
[ 1440.120180]  ? stack_trace_save_tsk_reliable+0x1c0/0x1c0
[ 1440.125489]  trace_buffer_unlock_commit_regs+0x286/0x3f0
[ 1440.130791]  trace_event_buffer_commit+0xd0/0x300
[ 1440.135482]  ? trace_event_buffer_reserve+0xc6/0xf0
[ 1440.140353]  ? 0xffffffff81000000
[ 1440.143662]  trace_event_raw_event_preemptirq_template+0xe1/0x150
[ 1440.149743]  ? perf_trace_preemptirq_template+0x230/0x230
[ 1440.155135]  ? rcu_dynticks_curr_cpu_in_eqs+0x46/0x60
[ 1440.160174]  ? perf_trace_preemptirq_template+0x230/0x230
[ 1440.165558]  ? ktime_get_coarse_real_ts64+0x7f/0xf0
[ 1440.170428]  trace_hardirqs_off+0xbb/0x100
[ 1440.174519]  ktime_get_coarse_real_ts64+0x7f/0xf0
[ 1440.179217]  current_time+0x68/0xf0
[ 1440.182706]  ? timespec64_trunc+0x110/0x110
[ 1440.186902]  ? iov_iter_zero+0x7e0/0x7e0
[ 1440.190822]  ? preempt_count_sub+0xb0/0x100
[ 1440.194995]  ? match_held_lock+0x1b/0x230
[ 1440.199004]  atime_needs_update+0xdf/0x1b0
[ 1440.203093]  touch_atime+0x91/0x170
[ 1440.206576]  ? atime_needs_update+0x1b0/0x1b0
[ 1440.210924]  ? copy_page_to_iter+0x31d/0x560
[ 1440.215179]  ? pagecache_get_page+0x2f/0x3b0
[ 1440.219447]  generic_file_read_iter+0xe00/0x1110
[ 1440.224073]  ? arch_stack_walk+0x92/0xe0
[ 1440.227996]  ? filemap_write_and_wait_range+0x80/0x80
[ 1440.233034]  ? xattr_resolve_name+0x107/0x180
[ 1440.237385]  ? __vfs_getxattr+0xb2/0x100
[ 1440.241304]  ? __vfs_setxattr+0x110/0x110
[ 1440.245319]  new_sync_read+0x24f/0x370
[ 1440.249067]  ? __ia32_sys_llseek+0x1d0/0x1d0
[ 1440.253332]  ? fsnotify+0x690/0x6d0
[ 1440.256835]  ? __fsnotify_update_child_dentry_flags.part.4+0x170/0x170
[ 1440.263389]  vfs_read+0xa6/0x1a0
[ 1440.266631]  kernel_read+0x69/0xa0
[ 1440.270048]  prepare_binprm+0x258/0x2c0
[ 1440.273894]  ? install_exec_creds+0xb0/0xb0
[ 1440.278081]  __do_execve_file.isra.42+0x990/0xfc0
[ 1440.282813]  ? copy_strings_kernel+0x90/0x90
[ 1440.287089]  ? strncpy_from_user+0xd6/0x1f0
[ 1440.291281]  __x64_sys_execve+0x54/0x60
[ 1440.295120]  do_syscall_64+0x68/0x190
[ 1440.298780]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1440.303815] RIP: 0033:0x7fdc18501b0b
[ 1440.307392] Code: 41 89 01 eb da 66 2e 0f 1f 84 00 00 00 00 00 f7 d8 64 41 89 01 eb d6 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4d 63 0f 00 f7 d8 64 89 01 48
[ 1440.326068] RSP: 002b:00007fff5c5819f8 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
[ 1440.333610] RAX: ffffffffffffffda RBX: 0000563a5d884f60 RCX: 00007fdc18501b0b
[ 1440.340724] RDX: 0000563a5d88d4a0 RSI: 00007fff5c581a10 RDI: 00007fff5c584e89
[ 1440.347825] RBP: 00007fff5c584e89 R08: 0000563a5d832290 R09: 0000000000000001
[ 1440.354931] R10: 00007fdc180a2e80 R11: 0000000000000202 R12: 0000563a5d88d4a0
[ 1440.362042] R13: 0000000000000000 R14: 0000563a5bb4fbe0 R15: 0000000000000136
[ 1440.369166] ---[ end trace e37d5da069ab7362 ]---

-- Steve

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 17:39       ` Steven Rostedt
  2019-07-02 17:47         ` Steven Rostedt
@ 2019-07-02 20:18         ` Peter Zijlstra
  2019-07-02 20:33           ` Steven Rostedt
  2019-07-02 22:02           ` Peter Zijlstra
  2019-07-19 20:28         ` Sean Christopherson
  2 siblings, 2 replies; 16+ messages in thread
From: Peter Zijlstra @ 2019-07-02 20:18 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Thomas Gleixner, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf, Joel Fernandes

On Tue, Jul 02, 2019 at 01:39:05PM -0400, Steven Rostedt wrote:
> On Tue, 2 Jul 2019 11:33:55 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
> > Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> > >   
> > > > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:    
> > > > > Put the boundary check before it accesses user space to prevent unnecessary
> > > > > access which might crash the machine.
> > > > > 
> > > > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > > > option can trigger SEGV in pid 1 which leads to panic.    
> > 
> > Note, I'm only able to trigger this crash with the irq_disable event.
> > The irq_enable and preempt_disable/enable events work just fine. This
> > leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
> > trampoline) may have some issues and is probably the place to look at.
> 
> I figured it out.
> 
> It's another "corruption of the cr2" register issue. The following

Arrggghhh..

> patch makes the issue go away. I'm not suggesting that we use this
> patch, but it shows where the bug lies.
> 
> IIRC, there was patches posted before that fixed this issue. I'll go
> look to see if I can dig them up. Was it Joel that sent them?

https://lkml.kernel.org/r/20190320221534.165ab87b@oasis.local.home

I think; lemme re-read that thread.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 20:18         ` Peter Zijlstra
@ 2019-07-02 20:33           ` Steven Rostedt
  2019-07-02 22:02           ` Peter Zijlstra
  1 sibling, 0 replies; 16+ messages in thread
From: Steven Rostedt @ 2019-07-02 20:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf, Joel Fernandes

On Tue, 2 Jul 2019 22:18:27 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jul 02, 2019 at 01:39:05PM -0400, Steven Rostedt wrote:
> > On Tue, 2 Jul 2019 11:33:55 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >   
> > > On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
> > > Thomas Gleixner <tglx@linutronix.de> wrote:
> > >   
> > > > On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> > > >     
> > > > > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:      
> > > > > > Put the boundary check before it accesses user space to prevent unnecessary
> > > > > > access which might crash the machine.
> > > > > > 
> > > > > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > > > > option can trigger SEGV in pid 1 which leads to panic.      
> > > 
> > > Note, I'm only able to trigger this crash with the irq_disable event.
> > > The irq_enable and preempt_disable/enable events work just fine. This
> > > leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
> > > trampoline) may have some issues and is probably the place to look at.  
> > 
> > I figured it out.
> > 
> > It's another "corruption of the cr2" register issue. The following  
> 
> Arrggghhh..
> 
> > patch makes the issue go away. I'm not suggesting that we use this
> > patch, but it shows where the bug lies.
> > 
> > IIRC, there was patches posted before that fixed this issue. I'll go
> > look to see if I can dig them up. Was it Joel that sent them?  
> 
> https://lkml.kernel.org/r/20190320221534.165ab87b@oasis.local.home

Oh, I wrote the patches. No wonder I couldn't find them in my local
"patchwork". It doesn't include patches I write. But still, I
completely forgot. Better send me to the nursing home :-p

> 
> I think; lemme re-read that thread.

I'll need to do that too.

-- Steve

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 20:18         ` Peter Zijlstra
  2019-07-02 20:33           ` Steven Rostedt
@ 2019-07-02 22:02           ` Peter Zijlstra
  1 sibling, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2019-07-02 22:02 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Thomas Gleixner, Eiichi Tsukata, edwintorok, mingo, bp, hpa, x86,
	linux-kernel, Josh Poimboeuf, Joel Fernandes

On Tue, Jul 02, 2019 at 10:18:27PM +0200, Peter Zijlstra wrote:

> I think; lemme re-read that thread.

*completely* untested, not even been near a compiler yet.

but it now includes 32bit support and should be more or less complete.

I removed the most horrendous (rbx control flow) hacks from idtentry,
although none of this is winning any prices. Cleaning up the idtentry is
for later, otherwise we'll get side-tracked into that again and leave
this bug lingering for a long long while still.

XXX write proper changelog, after testing.

---
 arch/x86/entry/entry_32.S             |   60 +++++++++++++++++++---------------
 arch/x86/entry/entry_64.S             |   28 +++++++--------
 arch/x86/include/asm/kvm_para.h       |    2 -
 arch/x86/include/asm/paravirt.h       |   22 +++++++-----
 arch/x86/include/asm/paravirt_types.h |    2 -
 arch/x86/include/asm/traps.h          |    2 -
 arch/x86/kernel/asm-offsets.c         |    1 
 arch/x86/kernel/head_64.S             |    4 --
 arch/x86/kernel/kvm.c                 |    8 ++--
 arch/x86/kernel/paravirt.c            |    2 -
 arch/x86/mm/fault.c                   |    3 -
 arch/x86/xen/enlighten_pv.c           |    3 +
 arch/x86/xen/mmu_pv.c                 |   12 ------
 arch/x86/xen/xen-asm.S                |   25 ++++++++++++++
 arch/x86/xen/xen-ops.h                |    3 +
 15 files changed, 103 insertions(+), 74 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -294,9 +294,11 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
 	cld
+.if \skip_gs == 0
 	PUSH_GS
+.endif
 	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
@@ -313,13 +315,13 @@
 	movl	%edx, %es
 	movl	$(__KERNEL_PERCPU), %edx
 	movl	%edx, %fs
+.if \skip_gs == 0
 	SET_KERNEL_GS %edx
-
+.endif
 	/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
 	SWITCH_TO_KERNEL_STACK
 .endif
-
 .endm
 
 .macro SAVE_ALL_NMI cr3_reg:req
@@ -1441,39 +1443,45 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
 
 ENTRY(page_fault)
 	ASM_CLAC
-	pushl	$do_page_fault
-	ALIGN
-	jmp common_exception
+	pushl	$0; /* %gs's slot on the stack */
+
+	SAVE_ALL switch_stacks=1 skip_gs=1
+
+	ENCODE_FRAME_POINTER
+	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
+	GS_TO_REG %ecx
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+
+	GET_CR2_INTO(%ecx)			# might clobber %eax
+
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
+	TRACE_IRQS_OFF
+	movl	%esp, %eax			# pt_regs pointer
+	call	$do_page_fault
+	jmp	ret_from_exception
 END(page_fault)
 
 common_exception:
 	/* the function address is in %gs's slot on the stack */
-	FIXUP_FRAME
-	pushl	%fs
-	pushl	%es
-	pushl	%ds
-	pushl	%eax
-	movl	$(__USER_DS), %eax
-	movl	%eax, %ds
-	movl	%eax, %es
-	movl	$(__KERNEL_PERCPU), %eax
-	movl	%eax, %fs
-	pushl	%ebp
-	pushl	%edi
-	pushl	%esi
-	pushl	%edx
-	pushl	%ecx
-	pushl	%ebx
-	SWITCH_TO_KERNEL_STACK
+	SAVE_ALL switch_stacks=1 skip_gs=1
 	ENCODE_FRAME_POINTER
-	cld
 	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
 	GS_TO_REG %ecx
 	movl	PT_GS(%esp), %edi		# get the function address
-	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
-	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
 	REG_TO_PTGS %ecx
 	SET_KERNEL_GS %ecx
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
 	CALL_NOSPEC %edi
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -937,18 +937,27 @@ ENTRY(\sym)
 
 	.if \paranoid
 	call	paranoid_entry
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 	.else
 	call	error_entry
 	.endif
 	UNWIND_HINT_REGS
-	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-	.if \paranoid
+	.if \read_cr2
+	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	.endif
+
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
+
+	.if \paranoid == 0
+	testb	$3, CS(%rsp)
+	jz	.Lfrom_kernel_no_context_tracking_\@
+	CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
 	.endif
 
 	movq	%rsp, %rdi			/* pt_regs pointer */
@@ -1180,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1338,18 +1347,9 @@ ENTRY(error_entry)
 	movq	%rax, %rsp			/* switch stack */
 	ENCODE_FRAME_POINTER
 	pushq	%r12
-
-	/*
-	 * We need to tell lockdep that IRQs are off.  We can't do this until
-	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).
-	 */
-	TRACE_IRQS_OFF
-	CALL_enter_from_user_mode
 	ret
 
 .Lerror_entry_done:
-	TRACE_IRQS_OFF
 	ret
 
 	/*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, i
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -116,7 +116,7 @@ static inline void write_cr0(unsigned lo
 
 static inline unsigned long read_cr2(void)
 {
-	return PVOP_CALL0(unsigned long, mmu.read_cr2);
+	return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
 }
 
 static inline void write_cr2(unsigned long x)
@@ -909,13 +909,7 @@ extern void default_banner(void);
 		  ANNOTATE_RETPOLINE_SAFE;				\
 		  call PARA_INDIRECT(pv_ops+PV_CPU_swapgs);		\
 		 )
-#endif
-
-#define GET_CR2_INTO_RAX				\
-	ANNOTATE_RETPOLINE_SAFE;				\
-	call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
 
-#ifdef CONFIG_PARAVIRT_XXL
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),			\
 		  ANNOTATE_RETPOLINE_SAFE;				\
@@ -929,9 +923,19 @@ extern void default_banner(void);
 		  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);	    \
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
-#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif	/* CONFIG_X86_64 */
+
+#ifdef CONFIG_PARAVIRT_XXL
+
+#define GET_CR2_INTO_AX							\
+	PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),				\
+		  ANNOTATE_RETPOLINE_SAFE;				\
+		  call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);		\
+		 )
+
+#endif /* CONFIG_PARAVIRT_XXL */
 
-#endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #else  /* CONFIG_PARAVIRT */
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -220,7 +220,7 @@ struct pv_mmu_ops {
 	void (*exit_mmap)(struct mm_struct *mm);
 
 #ifdef CONFIG_PARAVIRT_XXL
-	unsigned long (*read_cr2)(void);
+	struct paravirt_callee_save read_cr2;
 	void (*write_cr2)(unsigned long);
 
 	unsigned long (*read_cr3)(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -81,7 +81,7 @@ struct bad_iret_stack *fixup_bad_iret(st
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -77,6 +77,7 @@ static void __used common(void)
 	BLANK();
 	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
 	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+	OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
 #endif
 
 	BLANK();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,9 +29,7 @@
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
-#define GET_CR2_INTO(reg) movq %cr2, reg
 #define INTERRUPT_RETURN iretq
 #endif
 
@@ -323,7 +321,7 @@ END(early_idt_handler_array)
 
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
-	GET_CR2_INTO(%rdi)	/* Can clobber any volatile register if pv */
+	GET_CR2_INTO(%rdi)	/* can clobber %rax if pv */
 	call early_make_pgtable
 	andl %eax,%eax
 	jz 20f			/* All good */
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
 	enum ctx_state prev_state;
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)read_cr2());
+		kvm_async_pf_task_wake((u32)address);
 		rcu_irq_exit();
 		break;
 	}
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops =
 	.mmu.exit_mmap		= paravirt_nop,
 
 #ifdef CONFIG_PARAVIRT_XXL
-	.mmu.read_cr2		= native_read_cr2,
+	.mmu.read_cr2		= __PV_IS_CALLEE_SAVE(native_read_cr2),
 	.mmu.write_cr2		= native_write_cr2,
 	.mmu.read_cr3		= __native_read_cr3,
 	.mmu.write_cr3		= native_write_cr3,
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1548,9 +1548,8 @@ trace_page_fault_entries(unsigned long a
  * exception_{enter,exit}() contains all sorts of tracepoints.
  */
 dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placemen
 			__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
 		pv_ops.irq.irq_enable =
 			__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
-		pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
+		pv_ops.mmu.read_cr2 =
+			__PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
 	}
 }
 
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long
 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
-static unsigned long xen_read_cr2(void)
-{
-	return this_cpu_read(xen_vcpu)->arch.cr2;
-}
-
-unsigned long xen_read_cr2_direct(void)
-{
-	return this_cpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static noinline void xen_flush_tlb(void)
 {
 	struct mmuext_op *op;
@@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void)
 }
 
 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
-	.read_cr2 = xen_read_cr2,
+	.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
 	.write_cr2 = xen_write_cr2,
 
 	.read_cr3 = xen_read_cr3,
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -135,3 +135,28 @@ ENTRY(check_events)
 	FRAME_END
 	ret
 ENDPROC(check_events)
+
+ENTRY(xen_read_cr2)
+	FRAME_BEGIN
+#ifdef CONFIG_X86_64
+	movq	PER_CPU_VAR(xen_vcpu), %rax
+	movq	XEN_vcpu_info_arch_cr2(%rax), %rax
+#else
+	movl	PER_CPU_VAR(xen_vcpu), %eax
+	movl	XEN_vcpu_info_arch_cr2(%eax), %eax
+#endif
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2);
+
+ENTRY(xen_read_cr2_direct)
+	FRAME_BEGIN
+#ifdef CONFIG_X86_64
+	movq	PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %rax
+#else
+	movl	PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %eax
+#endif
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2_direct);
+
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(vo
 __visible unsigned long xen_save_fl_direct(void);
 __visible void xen_restore_fl_direct(unsigned long);
 
+__visible unsigned long xen_read_cr2(void);
+__visible unsigned long xen_read_cr2_direct(void);
+
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 __visible void xen_sysret32(void);


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-02 17:39       ` Steven Rostedt
  2019-07-02 17:47         ` Steven Rostedt
  2019-07-02 20:18         ` Peter Zijlstra
@ 2019-07-19 20:28         ` Sean Christopherson
  2019-07-19 22:23           ` Thomas Gleixner
  2 siblings, 1 reply; 16+ messages in thread
From: Sean Christopherson @ 2019-07-19 20:28 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Thomas Gleixner, Peter Zijlstra, Eiichi Tsukata, edwintorok,
	mingo, bp, hpa, x86, linux-kernel, Josh Poimboeuf,
	Joel Fernandes

On Tue, Jul 02, 2019 at 01:39:05PM -0400, Steven Rostedt wrote:
> On Tue, 2 Jul 2019 11:33:55 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Tue, 2 Jul 2019 16:14:05 +0200 (CEST)
> > Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > On Tue, 2 Jul 2019, Peter Zijlstra wrote:
> > >   
> > > > On Tue, Jul 02, 2019 at 02:31:51PM +0900, Eiichi Tsukata wrote:    
> > > > > Put the boundary check before it accesses user space to prevent unnecessary
> > > > > access which might crash the machine.
> > > > > 
> > > > > Especially, ftrace preemptirq/irq_disable event with user stack trace
> > > > > option can trigger SEGV in pid 1 which leads to panic.    
> > 
> > Note, I'm only able to trigger this crash with the irq_disable event.
> > The irq_enable and preempt_disable/enable events work just fine. This
> > leads me to believe that the TRACE_IRQS_OFF macro (which uses a thunk
> > trampoline) may have some issues and is probably the place to look at.
> 
> I figured it out.
> 
> It's another "corruption of the cr2" register issue. The following
> patch makes the issue go away. I'm not suggesting that we use this
> patch, but it shows where the bug lies.
> 
> IIRC, there was patches posted before that fixed this issue. I'll go
> look to see if I can dig them up. Was it Joel that sent them?
> 
> -- Steve
> 
> diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
> index be36bf4e0957..dd79256badb2 100644
> --- a/arch/x86/entry/thunk_64.S
> +++ b/arch/x86/entry/thunk_64.S
> @@ -40,7 +40,7 @@
>  
>  #ifdef CONFIG_TRACE_IRQFLAGS
>  	THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
> -	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
> +	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller_cr2,1
>  #endif
>  
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 46df4c6aae46..b42ca3fc569d 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1555,3 +1555,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  	exception_exit(prev_state);
>  }
>  NOKPROBE_SYMBOL(do_page_fault);
> +
> +void trace_hardirqs_off_caller(unsigned long addr);
> +
> +void notrace trace_hardirqs_off_caller_cr2(unsigned long addr)
> +{
> +	unsigned long address = read_cr2(); /* Get the faulting address */
> +
> +	trace_hardirqs_off_caller(addr);
> +	write_cr2(address);
> +}

I'm hitting a similar panic that bisects to commit

  a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")

except I'm experiencing death immediately after starting init.

Through sheer dumb luck, I tracked (pun intended) this down to forcing
context tracking:

  CONFIG_CONTEXT_TRACKING=y
  CONFIG_CONTEXT_TRACKING_FORCE=y
  CONFIG_VIRT_CPU_ACCOUNTING_GEN=y

I haven't attempted to debug further and I'll be offline for most of the
next few days.  Hopefully this is enough to root cause the badness.

[    0.680477] Run /sbin/init as init process
[    0.682116] init[1]: segfault at 2926a7ef ip 00007f98a49d9c30 sp 00007fffd83e6af0 error 14 in ld-2.23.so[7f98a49d9000+26000]
[    0.683427] Code: Bad RIP value.
[    0.683844] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[    0.684710] CPU: 0 PID: 1 Comm: init Not tainted 5.2.0+ #18
[    0.685352] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[    0.686239] Call Trace:
[    0.686542]  dump_stack+0x46/0x5b
[    0.686915]  panic+0xf8/0x2d2
[    0.687252]  do_exit+0xb68/0xb70
[    0.687616]  do_group_exit+0x3a/0xa0
[    0.688088]  get_signal+0x184/0x880
[    0.688483]  do_signal+0x30/0x690
[    0.688857]  ? signal_wake_up_state+0x15/0x30
[    0.689433]  ? __send_signal+0x139/0x380
[    0.689908]  exit_to_usermode_loop+0x6a/0xc0
[    0.690397]  ? async_page_fault+0x8/0x40
[    0.690850]  prepare_exit_to_usermode+0x78/0xb0
[    0.691355]  retint_user+0x8/0x8
[    0.691722] RIP: 0033:0x7f98a49d9c30
[    0.692122] Code: Bad RIP value.
[    0.692484] RSP: 002b:00007fffd83e6af0 EFLAGS: 00010202
[    0.693060] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[    0.693907] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[    0.694739] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[    0.695538] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[    0.696322] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[    0.697195] Kernel Offset: disabled
[    0.697600] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-19 20:28         ` Sean Christopherson
@ 2019-07-19 22:23           ` Thomas Gleixner
  2019-07-19 23:01             ` Thomas Gleixner
  2019-07-20  8:56             ` [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value Thomas Gleixner
  0 siblings, 2 replies; 16+ messages in thread
From: Thomas Gleixner @ 2019-07-19 22:23 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Steven Rostedt, Peter Zijlstra, Eiichi Tsukata, edwintorok,
	mingo, bp, hpa, x86, linux-kernel, Josh Poimboeuf,
	Joel Fernandes

On Fri, 19 Jul 2019, Sean Christopherson wrote:
> On Tue, Jul 02, 2019 at 01:39:05PM -0400, Steven Rostedt wrote:
> 
> I'm hitting a similar panic that bisects to commit
> 
>   a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")
> 
> except I'm experiencing death immediately after starting init.
> 
> Through sheer dumb luck, I tracked (pun intended) this down to forcing
> context tracking:
> 
>   CONFIG_CONTEXT_TRACKING=y
>   CONFIG_CONTEXT_TRACKING_FORCE=y
>   CONFIG_VIRT_CPU_ACCOUNTING_GEN=y
> 
> I haven't attempted to debug further and I'll be offline for most of the
> next few days.  Hopefully this is enough to root cause the badness.
> 
> [    0.680477] Run /sbin/init as init process
> [    0.682116] init[1]: segfault at 2926a7ef ip 00007f98a49d9c30 sp 00007fffd83e6af0 error 14 in ld-2.23.so[7f98a49d9000+26000]

That's because the call into the context tracking muck clobbers RDX which
contains the CR2 value on pagefault. So the pagefault resolves to crap and
kills init.

Brute force fix below. That needs to be conditional on read_cr2 but for now
it does the job.

Thanks,

	tglx

8<------------
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -887,7 +887,9 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	.if \paranoid == 0
 	testb	$3, CS(%rsp)
 	jz	.Lfrom_kernel_no_context_tracking_\@
+	pushq	%rdx
 	CALL_enter_from_user_mode
+	popq	%rdx
 .Lfrom_kernel_no_context_tracking_\@:
 	.endif
 




^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-19 22:23           ` Thomas Gleixner
@ 2019-07-19 23:01             ` Thomas Gleixner
  2019-07-20  8:44               ` Thomas Gleixner
  2019-07-20  8:56             ` [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value Thomas Gleixner
  1 sibling, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2019-07-19 23:01 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Steven Rostedt, Peter Zijlstra, Eiichi Tsukata, edwintorok,
	mingo, bp, hpa, x86, linux-kernel, Josh Poimboeuf,
	Joel Fernandes

On Sat, 20 Jul 2019, Thomas Gleixner wrote:

> On Fri, 19 Jul 2019, Sean Christopherson wrote:
> > On Tue, Jul 02, 2019 at 01:39:05PM -0400, Steven Rostedt wrote:
> > 
> > I'm hitting a similar panic that bisects to commit
> > 
> >   a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")
> > 
> > except I'm experiencing death immediately after starting init.
> > 
> > Through sheer dumb luck, I tracked (pun intended) this down to forcing
> > context tracking:
> > 
> >   CONFIG_CONTEXT_TRACKING=y
> >   CONFIG_CONTEXT_TRACKING_FORCE=y
> >   CONFIG_VIRT_CPU_ACCOUNTING_GEN=y
> > 
> > I haven't attempted to debug further and I'll be offline for most of the
> > next few days.  Hopefully this is enough to root cause the badness.
> > 
> > [    0.680477] Run /sbin/init as init process
> > [    0.682116] init[1]: segfault at 2926a7ef ip 00007f98a49d9c30 sp 00007fffd83e6af0 error 14 in ld-2.23.so[7f98a49d9000+26000]
> 
> That's because the call into the context tracking muck clobbers RDX which
> contains the CR2 value on pagefault. So the pagefault resolves to crap and
> kills init.
> 
> Brute force fix below. That needs to be conditional on read_cr2 but for now
> it does the job.

But it does it just for the context tracking case. TRACE_IRQS_OFF* will do
the same damage.

Fix is not pretty, but ...

Thanks,

	tglx

8<-----------
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -876,6 +876,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 
 	.if \read_cr2
 	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	pushq	%rdx
 	.endif
 
 	.if \shift_ist != -1
@@ -885,12 +886,20 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	.endif
 
 	.if \paranoid == 0
+	.if \read_cr2
+	testb	$3, CS + 8(%rsp)
+	.else
 	testb	$3, CS(%rsp)
+	.endif
 	jz	.Lfrom_kernel_no_context_tracking_\@
 	CALL_enter_from_user_mode
 .Lfrom_kernel_no_context_tracking_\@:
 	.endif
 
+	.if \read_cr2
+	popq	%rdx
+	.endif
+
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/stacktrace: Do not access user space memory unnecessarily
  2019-07-19 23:01             ` Thomas Gleixner
@ 2019-07-20  8:44               ` Thomas Gleixner
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Gleixner @ 2019-07-20  8:44 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Steven Rostedt, Peter Zijlstra, Eiichi Tsukata, edwintorok,
	mingo, bp, hpa, x86, linux-kernel, Josh Poimboeuf,
	Joel Fernandes

On Sat, 20 Jul 2019, Thomas Gleixner wrote:
> On Sat, 20 Jul 2019, Thomas Gleixner wrote:
> > On Fri, 19 Jul 2019, Sean Christopherson wrote:
> > > [    0.680477] Run /sbin/init as init process
> > > [    0.682116] init[1]: segfault at 2926a7ef ip 00007f98a49d9c30 sp 00007fffd83e6af0 error 14 in ld-2.23.so[7f98a49d9000+26000]
> > 
> > That's because the call into the context tracking muck clobbers RDX which
> > contains the CR2 value on pagefault. So the pagefault resolves to crap and
> > kills init.
> > 
> > Brute force fix below. That needs to be conditional on read_cr2 but for now
> > it does the job.
> 
> But it does it just for the context tracking case. TRACE_IRQS_OFF* will do
> the same damage.

Hmm, should not becasue that calls through the thunk which preserves RDX.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value
  2019-07-19 22:23           ` Thomas Gleixner
  2019-07-19 23:01             ` Thomas Gleixner
@ 2019-07-20  8:56             ` Thomas Gleixner
  2019-07-20 11:20               ` Peter Zijlstra
  2019-07-20 12:34               ` [tip:x86/urgent] " tip-bot for Thomas Gleixner
  1 sibling, 2 replies; 16+ messages in thread
From: Thomas Gleixner @ 2019-07-20  8:56 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Steven Rostedt, Peter Zijlstra, Eiichi Tsukata, edwintorok,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin, x86, LKML,
	Josh Poimboeuf, Joel Fernandes, Andy Lutomirski, Dave Hansen

The recent fix for CR2 corruption introduced a new way to reliably corrupt
the saved CR2 value.

CR2 is saved early in the entry code in RDX, which is the third argument to
the fault handling functions. But it missed that between saving and
invoking the fault handler enter_from_user_mode() can be called. RDX is a
caller saved register so the invoked function can freely clobber it with
the obvious consequences.

The TRACE_IRQS_OFF call is safe as it calls through the thunk which
preserves RDX.

Store CR2 in R12 instead which is a callee saved register and move R12 to
RDX just before calling the fault handler.

Fixes: a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")
Reported-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/entry_64.S |   11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -875,7 +875,12 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	UNWIND_HINT_REGS
 
 	.if \read_cr2
-	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	/*
+	 * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
+	 * intermediate storage as RDX can be clobbered in enter_from_user_mode().
+	 * GET_CR2_INTO can clobber RAX.
+	 */
+	GET_CR2_INTO(%r12);
 	.endif
 
 	.if \shift_ist != -1
@@ -904,6 +909,10 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	.if \read_cr2
+	movq	%r12, %rdx			/* Move CR2 into 3rd argument */
+	.endif
+
 	call	\do_sym
 
 	.if \shift_ist != -1

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value
  2019-07-20  8:56             ` [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value Thomas Gleixner
@ 2019-07-20 11:20               ` Peter Zijlstra
  2019-07-20 12:34               ` [tip:x86/urgent] " tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2019-07-20 11:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Sean Christopherson, Steven Rostedt, Eiichi Tsukata, edwintorok,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin, x86, LKML,
	Josh Poimboeuf, Joel Fernandes, Andy Lutomirski, Dave Hansen

On Sat, Jul 20, 2019 at 10:56:41AM +0200, Thomas Gleixner wrote:
> The recent fix for CR2 corruption introduced a new way to reliably corrupt
> the saved CR2 value.
> 
> CR2 is saved early in the entry code in RDX, which is the third argument to
> the fault handling functions. But it missed that between saving and
> invoking the fault handler enter_from_user_mode() can be called. RDX is a
> caller saved register so the invoked function can freely clobber it with
> the obvious consequences.
> 
> The TRACE_IRQS_OFF call is safe as it calls through the thunk which
> preserves RDX.
> 
> Store CR2 in R12 instead which is a callee saved register and move R12 to
> RDX just before calling the fault handler.
> 
> Fixes: a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")
> Reported-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

D'0h :-( Sorry about that.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

> ---
>  arch/x86/entry/entry_64.S |   11 ++++++++++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -875,7 +875,12 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
>  	UNWIND_HINT_REGS
>  
>  	.if \read_cr2
> -	GET_CR2_INTO(%rdx);			/* can clobber %rax */
> +	/*
> +	 * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
> +	 * intermediate storage as RDX can be clobbered in enter_from_user_mode().
> +	 * GET_CR2_INTO can clobber RAX.
> +	 */
> +	GET_CR2_INTO(%r12);
>  	.endif
>  
>  	.if \shift_ist != -1
> @@ -904,6 +909,10 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
>  	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
>  	.endif
>  
> +	.if \read_cr2
> +	movq	%r12, %rdx			/* Move CR2 into 3rd argument */
> +	.endif
> +
>  	call	\do_sym
>  
>  	.if \shift_ist != -1

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [tip:x86/urgent] x86/entry/64: Prevent clobbering of saved CR2 value
  2019-07-20  8:56             ` [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value Thomas Gleixner
  2019-07-20 11:20               ` Peter Zijlstra
@ 2019-07-20 12:34               ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 16+ messages in thread
From: tip-bot for Thomas Gleixner @ 2019-07-20 12:34 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: peterz, tglx, sean.j.christopherson, linux-kernel, hpa, mingo

Commit-ID:  6879298bd0673840cadd1fb36d7225485504ceb4
Gitweb:     https://git.kernel.org/tip/6879298bd0673840cadd1fb36d7225485504ceb4
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Sat, 20 Jul 2019 10:56:41 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Sat, 20 Jul 2019 14:28:41 +0200

x86/entry/64: Prevent clobbering of saved CR2 value

The recent fix for CR2 corruption introduced a new way to reliably corrupt
the saved CR2 value.

CR2 is saved early in the entry code in RDX, which is the third argument to
the fault handling functions. But it missed that between saving and
invoking the fault handler enter_from_user_mode() can be called. RDX is a
caller saved register so the invoked function can freely clobber it with
the obvious consequences.

The TRACE_IRQS_OFF call is safe as it calls through the thunk which
preserves RDX, but TRACE_IRQS_OFF_DEBUG is not because it also calls into
C-code outside of the thunk.

Store CR2 in R12 instead which is a callee saved register and move R12 to
RDX just before calling the fault handler.

Fixes: a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption")
Reported-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907201020540.1782@nanos.tec.linutronix.de

---
 arch/x86/entry/entry_64.S | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7cb2e1f1ec09..f7c70c1bee8b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -875,7 +875,12 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 	UNWIND_HINT_REGS
 
 	.if \read_cr2
-	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	/*
+	 * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
+	 * intermediate storage as RDX can be clobbered in enter_from_user_mode().
+	 * GET_CR2_INTO can clobber RAX.
+	 */
+	GET_CR2_INTO(%r12);
 	.endif
 
 	.if \shift_ist != -1
@@ -904,6 +909,10 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	.if \read_cr2
+	movq	%r12, %rdx			/* Move CR2 into 3rd argument */
+	.endif
+
 	call	\do_sym
 
 	.if \shift_ist != -1

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2019-07-20 12:34 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-02  5:31 [PATCH] x86/stacktrace: Do not access user space memory unnecessarily Eiichi Tsukata
2019-07-02  7:28 ` Peter Zijlstra
2019-07-02 14:14   ` Thomas Gleixner
2019-07-02 15:33     ` Steven Rostedt
2019-07-02 17:39       ` Steven Rostedt
2019-07-02 17:47         ` Steven Rostedt
2019-07-02 20:18         ` Peter Zijlstra
2019-07-02 20:33           ` Steven Rostedt
2019-07-02 22:02           ` Peter Zijlstra
2019-07-19 20:28         ` Sean Christopherson
2019-07-19 22:23           ` Thomas Gleixner
2019-07-19 23:01             ` Thomas Gleixner
2019-07-20  8:44               ` Thomas Gleixner
2019-07-20  8:56             ` [PATCH] x86/entry/64: Prevent clobbering of saved CR2 value Thomas Gleixner
2019-07-20 11:20               ` Peter Zijlstra
2019-07-20 12:34               ` [tip:x86/urgent] " tip-bot for Thomas Gleixner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).