* [PATCH 0/2] x86_64: LRET to userspace @ 2014-07-23 15:34 Andy Lutomirski 2014-07-23 15:34 ` [PATCH 1/2] x86_64,entry,xen: Do not invoke espfix64 on Xen Andy Lutomirski 2014-07-23 15:34 ` [PATCH 2/2] x86_64,entry: Use lret to return to userspace when possible Andy Lutomirski 0 siblings, 2 replies; 6+ messages in thread From: Andy Lutomirski @ 2014-07-23 15:34 UTC (permalink / raw) To: H. Peter Anvin, Linus Torvalds, linux-kernel, x86, Borislav Petkov Cc: Andy Lutomirski This series is based on tip/perf/urgent. It should apply to any recent kernel, but my test case [1] will OOPS without the fix in tip/perf/urgent. The first patch or something like it is needed for 3.16, regardless of any lret hackery -- I think it's making its way toward a pull request this week, but I haven't seen it land in -tip yet. I'm repeating it here because the LRET stuff won't work on paravirt, and patch 1 puts the machinery for that in place. tl;dr Read patch 2. To test, apply patch 1 and consider basing on tip/perf/urgent. [1] sigreturn_32 from https://gitorious.org/linux-test-utils/linux-clock-tests/ Andy Lutomirski (2): x86_64,entry,xen: Do not invoke espfix64 on Xen x86_64,entry: Use lret to return to userspace when possible arch/x86/include/asm/irqflags.h | 3 +- arch/x86/include/asm/paravirt.h | 4 ++ arch/x86/include/asm/traps.h | 6 ++ arch/x86/kernel/cpu/mcheck/mce.c | 2 + arch/x86/kernel/entry_64.S | 121 ++++++++++++++++++++++++++++++------ arch/x86/kernel/nmi.c | 21 +++++++ arch/x86/kernel/paravirt_patch_64.c | 2 - 7 files changed, 136 insertions(+), 23 deletions(-) -- 1.9.3 ^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH 1/2] x86_64,entry,xen: Do not invoke espfix64 on Xen 2014-07-23 15:34 [PATCH 0/2] x86_64: LRET to userspace Andy Lutomirski @ 2014-07-23 15:34 ` Andy Lutomirski 2014-07-28 22:33 ` [tip:x86/urgent] x86_64/entry/xen: " tip-bot for Andy Lutomirski 2014-07-23 15:34 ` [PATCH 2/2] x86_64,entry: Use lret to return to userspace when possible Andy Lutomirski 1 sibling, 1 reply; 6+ messages in thread From: Andy Lutomirski @ 2014-07-23 15:34 UTC (permalink / raw) To: H. Peter Anvin, Linus Torvalds, linux-kernel, x86, Borislav Petkov Cc: Andy Lutomirski This moves the espfix64 logic into native_iret. To make this work, it gets rid of the native patch for INTERRUPT_RETURN: INTERRUPT_RETURN on native kernels is now 'jmp native_iret'. This changes the 16-bit SS behavior on Xen from OOPSing to leaking some bits of the Xen hypervisor's RSP (I think). Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- arch/x86/include/asm/irqflags.h | 2 +- arch/x86/kernel/entry_64.S | 28 ++++++++++------------------ arch/x86/kernel/paravirt_patch_64.c | 2 -- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index bba3cf8..0a8b519 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void) #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ -#define INTERRUPT_RETURN iretq +#define INTERRUPT_RETURN jmp native_iret #define USERGS_SYSRET64 \ swapgs; \ sysretq; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b25ca96..c844f08 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -830,27 +830,24 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. 
*/ #ifdef CONFIG_X86_ESPFIX64 testb $4,(SS-RIP)(%rsp) - jnz irq_return_ldt + jnz native_irq_return_ldt #endif -irq_return_iret: - INTERRUPT_RETURN - _ASM_EXTABLE(irq_return_iret, bad_iret) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) +native_irq_return_iret: iretq - _ASM_EXTABLE(native_iret, bad_iret) -#endif + _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 -irq_return_ldt: +native_irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi SWAPGS @@ -872,7 +869,7 @@ irq_return_ldt: SWAPGS movq %rax,%rsp popq_cfi %rax - jmp irq_return_iret + jmp native_irq_return_iret #endif .section .fixup,"ax" @@ -956,13 +953,8 @@ __do_double_fault: cmpl $__KERNEL_CS,CS(%rdi) jne do_double_fault movq RIP(%rdi),%rax - cmpq $irq_return_iret,%rax -#ifdef CONFIG_PARAVIRT - je 1f - cmpq $native_iret,%rax -#endif + cmpq $native_irq_return_iret,%rax jne do_double_fault /* This shouldn't happen... */ -1: movq PER_CPU_VAR(kernel_stack),%rax subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ movq %rax,RSP(%rdi) @@ -1428,7 +1420,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return_iret(%rip),%rcx + leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34..a1da673 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(pv_cpu_ops, iret, "iretq"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, 
irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, iret); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); -- 1.9.3 ^ permalink raw reply related [flat|nested] 6+ messages in thread
* [tip:x86/urgent] x86_64/entry/xen: Do not invoke espfix64 on Xen 2014-07-23 15:34 ` [PATCH 1/2] x86_64,entry,xen: Do not invoke espfix64 on Xen Andy Lutomirski @ 2014-07-28 22:33 ` tip-bot for Andy Lutomirski 2014-07-29 3:39 ` Andy Lutomirski 0 siblings, 1 reply; 6+ messages in thread From: tip-bot for Andy Lutomirski @ 2014-07-28 22:33 UTC (permalink / raw) To: linux-tip-commits; +Cc: linux-kernel, luto, hpa, mingo, stable, tglx, hpa Commit-ID: 7209a75d2009dbf7745e2fd354abf25c3deb3ca3 Gitweb: http://git.kernel.org/tip/7209a75d2009dbf7745e2fd354abf25c3deb3ca3 Author: Andy Lutomirski <luto@amacapital.net> AuthorDate: Wed, 23 Jul 2014 08:34:11 -0700 Committer: H. Peter Anvin <hpa@linux.intel.com> CommitDate: Mon, 28 Jul 2014 15:25:40 -0700 x86_64/entry/xen: Do not invoke espfix64 on Xen This moves the espfix64 logic into native_iret. To make this work, it gets rid of the native patch for INTERRUPT_RETURN: INTERRUPT_RETURN on native kernels is now 'jmp native_iret'. This changes the 16-bit SS behavior on Xen from OOPSing to leaking some bits of the Xen hypervisor's RSP (I think). [ hpa: this is a nonzero cost on native, but probably not enough to measure. Xen needs to fix this in their own code, probably doing something equivalent to espfix64. ] Signed-off-by: Andy Lutomirski <luto@amacapital.net> Link: http://lkml.kernel.org/r/7b8f1d8ef6597cb16ae004a43c56980a7de3cf94.1406129132.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin <hpa@linux.intel.com> Cc: <stable@vger.kernel.org> --- arch/x86/include/asm/irqflags.h | 2 +- arch/x86/kernel/entry_64.S | 28 ++++++++++------------------ arch/x86/kernel/paravirt_patch_64.c | 2 -- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index bba3cf8..0a8b519 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void) #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ -#define INTERRUPT_RETURN iretq +#define INTERRUPT_RETURN jmp native_iret #define USERGS_SYSRET64 \ swapgs; \ sysretq; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b25ca96..c844f08 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -830,27 +830,24 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. */ #ifdef CONFIG_X86_ESPFIX64 testb $4,(SS-RIP)(%rsp) - jnz irq_return_ldt + jnz native_irq_return_ldt #endif -irq_return_iret: - INTERRUPT_RETURN - _ASM_EXTABLE(irq_return_iret, bad_iret) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) +native_irq_return_iret: iretq - _ASM_EXTABLE(native_iret, bad_iret) -#endif + _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 -irq_return_ldt: +native_irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi SWAPGS @@ -872,7 +869,7 @@ irq_return_ldt: SWAPGS movq %rax,%rsp popq_cfi %rax - jmp irq_return_iret + jmp native_irq_return_iret #endif .section .fixup,"ax" @@ -956,13 +953,8 @@ __do_double_fault: cmpl $__KERNEL_CS,CS(%rdi) jne do_double_fault movq RIP(%rdi),%rax - cmpq $irq_return_iret,%rax -#ifdef CONFIG_PARAVIRT - je 1f - cmpq $native_iret,%rax -#endif + cmpq $native_irq_return_iret,%rax jne do_double_fault /* This shouldn't happen... 
*/ -1: movq PER_CPU_VAR(kernel_stack),%rax subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ movq %rax,RSP(%rdi) @@ -1428,7 +1420,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return_iret(%rip),%rcx + leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34..a1da673 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(pv_cpu_ops, iret, "iretq"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, iret); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); ^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [tip:x86/urgent] x86_64/entry/xen: Do not invoke espfix64 on Xen 2014-07-28 22:33 ` [tip:x86/urgent] x86_64/entry/xen: " tip-bot for Andy Lutomirski @ 2014-07-29 3:39 ` Andy Lutomirski 2014-07-29 3:47 ` H. Peter Anvin 0 siblings, 1 reply; 6+ messages in thread From: Andy Lutomirski @ 2014-07-29 3:39 UTC (permalink / raw) To: Ingo Molnar, H. Peter Anvin, Andy Lutomirski, linux-kernel, Thomas Gleixner, stable, H. Peter Anvin Cc: linux-tip-commits On Mon, Jul 28, 2014 at 3:33 PM, tip-bot for Andy Lutomirski <tipbot@zytor.com> wrote: > Commit-ID: 7209a75d2009dbf7745e2fd354abf25c3deb3ca3 > Gitweb: http://git.kernel.org/tip/7209a75d2009dbf7745e2fd354abf25c3deb3ca3 > Author: Andy Lutomirski <luto@amacapital.net> > AuthorDate: Wed, 23 Jul 2014 08:34:11 -0700 > Committer: H. Peter Anvin <hpa@linux.intel.com> > CommitDate: Mon, 28 Jul 2014 15:25:40 -0700 > > x86_64/entry/xen: Do not invoke espfix64 on Xen > > This moves the espfix64 logic into native_iret. To make this work, > it gets rid of the native patch for INTERRUPT_RETURN: > INTERRUPT_RETURN on native kernels is now 'jmp native_iret'. > > This changes the 16-bit SS behavior on Xen from OOPSing to leaking > some bits of the Xen hypervisor's RSP (I think). > > [ hpa: this is a nonzero cost on native, but probably not enough to > measure. Xen needs to fix this in their own code, probably doing > something equivalent to espfix64. ] Any plan to either fix or remove the buggy bsp-only initialization of espfix64 on paravirt? It's harmless now, but IMO it's still ugly. Or should this wait for 3.17? --Andy > > Signed-off-by: Andy Lutomirski <luto@amacapital.net> > Link: http://lkml.kernel.org/r/7b8f1d8ef6597cb16ae004a43c56980a7de3cf94.1406129132.git.luto@amacapital.net > Signed-off-by: H. 
Peter Anvin <hpa@linux.intel.com> > Cc: <stable@vger.kernel.org> > --- > arch/x86/include/asm/irqflags.h | 2 +- > arch/x86/kernel/entry_64.S | 28 ++++++++++------------------ > arch/x86/kernel/paravirt_patch_64.c | 2 -- > 3 files changed, 11 insertions(+), 21 deletions(-) > > diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h > index bba3cf8..0a8b519 100644 > --- a/arch/x86/include/asm/irqflags.h > +++ b/arch/x86/include/asm/irqflags.h > @@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void) > > #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ > > -#define INTERRUPT_RETURN iretq > +#define INTERRUPT_RETURN jmp native_iret > #define USERGS_SYSRET64 \ > swapgs; \ > sysretq; > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S > index b25ca96..c844f08 100644 > --- a/arch/x86/kernel/entry_64.S > +++ b/arch/x86/kernel/entry_64.S > @@ -830,27 +830,24 @@ restore_args: > RESTORE_ARGS 1,8,1 > > irq_return: > + INTERRUPT_RETURN > + > +ENTRY(native_iret) > /* > * Are we returning to a stack segment from the LDT? Note: in > * 64-bit mode SS:RSP on the exception stack is always valid. 
> */ > #ifdef CONFIG_X86_ESPFIX64 > testb $4,(SS-RIP)(%rsp) > - jnz irq_return_ldt > + jnz native_irq_return_ldt > #endif > > -irq_return_iret: > - INTERRUPT_RETURN > - _ASM_EXTABLE(irq_return_iret, bad_iret) > - > -#ifdef CONFIG_PARAVIRT > -ENTRY(native_iret) > +native_irq_return_iret: > iretq > - _ASM_EXTABLE(native_iret, bad_iret) > -#endif > + _ASM_EXTABLE(native_irq_return_iret, bad_iret) > > #ifdef CONFIG_X86_ESPFIX64 > -irq_return_ldt: > +native_irq_return_ldt: > pushq_cfi %rax > pushq_cfi %rdi > SWAPGS > @@ -872,7 +869,7 @@ irq_return_ldt: > SWAPGS > movq %rax,%rsp > popq_cfi %rax > - jmp irq_return_iret > + jmp native_irq_return_iret > #endif > > .section .fixup,"ax" > @@ -956,13 +953,8 @@ __do_double_fault: > cmpl $__KERNEL_CS,CS(%rdi) > jne do_double_fault > movq RIP(%rdi),%rax > - cmpq $irq_return_iret,%rax > -#ifdef CONFIG_PARAVIRT > - je 1f > - cmpq $native_iret,%rax > -#endif > + cmpq $native_irq_return_iret,%rax > jne do_double_fault /* This shouldn't happen... */ > -1: > movq PER_CPU_VAR(kernel_stack),%rax > subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ > movq %rax,RSP(%rdi) > @@ -1428,7 +1420,7 @@ error_sti: > */ > error_kernelspace: > incl %ebx > - leaq irq_return_iret(%rip),%rcx > + leaq native_irq_return_iret(%rip),%rcx > cmpq %rcx,RIP+8(%rsp) > je error_swapgs > movl %ecx,%eax /* zero extend */ > diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c > index 3f08f34..a1da673 100644 > --- a/arch/x86/kernel/paravirt_patch_64.c > +++ b/arch/x86/kernel/paravirt_patch_64.c > @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); > DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); > DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); > DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); > -DEF_NATIVE(pv_cpu_ops, iret, "iretq"); > DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); > DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); > DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); > 
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, > PATCH_SITE(pv_irq_ops, save_fl); > PATCH_SITE(pv_irq_ops, irq_enable); > PATCH_SITE(pv_irq_ops, irq_disable); > - PATCH_SITE(pv_cpu_ops, iret); > PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); > PATCH_SITE(pv_cpu_ops, usergs_sysret32); > PATCH_SITE(pv_cpu_ops, usergs_sysret64); -- Andy Lutomirski AMA Capital Management, LLC ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [tip:x86/urgent] x86_64/entry/xen: Do not invoke espfix64 on Xen 2014-07-29 3:39 ` Andy Lutomirski @ 2014-07-29 3:47 ` H. Peter Anvin 0 siblings, 0 replies; 6+ messages in thread From: H. Peter Anvin @ 2014-07-29 3:47 UTC (permalink / raw) To: Andy Lutomirski, Ingo Molnar, linux-kernel, Thomas Gleixner, stable, H. Peter Anvin Cc: linux-tip-commits Would like to absolutely minimize changes at this point. On July 28, 2014 8:39:43 PM PDT, Andy Lutomirski <luto@amacapital.net> wrote: >On Mon, Jul 28, 2014 at 3:33 PM, tip-bot for Andy Lutomirski ><tipbot@zytor.com> wrote: >> Commit-ID: 7209a75d2009dbf7745e2fd354abf25c3deb3ca3 >> Gitweb: >http://git.kernel.org/tip/7209a75d2009dbf7745e2fd354abf25c3deb3ca3 >> Author: Andy Lutomirski <luto@amacapital.net> >> AuthorDate: Wed, 23 Jul 2014 08:34:11 -0700 >> Committer: H. Peter Anvin <hpa@linux.intel.com> >> CommitDate: Mon, 28 Jul 2014 15:25:40 -0700 >> >> x86_64/entry/xen: Do not invoke espfix64 on Xen >> >> This moves the espfix64 logic into native_iret. To make this work, >> it gets rid of the native patch for INTERRUPT_RETURN: >> INTERRUPT_RETURN on native kernels is now 'jmp native_iret'. >> >> This changes the 16-bit SS behavior on Xen from OOPSing to leaking >> some bits of the Xen hypervisor's RSP (I think). >> >> [ hpa: this is a nonzero cost on native, but probably not enough to >> measure. Xen needs to fix this in their own code, probably doing >> something equivalent to espfix64. ] > >Any plan to either fix or remove the buggy bsp-only initialization of >espfix64 on paravirt? It's harmless now, but IMO it's still ugly. Or >should this wait for 3.17? > >--Andy > >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >> Link: >http://lkml.kernel.org/r/7b8f1d8ef6597cb16ae004a43c56980a7de3cf94.1406129132.git.luto@amacapital.net >> Signed-off-by: H. 
Peter Anvin <hpa@linux.intel.com> >> Cc: <stable@vger.kernel.org> >> --- >> arch/x86/include/asm/irqflags.h | 2 +- >> arch/x86/kernel/entry_64.S | 28 >++++++++++------------------ >> arch/x86/kernel/paravirt_patch_64.c | 2 -- >> 3 files changed, 11 insertions(+), 21 deletions(-) >> >> diff --git a/arch/x86/include/asm/irqflags.h >b/arch/x86/include/asm/irqflags.h >> index bba3cf8..0a8b519 100644 >> --- a/arch/x86/include/asm/irqflags.h >> +++ b/arch/x86/include/asm/irqflags.h >> @@ -129,7 +129,7 @@ static inline notrace unsigned long >arch_local_irq_save(void) >> >> #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ >> >> -#define INTERRUPT_RETURN iretq >> +#define INTERRUPT_RETURN jmp native_iret >> #define USERGS_SYSRET64 \ >> swapgs; \ >> sysretq; >> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S >> index b25ca96..c844f08 100644 >> --- a/arch/x86/kernel/entry_64.S >> +++ b/arch/x86/kernel/entry_64.S >> @@ -830,27 +830,24 @@ restore_args: >> RESTORE_ARGS 1,8,1 >> >> irq_return: >> + INTERRUPT_RETURN >> + >> +ENTRY(native_iret) >> /* >> * Are we returning to a stack segment from the LDT? Note: >in >> * 64-bit mode SS:RSP on the exception stack is always valid. 
>> */ >> #ifdef CONFIG_X86_ESPFIX64 >> testb $4,(SS-RIP)(%rsp) >> - jnz irq_return_ldt >> + jnz native_irq_return_ldt >> #endif >> >> -irq_return_iret: >> - INTERRUPT_RETURN >> - _ASM_EXTABLE(irq_return_iret, bad_iret) >> - >> -#ifdef CONFIG_PARAVIRT >> -ENTRY(native_iret) >> +native_irq_return_iret: >> iretq >> - _ASM_EXTABLE(native_iret, bad_iret) >> -#endif >> + _ASM_EXTABLE(native_irq_return_iret, bad_iret) >> >> #ifdef CONFIG_X86_ESPFIX64 >> -irq_return_ldt: >> +native_irq_return_ldt: >> pushq_cfi %rax >> pushq_cfi %rdi >> SWAPGS >> @@ -872,7 +869,7 @@ irq_return_ldt: >> SWAPGS >> movq %rax,%rsp >> popq_cfi %rax >> - jmp irq_return_iret >> + jmp native_irq_return_iret >> #endif >> >> .section .fixup,"ax" >> @@ -956,13 +953,8 @@ __do_double_fault: >> cmpl $__KERNEL_CS,CS(%rdi) >> jne do_double_fault >> movq RIP(%rdi),%rax >> - cmpq $irq_return_iret,%rax >> -#ifdef CONFIG_PARAVIRT >> - je 1f >> - cmpq $native_iret,%rax >> -#endif >> + cmpq $native_irq_return_iret,%rax >> jne do_double_fault /* This shouldn't happen... 
>*/ >> -1: >> movq PER_CPU_VAR(kernel_stack),%rax >> subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original >stack */ >> movq %rax,RSP(%rdi) >> @@ -1428,7 +1420,7 @@ error_sti: >> */ >> error_kernelspace: >> incl %ebx >> - leaq irq_return_iret(%rip),%rcx >> + leaq native_irq_return_iret(%rip),%rcx >> cmpq %rcx,RIP+8(%rsp) >> je error_swapgs >> movl %ecx,%eax /* zero extend */ >> diff --git a/arch/x86/kernel/paravirt_patch_64.c >b/arch/x86/kernel/paravirt_patch_64.c >> index 3f08f34..a1da673 100644 >> --- a/arch/x86/kernel/paravirt_patch_64.c >> +++ b/arch/x86/kernel/paravirt_patch_64.c >> @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); >> DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); >> DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); >> DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); >> -DEF_NATIVE(pv_cpu_ops, iret, "iretq"); >> DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); >> DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); >> DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); >> @@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void >*ibuf, >> PATCH_SITE(pv_irq_ops, save_fl); >> PATCH_SITE(pv_irq_ops, irq_enable); >> PATCH_SITE(pv_irq_ops, irq_disable); >> - PATCH_SITE(pv_cpu_ops, iret); >> PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); >> PATCH_SITE(pv_cpu_ops, usergs_sysret32); >> PATCH_SITE(pv_cpu_ops, usergs_sysret64); -- Sent from my mobile phone. Please pardon brevity and lack of formatting. ^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH 2/2] x86_64,entry: Use lret to return to userspace when possible 2014-07-23 15:34 [PATCH 0/2] x86_64: LRET to userspace Andy Lutomirski 2014-07-23 15:34 ` [PATCH 1/2] x86_64,entry,xen: Do not invoke espfix64 on Xen Andy Lutomirski @ 2014-07-23 15:34 ` Andy Lutomirski 1 sibling, 0 replies; 6+ messages in thread From: Andy Lutomirski @ 2014-07-23 15:34 UTC (permalink / raw) To: H. Peter Anvin, Linus Torvalds, linux-kernel, x86, Borislav Petkov Cc: Andy Lutomirski IRET serializes, but LRET does not. Try to use LRET to return to userspace when possible. It's possible if the saved RF and TF are clear, IF is set, and espfix isn't needed. This cuts about 23ns off of the IRET-to-userspace path on my machine. (YMMV -- this was in a tight loop, and I can imagine the performance hit from serialization to be somewhat higher in real code.) Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- I've tested normal code, iret faults, lret faults, and returns to funny SS values. I haven't explicitly exercised this code under heavy NMI load, nor have I tested it on Xen (although, unless I screwed up, it shouldn't do anything on Xen). Benchmark away. I know that Linus had a test for this stuff. I doubt that this will be as spectacular as my old sysret trampoline hack, but it could still be a nice speedup. 
arch/x86/include/asm/irqflags.h | 3 +- arch/x86/include/asm/paravirt.h | 4 ++ arch/x86/include/asm/traps.h | 6 +++ arch/x86/kernel/cpu/mcheck/mce.c | 2 + arch/x86/kernel/entry_64.S | 93 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/nmi.c | 21 +++++++++ 6 files changed, 126 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519..04c45bb 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -129,7 +129,8 @@ static inline notrace unsigned long arch_local_irq_save(void) #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ -#define INTERRUPT_RETURN jmp native_iret +#define INTERRUPT_RETURN jmp native_iret +#define INTERRUPT_RETURN_UNBLOCK_NMI jmp native_irq_return_need_iret #define USERGS_SYSRET64 \ swapgs; \ sysretq; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e161..3716b3d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -913,6 +913,10 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) +#define INTERRUPT_RETURN_UNBLOCK_NMI \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) + #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bc8352e..2e3dfe8 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -134,4 +134,10 @@ enum { X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +#ifdef CONFIG_X86_64 +extern void fixup_lret_nmi(struct pt_regs *regs); +#else +static inline void fixup_lret_nmi(struct pt_regs *regs) {} +#endif + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 
bb92f38..0bb9b9b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <asm/processor.h> +#include <asm/traps.h> #include <asm/mce.h> #include <asm/msr.h> @@ -1168,6 +1169,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: sync_core(); + fixup_lret_nmi(regs); } EXPORT_SYMBOL_GPL(do_machine_check); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c844f08..a23b302 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -834,6 +834,81 @@ irq_return: ENTRY(native_iret) /* + * This implements "iret" the Platonic ideal, not "iret" the + * instruction. Specifically, it will pop RIP, CS, FLAGS, + * RSP, and SS and load them, correctly, into the CPU state. + * (IRET screws up RSP depending on SS). It tries to avoid + * serializing (IRET always serializes). + * + * This code does *not* promise to unblock NMIs. Use + * INTERRUPT_RETURN_UNBLOCK_NMI if you need NMIs to be unblocked. + */ + + /* + * Only IRET can set RF correctly, and our sti trick is + * incompatible with TF being set. + */ + testl $(X86_EFLAGS_RF|X86_EFLAGS_TF), (EFLAGS-RIP)(%rsp) + jnz native_irq_return_need_iret + + /* + * While it's technically possible to be in userspace with IF + * clear (using iopl(2)), it's so unlikely that there's no point + * in optimizing it. + */ + testl $X86_EFLAGS_IF, (EFLAGS-RIP)(%rsp) + jz native_irq_return_need_iret + + /* + * Returning without IRET to kernel space is possible, but + * the considerations are different and we're not ready for that + * yet. + */ + testl $3, (CS-RIP)(%rsp) + jz native_irq_return_need_iret + +#ifdef CONFIG_X86_ESPFIX64 + /* lret has the same bug^Wfeature as iret wrt 16-bit SS. */ + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + + /* + * Rearrange the stack to pretend we got here via a call gate + * (yes, really), and do a long return. 
+ */ + pushq (SS - RIP + 0*8)(%rsp) + pushq (RSP - RIP + 1*8)(%rsp) + pushq (CS - RIP + 2*8)(%rsp) + pushq (RIP - RIP + 3*8)(%rsp) + pushq (EFLAGS - RIP + 4*8)(%rsp) + andl $~X86_EFLAGS_IF, (%rsp) /* Clear saved IF. */ + popfq /* Restore all regs except IF. */ + +.global native_sti_before_lret_to_userspace +native_sti_before_lret_to_userspace: + sti /* Restore IF. */ + + /* + * This relies on the one-instruction interrupt grace period here + * between sti and lret. A non-paranoid interrupt here will + * explode because GS is wrong. More subtly, we may be on an IST + * stack, and if we enable interrupts before leaving the IST stack, + * we could cause a recursive IST interrupt, which would blow away + * our stack frame. + * + * NMI and MCE are safe here -- see fixup_lret_nmi. + */ + +.global native_lret_to_userspace +native_lret_to_userspace: + lretq + + /* This fixup is special -- see error_lret. */ + _ASM_EXTABLE(native_lret_to_userspace, bad_iret) + +native_irq_return_need_iret: + /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. */ @@ -883,6 +958,8 @@ bad_iret: * We are now running with the kernel GS after exception recovery. * But error_entry expects us to have user GS to match the user %cs, * so swap back. + * + * lret faults land here, too. */ pushq $0 @@ -1412,7 +1489,7 @@ error_sti: ret /* - * There are two places in the kernel that can potentially fault with + * There are three places in the kernel that can potentially fault with * usergs. Handle them here. The exception handlers after iret run with * kernel gs again, so don't set the user space flag. 
B stepping K8s * sometimes report an truncated RIP for IRET exceptions returning to @@ -1428,6 +1505,8 @@ error_kernelspace: je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs + cmpq $native_lret_to_userspace,RIP+8(%rsp) + je error_lret jmp error_sti bstep_iret: @@ -1435,6 +1514,16 @@ bstep_iret: movq %rcx,RIP+8(%rsp) jmp error_swapgs CFI_ENDPROC + +error_lret: + /* + * We can't return from this fault with IF set because we'll lose + * the sti grace period. Fix up the fault so that it looks just + * like an iret fault instead. + */ + addq $4*8,RSP+8(%rsp) /* pop the lret frame */ + andl $~X86_EFLAGS_IF,EFLAGS+8(%rsp) /* clear IF */ + jmp error_swapgs /* return w/ kernel GS */ END(error_entry) @@ -1706,7 +1795,7 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) - jmp irq_return + INTERRUPT_RETURN_UNBLOCK_NMI CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index c3e985d..ca8be8e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -306,12 +306,33 @@ NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); +#ifdef CONFIG_X86_64 +void fixup_lret_nmi(struct pt_regs *regs) +{ + /* + * There is no architectural guarantee that an NMI or MCE can't + * happen between sti and lret. To avoid returning to the lret + * instruction with interrupts on, we back up one instruction. 
+ */ + extern const char native_lret_to_userspace[]; + extern const char native_sti_before_lret_to_userspace[]; + + if (!user_mode_vm(regs) && + regs->ip == (unsigned long)native_lret_to_userspace) { + regs->ip = (unsigned long)native_sti_before_lret_to_userspace; + regs->flags &= ~X86_EFLAGS_IF; + } +} +#endif + static void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; bool b2b = false; + fixup_lret_nmi(regs); + /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific -- 1.9.3 ^ permalink raw reply related [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-07-29 3:47 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2014-07-23 15:34 [PATCH 0/2] x86_64: LRET to userspace Andy Lutomirski 2014-07-23 15:34 ` [PATCH 1/2] x86_64,entry,xen: Do not invoke espfix64 on Xen Andy Lutomirski 2014-07-28 22:33 ` [tip:x86/urgent] x86_64/entry/xen: " tip-bot for Andy Lutomirski 2014-07-29 3:39 ` Andy Lutomirski 2014-07-29 3:47 ` H. Peter Anvin 2014-07-23 15:34 ` [PATCH 2/2] x86_64,entry: Use lret to return to userspace when possible Andy Lutomirski
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).