From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753694AbdKXEe4 (ORCPT ); Thu, 23 Nov 2017 23:34:56 -0500 Received: from mail.kernel.org ([198.145.29.99]:44078 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753492AbdKXEd1 (ORCPT ); Thu, 23 Nov 2017 23:33:27 -0500 DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org ECEB221996 Authentication-Results: mail.kernel.org; dmarc=none (p=none dis=none) header.from=kernel.org Authentication-Results: mail.kernel.org; spf=none smtp.mailfrom=luto@kernel.org From: Andy Lutomirski To: X86 ML Cc: Borislav Petkov , "linux-kernel@vger.kernel.org" , Brian Gerst , Dave Hansen , Linus Torvalds , Josh Poimboeuf , Andy Lutomirski Subject: [PATCH v3 14/19] x86/entry/64: Create a percpu SYSCALL entry trampoline Date: Thu, 23 Nov 2017 20:33:00 -0800 Message-Id: X-Mailer: git-send-email 2.13.6 In-Reply-To: References: In-Reply-To: References: Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Handling SYSCALL is tricky: the SYSCALL handler is entered with every single register (except FLAGS), including RSP, live. It somehow needs to set RSP to point to a valid stack, which means it needs to save the user RSP somewhere and find its own stack pointer. The canonical way to do this is with SWAPGS, which lets us access percpu data using the %gs prefix. With KAISER-like pagetable switching, this is problematic. Without a scratch register, switching CR3 is impossible, so %gs-based percpu memory would need to be mapped in the user pagetables. Doing that without information leaks is difficult or impossible. Instead, use a different sneaky trick. Map a copy of the first part of the SYSCALL asm at a different address for each CPU. Now RIP varies depending on the CPU, so we can use RIP-relative memory access to access percpu memory. By putting the relevant information (one scratch slot and the stack address) at a constant offset relative to RIP, we can make SYSCALL work without relying on %gs. A nice thing about this approach is that we can easily switch it on and off if we want pagetable switching to be configurable. The compat variant of SYSCALL doesn't have this problem in the first place -- there are plenty of scratch registers, since we don't care about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 at all. XXX: Whenever we settle how KAISER gets turned on and off, we should do the same to this. Signed-off-by: Andy Lutomirski --- arch/x86/entry/entry_64.S | 48 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/cpu/common.c | 12 ++++++++++- arch/x86/kernel/vmlinux.lds.S | 10 +++++++++ 5 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 426b8c669d6a..0cde243b7542 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -140,6 +140,54 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * .Lentry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump, so we fake it using retq. + */ + pushq $entry_SYSCALL_64_after_hwframe + retq +END(entry_SYSCALL_64_trampoline) + + .popsection + ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 3a42da14c2cb..7eb1b5490395 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -58,6 +58,8 @@ struct cpu_entry_area { * of the TSS region. */ struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 55858b277cf6..61b1af88ac07 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -101,4 +101,5 @@ void common(void) { /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7c82a8a8bfda..5a05db084659 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,6 +507,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); static inline void setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + /* On 64-bit systems, we use a read-only fixmap GDT. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; #else @@ -553,6 +555,11 @@ static inline void setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_32 this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); #endif + +#ifdef CONFIG_X86_64 + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif } /* Load the original GDT from the per-cpu structure */ @@ -1417,10 +1424,13 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + int cpu = smp_processor_id(); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + (entry_SYSCALL_64_trampoline - _entry_trampoline)); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..2738cfb6c8c8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -107,6 +107,16 @@ SECTIONS SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + /* Entry trampoline */ + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + /* End of text section */ _etext = .; } :text = 0x9090 -- 2.13.6