diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9264f04a4c55..cea5b9b517f2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -57,6 +57,8 @@ extern void x86_ce4100_early_setup(void);
 static inline void x86_ce4100_early_setup(void) { }
 #endif
 
+extern void init_espfix_this_cpu(void);
+
 #ifndef _SETUP
 
 /*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f4d96000d33a..1cc3789d99d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
+obj-$(CONFIG_X86_64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..7f71c97f59c0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /* Avoid __ASSEMBLER__'ifying just for this. */
@@ -1040,8 +1041,16 @@ restore_args:
 	RESTORE_ARGS 1,8,1
 
 irq_return:
+	/*
+	 * Are we returning to the LDT?  Note: in 64-bit mode
+	 * SS:RSP on the exception stack is always valid.
+	 */
+	testb $4,(SS-RIP)(%rsp)
+	jnz irq_return_ldt
+
+irq_return_iret:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
+	_ASM_EXTABLE(irq_return_iret, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
@@ -1049,6 +1058,34 @@ ENTRY(native_iret)
 	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
+irq_return_ldt:
+	pushq_cfi %rcx
+	larl (CS-RIP+8)(%rsp), %ecx
+	jnz 1f	/* Invalid segment - will #GP at IRET time */
+	testl $0x00200000, %ecx
+	jnz 1f	/* Returning to 64-bit mode */
+	larl (SS-RIP+8)(%rsp), %ecx
+	jnz 1f	/* Invalid segment - will #SS at IRET time */
+	testl $0x00400000, %ecx
+	jnz 1f	/* Not a 16-bit stack segment */
+	pushq_cfi %rsi
+	pushq_cfi %rdi
+	SWAPGS
+	movq PER_CPU_VAR(espfix_stack),%rdi
+	movl (RSP-RIP+3*8)(%rsp),%esi
+	xorw %si,%si
+	orq %rsi,%rdi
+	movq %rsp,%rsi
+	movl $8,%ecx
+	rep;movsq
+	leaq -(8*8)(%rdi),%rsp
+	SWAPGS
+	popq_cfi %rdi
+	popq_cfi %rsi
+1:
+	popq_cfi %rcx
+	jmp irq_return_iret
+
 .section .fixup,"ax"
 bad_iret:
 	/*
@@ -1058,6 +1095,7 @@ bad_iret:
 	 * So pretend we completed the iret and took the #GPF in user mode.
 	 *
 	 * We are now running with the kernel GS after exception recovery.
+	 * Exception entry will have removed us from the espfix stack.
 	 * But error_entry expects us to have user GS to match the user %cs,
 	 * so swap back.
 	 */
@@ -1278,6 +1316,62 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
+/*
+ * Same as errorentry, except use for #GP in case we take the exception
+ * while on the espfix stack.  All other exceptions that are possible while
+ * on the espfix stack use IST, but that is not really practical for #GP
+ * for nesting reasons.
+ */
+.macro errorentry_espfix sym do_sym
+ENTRY(\sym)
+	XCPT_FRAME
+	ASM_CLAC
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	/* Check if we are on the espfix stack */
+	pushq_cfi %rdi
+	pushq_cfi %rsi
+	movq %rsp,%rdi
+	sarq $PGDIR_SHIFT,%rdi
+	cmpl $-2,%edi		/* Are we on the espfix stack? */
+	CFI_REMEMBER_STATE
+	je 1f
+2:
+	subq $RSI-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET RSI-R15
+	call error_entry_rdi_rsi_saved
+	DEFAULT_FRAME 0
+	movq %rsp,%rdi			/* pt_regs pointer */
+	movq ORIG_RAX(%rsp),%rsi	/* get error code */
+	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	call \do_sym
+	jmp error_exit			/* %ebx: no swapgs flag */
+1:
+	CFI_RESTORE_STATE
+	SWAPGS
+	movq PER_CPU_VAR(kernel_stack),%rdi
+	SWAPGS
+	/* Copy data from the espfix stack to the real stack */
+	movq %rsi,-64(%rdi)	/* Saved value of %rsi already */
+	movq 8(%rsp),%rsi
+	movq %rsi,-56(%rdi)
+	movq 16(%rsp),%rsi
+	movq %rsi,-48(%rdi)
+	movq 24(%rsp),%rsi
+	movq %rsi,-40(%rdi)
+	movq 32(%rsp),%rsi
+	movq %rsi,-32(%rdi)
+	movq 40(%rsp),%rsi
+	movq %rsi,-24(%rdi)
+	movq 48(%rsp),%rsi
+	movq %rsi,-16(%rdi)
+	movq 56(%rsp),%rsi
+	movq %rsi,-8(%rdi)
+	leaq -64(%rdi),%rsp
+	jmp 2b
+	CFI_ENDPROC
+END(\sym)
+.endm
+
 #ifdef CONFIG_TRACING
 .macro trace_errorentry sym do_sym
 errorentry trace(\sym) trace(\do_sym)
@@ -1323,7 +1417,6 @@ zeroentry coprocessor_error do_coprocessor_error
 errorentry alignment_check do_alignment_check
 zeroentry simd_coprocessor_error do_simd_coprocessor_error
 
-
 /* Reload gs selector with exception handling */
 /* edi: new selector */
 ENTRY(native_load_gs_index)
@@ -1490,7 +1583,7 @@ zeroentry xen_debug do_debug
 zeroentry xen_int3 do_int3
 errorentry xen_stack_segment do_stack_segment
 #endif
-errorentry general_protection do_general_protection
+errorentry_espfix general_protection do_general_protection
 trace_errorentry page_fault do_page_fault
 #ifdef CONFIG_KVM_GUEST
 errorentry async_page_fault do_async_page_fault
@@ -1567,9 +1660,10 @@ ENTRY(error_entry)
 	XCPT_FRAME
 	CFI_ADJUST_CFA_OFFSET 15*8
 	/* oldrax contains error code */
-	cld
 	movq_cfi rdi, RDI+8
 	movq_cfi rsi, RSI+8
+error_entry_rdi_rsi_saved:
+	cld
 	movq_cfi rdx, RDX+8
 	movq_cfi rcx, RCX+8
 	movq_cfi rax, RAX+8
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..05567d706f92
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,136 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include
+#include
+#include
+#include
+#include
+
+#define ESPFIX_STACK_SIZE	64UL
+#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE << (PGDIR_SHIFT-PAGE_SHIFT-16))
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define ESPFIX_BASE_ADDR	(-2UL << PGDIR_SHIFT)
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES	DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+#define ESPFIX_MAP_SIZE		DIV_ROUND_UP(ESPFIX_MAX_PAGES, BITS_PER_LONG)
+static unsigned long espfix_page_alloc_map[ESPFIX_MAP_SIZE];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+	__aligned(PAGE_SIZE);
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	unsigned long page, addr;
+
+	page = (cpu / ESPFIX_STACKS_PER_PAGE) << PAGE_SHIFT;
+	addr = page + (cpu % ESPFIX_STACKS_PER_PAGE) * ESPFIX_STACK_SIZE;
+	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+	addr += ESPFIX_BASE_ADDR;
+	return addr;
+}
+
+#define PTE_STRIDE		(65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES	(PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES	PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES	(65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+void init_espfix_this_cpu(void)
+{
+	unsigned int cpu, page;
+	unsigned long addr;
+	pgd_t pgd, *pgd_p;
+	pud_t pud, *pud_p;
+	pmd_t pmd, *pmd_p;
+	pte_t pte, *pte_p;
+	int n;
+	void *stack_page;
+	pteval_t ptemask;
+
+	/* We only have to do this once... */
+	if (likely(this_cpu_read(espfix_stack)))
+		return;		/* Already initialized */
+
+	cpu = smp_processor_id();
+	addr = espfix_base_addr(cpu);
+	page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+	/* Did another CPU already set this up? */
+	if (likely(test_bit(page, espfix_page_alloc_map)))
+		goto done;
+
+	mutex_lock(&espfix_init_mutex);
+
+	/* Did we race on the lock? */
+	if (unlikely(test_bit(page, espfix_page_alloc_map)))
+		goto unlock_done;
+
+	ptemask = __supported_pte_mask;
+
+	pgd_p = &init_level4_pgt[pgd_index(addr)];
+	pgd = *pgd_p;
+	if (!pgd_present(pgd)) {
+		/* This can only happen on the BSP */
+		pgd = __pgd(__pa_symbol(espfix_pud_page) |
+			    (_KERNPG_TABLE & ptemask));
+		set_pgd(pgd_p, pgd);
+	}
+
+	pud_p = &espfix_pud_page[pud_index(addr)];
+	pud = *pud_p;
+	if (!pud_present(pud)) {
+		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pud = __pud(__pa(pmd_p) | (_KERNPG_TABLE & ptemask));
+		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+			set_pud(&pud_p[n], pud);
+	}
+
+	pmd_p = pmd_offset(&pud, addr);
+	pmd = *pmd_p;
+	if (!pmd_present(pmd)) {
+		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		pmd = __pmd(__pa(pte_p) | (_KERNPG_TABLE & ptemask));
+		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+			set_pmd(&pmd_p[n], pmd);
+	}
+
+	pte_p = pte_offset_kernel(&pmd, addr);
+	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL & ptemask));
+	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+		set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+	/* Job is done for this CPU and any CPU which shares this page */
+	set_bit(page, espfix_page_alloc_map);
+
+unlock_done:
+	mutex_unlock(&espfix_init_mutex);
+done:
+	this_cpu_write(espfix_stack, addr);
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index af1d14a9ebda..ebc987398923 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		}
 	}
 
-	/*
-	 * On x86-64 we do not support 16-bit segments due to
-	 * IRET leaking the high bits of the kernel stack address.
- */ -#ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { - error = -EINVAL; - goto out_unlock; - } -#endif - fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 34826934d4a7..7956aad1a710 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -244,6 +244,11 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* + * Enable the espfix hack for this CPU + */ + init_espfix_this_cpu(); + + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 20621d753d5f..96bf767a05fc 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -327,6 +327,8 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) int i; struct pg_state st = {}; + st.to_dmesg = true; + if (pgd) { start = pgd; st.to_dmesg = true; diff --git a/init/main.c b/init/main.c index 9c7fd4c9249f..6230d4b7ce1b 100644 --- a/init/main.c +++ b/init/main.c @@ -617,6 +617,10 @@ asmlinkage void __init start_kernel(void) if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); #endif +#ifdef CONFIG_X86_64 + /* Should be run before the first non-init thread is created */ + init_espfix_this_cpu(); +#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages);