* [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
@ 2017-05-04 10:02 Daniel Gruss
2017-05-04 12:26 ` Daniel Gruss
` (3 more replies)
0 siblings, 4 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-04 10:02 UTC (permalink / raw)
To: kernel list, kernel-hardening
Cc: clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, kirill.shutemov, Ingo Molnar, anders.fogh
[-- Attachment #1: Type: text/plain, Size: 1060 bytes --]
After several recent works [1,2,3] KASLR on x86_64 was basically
considered dead by many researchers. We have been working on an
efficient but effective fix for this problem and found that not mapping
the kernel space when running in user mode is the solution to this
problem [4] (the corresponding paper [5] will be presented at ESSoS17).
With this RFC patch we allow anybody to configure their kernel with the
flag CONFIG_KAISER to add our defense mechanism.
If there are any questions we would love to answer them.
We also appreciate any comments!
Cheers,
Daniel (+ the KAISER team from Graz University of Technology)
[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2]
https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3]
https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf
[-- Attachment #2: 0001-KAISER-Kernel-Address-Isolation.patch --]
[-- Type: text/x-patch, Size: 22362 bytes --]
>From 03c413bc52f1ac253cf0f067605f367f3390d3f4 Mon Sep 17 00:00:00 2001
From: Richard Fellner <richard.fellner@student.tugraz.at>
Date: Thu, 4 May 2017 10:44:38 +0200
Subject: [PATCH] KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.
More information about the patch can be found on:
https://github.com/IAIK/KAISER
---
arch/x86/entry/entry_64.S | 17 +++++++++++++++++
arch/x86/entry/entry_64_compat.S | 7 ++++++-
arch/x86/include/asm/hw_irq.h | 2 +-
arch/x86/include/asm/pgtable.h | 4 ++++
arch/x86/include/asm/pgtable_64.h | 21 +++++++++++++++++++++
arch/x86/include/asm/pgtable_types.h | 12 ++++++++++--
arch/x86/include/asm/processor.h | 7 ++++++-
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/espfix_64.c | 6 ++++++
arch/x86/kernel/head_64.S | 16 ++++++++++++----
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/mm/Makefile | 2 +-
arch/x86/mm/pageattr.c | 2 +-
arch/x86/mm/pgtable.c | 28 +++++++++++++++++++++++++++-
include/asm-generic/vmlinux.lds.h | 11 ++++++++++-
include/linux/percpu-defs.h | 30 ++++++++++++++++++++++++++++++
init/main.c | 5 +++++
kernel/fork.c | 8 ++++++++
security/Kconfig | 7 +++++++
20 files changed, 176 insertions(+), 17 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18e..631c7bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -37,6 +37,7 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/kaiser.h>
#include <linux/err.h>
.code64
@@ -141,6 +142,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -223,6 +225,7 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -318,10 +321,12 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -420,6 +425,7 @@ ENTRY(ret_from_fork)
leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWITCH_USER_CR3
SWAPGS
FRAME_END
jmp restore_regs_and_iret
@@ -476,6 +482,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -533,6 +540,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -610,6 +618,7 @@ native_irq_return_ldt:
pushq %rdi /* Stash user RDI */
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -636,6 +645,7 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
1: ret
END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
@@ -1085,6 +1097,7 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1136,6 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1234,6 +1248,7 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1274,6 +1289,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -1485,6 +1501,7 @@ end_repeat_nmi:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721da..f0e384e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -48,6 +49,7 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
+ SWITCH_USER_CR3
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
-
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ SWITCH_USER_CR3_NO_STACK
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e105..0817d63 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb4..aeeabb9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ // clone the shadow pgd part as well
+ memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 62b77592..550b473 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_KAISER
+ // We know that a pgd is page aligned.
+ // Therefore the lower indices have to be mapped to user space.
+ // These pages are mapped to the shadow mapping.
+ if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+
+ pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8b4de22..00fecbb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -45,7 +45,11 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -119,7 +123,11 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1be64da..26d3505 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -339,7 +339,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -366,6 +366,11 @@ union irq_stack_union {
char gs_base[40];
unsigned long stack_canary;
};
+
+ struct {
+ char irq_stack_pointer[64];
+ char unused[IRQ_STACK_SIZE - 64];
+ };
};
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9bab7a8..cfce6a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -1270,7 +1270,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89ca..9ff875a 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+ // add the esp stack pud to the shadow mapping here.
+ // This can be done directly, because the fixup stack has its own pud
+ set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b467b14..ea43ac3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -410,6 +410,14 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -419,7 +427,7 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -429,10 +437,10 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
- .fill 512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+ .fill 2*512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1..f480b38 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b615a11..3ebc9f6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -40,7 +40,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b84..682c162 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 5a287e5..420df2c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5a..833ab5f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd)
#else
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#ifdef CONFIG_KAISER
+ // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+ // block. Therefore, we have to allocate at least 3 pages. However, the
+ // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+ // the beginning of the page of our 8kb-aligned memory block in order to
+ // correctly free it afterwars.
+
+ unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+ if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+ {
+ *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+ return (pgd_t *) pages;
+ }
+ else
+ {
+ *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+ return (pgd_t *) (pages + PAGE_SIZE);
+ }
+#else
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
}
static inline void _pgd_free(pgd_t *pgd)
{
+#ifdef CONFIG_KAISER
+ unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+ free_pages(pages, get_order(4*PAGE_SIZE));
+#else
free_page((unsigned long)pgd);
+#endif
}
#endif /* CONFIG_X86_PAE */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0968d13..eea0fc1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -765,7 +765,16 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
- *(.data..percpu..first) \
+ \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299..8ea945f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,6 +182,16 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c
index b0c9d6f..0b48d95 100644
--- a/init/main.c
+++ b/init/main.c
@@ -83,11 +83,13 @@
#include <linux/io.h>
#include <linux/cache.h>
+#include <asm/cmdline.h>
#include <asm/io.h>
#include <asm/bugs.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
+#include <asm/kaiser.h>
static int kernel_init(void *);
@@ -477,6 +479,9 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+#ifdef CONFIG_KAISER
+ kaiser_init();
+#endif
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..491eb8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
#endif
}
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
static inline void free_thread_stack(struct task_struct *tsk)
{
+#ifdef CONFIG_KAISER
+ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+#endif
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
unsigned long flags;
@@ -470,6 +474,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -497,6 +502,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* functions again.
*/
tsk->stack = stack;
+#ifdef CONFIG_KAISER
+ kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
diff --git a/security/Kconfig b/security/Kconfig
index 118f454..f515ac3 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,6 +30,13 @@ config SECURITY
model will be used.
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ depends on X86_64
+ depends on !PARAVIRT
+ help
+ This enforces a strict kernel and user space isolation in order to close
+ hardware side channels on kernel address information.
config SECURITYFS
bool "Enable the securityfs filesystem"
--
2.9.3
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 10:02 [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Daniel Gruss
@ 2017-05-04 12:26 ` Daniel Gruss
2017-05-04 15:28 ` [kernel-hardening] " Thomas Garnier
` (2 subsequent siblings)
3 siblings, 0 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-04 12:26 UTC (permalink / raw)
To: kernel list, kernel-hardening
Cc: clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, kirill.shutemov, Ingo Molnar, anders.fogh
[-- Attachment #1: Type: text/plain, Size: 115 bytes --]
Sorry, missed a file in the first mail (from some code cleanup), the
full patch is now attached.
Cheers,
Daniel
[-- Attachment #2: 0001-KAISER-Kernel-Address-Isolation.patch --]
[-- Type: text/x-patch, Size: 30703 bytes --]
>From c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 Mon Sep 17 00:00:00 2001
From: Richard Fellner <richard.fellner@student.tugraz.at>
Date: Thu, 4 May 2017 14:16:44 +0200
Subject: [PATCH] KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.
More information about the patch can be found on:
https://github.com/IAIK/KAISER
---
arch/x86/entry/entry_64.S | 17 ++++
arch/x86/entry/entry_64_compat.S | 7 +-
arch/x86/include/asm/hw_irq.h | 2 +-
arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++
arch/x86/include/asm/pgtable.h | 4 +
arch/x86/include/asm/pgtable_64.h | 21 +++++
arch/x86/include/asm/pgtable_types.h | 12 ++-
arch/x86/include/asm/processor.h | 7 +-
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/espfix_64.c | 6 ++
arch/x86/kernel/head_64.S | 16 +++-
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/mm/Makefile | 2 +-
arch/x86/mm/kaiser.c | 172 +++++++++++++++++++++++++++++++++++
arch/x86/mm/pageattr.c | 2 +-
arch/x86/mm/pgtable.c | 28 +++++-
include/asm-generic/vmlinux.lds.h | 11 ++-
include/linux/percpu-defs.h | 30 ++++++
init/main.c | 5 +
kernel/fork.c | 8 ++
security/Kconfig | 7 ++
22 files changed, 461 insertions(+), 17 deletions(-)
create mode 100644 arch/x86/include/asm/kaiser.h
create mode 100644 arch/x86/mm/kaiser.c
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18e..631c7bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -37,6 +37,7 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/kaiser.h>
#include <linux/err.h>
.code64
@@ -141,6 +142,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -223,6 +225,7 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -318,10 +321,12 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -420,6 +425,7 @@ ENTRY(ret_from_fork)
leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWITCH_USER_CR3
SWAPGS
FRAME_END
jmp restore_regs_and_iret
@@ -476,6 +482,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -533,6 +540,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -610,6 +618,7 @@ native_irq_return_ldt:
pushq %rdi /* Stash user RDI */
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -636,6 +645,7 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
1: ret
END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
@@ -1085,6 +1097,7 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1136,6 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1234,6 +1248,7 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1274,6 +1289,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -1485,6 +1501,7 @@ end_repeat_nmi:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721da..f0e384e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -48,6 +49,7 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
+ SWITCH_USER_CR3
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
-
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ SWITCH_USER_CR3_NO_STACK
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e105..0817d63 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 0000000..63ee830
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
+ * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
+ * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
+ * of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// Upon kernel/user mode switch, it may happen that
+// the address space has to be switched before the registers have been stored.
+// To change the address space, another register is needed.
+// A register therefore has to be stored/restored.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ * shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * the mapping is done on a global scope, so no bigger synchronization has to be done.
+ * the pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * shadowmem_initialize_mapping - Initalize the shadow mapping
+ *
+ * most parts of the shadow mapping can be mapped upon boot time.
+ * only the thread stacks have to be mapped on runtime.
+ * the mapped regions are not unmapped at all.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb4..aeeabb9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ // clone the shadow pgd part as well
+ memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 62b77592..550b473 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_KAISER
+ // We know that a pgd is page aligned.
+ // Therefore the lower indices have to be mapped to user space.
+ // These pages are mapped to the shadow mapping.
+ if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+
+ pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8b4de22..00fecbb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -45,7 +45,11 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -119,7 +123,11 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1be64da..26d3505 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -339,7 +339,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -366,6 +366,11 @@ union irq_stack_union {
char gs_base[40];
unsigned long stack_canary;
};
+
+ struct {
+ char irq_stack_pointer[64];
+ char unused[IRQ_STACK_SIZE - 64];
+ };
};
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9bab7a8..cfce6a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -1270,7 +1270,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89ca..9ff875a 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+ // add the esp stack pud to the shadow mapping here.
+ // This can be done directly, because the fixup stack has its own pud
+ set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b467b14..ea43ac3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -410,6 +410,14 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -419,7 +427,7 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -429,10 +437,10 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
- .fill 512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+ .fill 2*512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1..f480b38 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b615a11..3ebc9f6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -40,7 +40,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b84..682c162 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 0000000..db58838
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,172 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * Get the real ppn from a address in kernel mapping.
+ * @param address The virtual adrress
+ * @return the physical address
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(address);
+ BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+ pud = pud_offset(pgd, address);
+ BUG_ON(pud_none(*pud));
+
+ if(pud_large(*pud))
+ {
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+ }
+
+ pmd = pmd_offset(pud, address);
+ BUG_ON(pmd_none(*pmd));
+
+ if(pmd_large(*pmd))
+ {
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ BUG_ON(pte_none(*pte));
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+ unsigned long flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long address;
+ unsigned long end_addr = start_addr + size;
+ unsigned long target_address;
+
+ for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+ address < PAGE_ALIGN(end_addr); address += PAGE_SIZE)
+ {
+ target_address = get_pa_from_mapping(address);
+
+ pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+ BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, address);
+ if(pud_none(*pud))
+ {
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+ }
+ BUG_ON(pud_large(*pud));
+
+ pmd = pmd_offset(pud, address);
+ if(pmd_none(*pmd))
+ {
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+ }
+ BUG_ON(pmd_large(*pmd));
+
+ pte = pte_offset_kernel(pmd, address);
+ if(pte_none(*pte))
+ {
+ set_pte(pte, __pte(flags | target_address));
+ }
+ else
+ {
+ BUG_ON(__pa(pte_page(*pte)) != target_address);
+ }
+ }
+}
+
+// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
+static inline void __init _kaiser_init(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++)
+ {
+ set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+ }
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+void __init kaiser_init(void)
+{
+ int cpu;
+ spin_lock_init(&shadow_table_lock);
+
+ spin_lock(&shadow_table_lock);
+
+ _kaiser_init();
+
+ for_each_possible_cpu(cpu)
+ {
+ // map the per cpu user variables
+ _kaiser_copy(
+ (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+ (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+ __PAGE_KERNEL);
+ }
+
+ // map the entry/exit text section, which is responsible to switch between user- and kernel mode
+ _kaiser_copy(
+ (unsigned long) __entry_text_start,
+ (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+ __PAGE_KERNEL_RX);
+
+ // the fixed map address of the idt_table
+ _kaiser_copy(
+ (unsigned long) idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+
+ spin_unlock(&shadow_table_lock);
+}
+
+// add a mapping to the shadow-mapping, and synchronize the mappings
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ spin_lock(&shadow_table_lock);
+ _kaiser_copy(addr, size, flags);
+ spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ spin_lock(&shadow_table_lock);
+ do
+ {
+ unmap_pud_range(pgd, start, start + size);
+ }
+ while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+ spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 5a287e5..420df2c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5a..833ab5f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd)
#else
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#ifdef CONFIG_KAISER
+ // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+ // block. Therefore, we have to allocate at least 3 pages. However, the
+ // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+ // the beginning of the page of our 8kb-aligned memory block in order to
+ // correctly free it afterwars.
+
+ unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+ if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+ {
+ *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+ return (pgd_t *) pages;
+ }
+ else
+ {
+ *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+ return (pgd_t *) (pages + PAGE_SIZE);
+ }
+#else
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
}
static inline void _pgd_free(pgd_t *pgd)
{
+#ifdef CONFIG_KAISER
+ unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+ free_pages(pages, get_order(4*PAGE_SIZE));
+#else
free_page((unsigned long)pgd);
+#endif
}
#endif /* CONFIG_X86_PAE */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0968d13..eea0fc1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -765,7 +765,16 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
- *(.data..percpu..first) \
+ \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299..8ea945f 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,6 +182,16 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c
index b0c9d6f..0b48d95 100644
--- a/init/main.c
+++ b/init/main.c
@@ -83,11 +83,13 @@
#include <linux/io.h>
#include <linux/cache.h>
+#include <asm/cmdline.h>
#include <asm/io.h>
#include <asm/bugs.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
+#include <asm/kaiser.h>
static int kernel_init(void *);
@@ -477,6 +479,9 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+#ifdef CONFIG_KAISER
+ kaiser_init();
+#endif
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..491eb8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
#endif
}
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
static inline void free_thread_stack(struct task_struct *tsk)
{
+#ifdef CONFIG_KAISER
+ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+#endif
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
unsigned long flags;
@@ -470,6 +474,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -497,6 +502,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* functions again.
*/
tsk->stack = stack;
+#ifdef CONFIG_KAISER
+ kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
diff --git a/security/Kconfig b/security/Kconfig
index 118f454..f515ac3 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,6 +30,13 @@ config SECURITY
model will be used.
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ depends on X86_64
+ depends on !PARAVIRT
+ help
+ This enforces a strict kernel and user space isolation in order to close
+ hardware side channels on kernel address information.
config SECURITYFS
bool "Enable the securityfs filesystem"
--
2.9.3
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 10:02 [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Daniel Gruss
2017-05-04 12:26 ` Daniel Gruss
@ 2017-05-04 15:28 ` Thomas Garnier
2017-05-05 8:23 ` Daniel Gruss
2017-05-04 15:47 ` Christoph Hellwig
2017-05-05 15:49 ` [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel " Jann Horn
3 siblings, 1 reply; 28+ messages in thread
From: Thomas Garnier @ 2017-05-04 15:28 UTC (permalink / raw)
To: Daniel Gruss
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On Thu, May 4, 2017 at 3:02 AM, Daniel Gruss
<daniel.gruss@iaik.tugraz.at> wrote:
> After several recent works [1,2,3] KASLR on x86_64 was basically considered
> dead by many researchers. We have been working on an efficient but effective
> fix for this problem and found that not mapping the kernel space when
> running in user mode is the solution to this problem [4] (the corresponding
> paper [5] will be presented at ESSoS17).
>
> With this RFC patch we allow anybody to configure their kernel with the flag
> CONFIG_KAISER to add our defense mechanism.
>
> If there are any questions we would love to answer them.
> We also appreciate any comments!
>
> Cheers,
> Daniel (+ the KAISER team from Graz University of Technology)
>
> [1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
> [2]
> https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
> [3]
> https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
> [4] https://github.com/IAIK/KAISER
> [5] https://gruss.cc/files/kaiser.pdf
>
>
Please read the documentation on submitting patches [1] and coding style [2].
I have two questions:
- How this approach prevent the hardware attacks you mentioned? You
still have to keep a part of _text in the pagetable and an attacker
could discover it no? (and deduce the kernel base address). You also
need to make it clear that btb attacks are still possible.
- What is the perf impact?
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/coding-style.rst
Thanks,
--
Thomas
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 10:02 [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Daniel Gruss
2017-05-04 12:26 ` Daniel Gruss
2017-05-04 15:28 ` [kernel-hardening] " Thomas Garnier
@ 2017-05-04 15:47 ` Christoph Hellwig
2017-05-05 7:40 ` Daniel Gruss
2017-05-05 15:49 ` [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel " Jann Horn
3 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2017-05-04 15:47 UTC (permalink / raw)
To: Daniel Gruss
Cc: kernel list, kernel-hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, kirill.shutemov, Ingo Molnar,
anders.fogh
On Thu, May 04, 2017 at 12:02:47PM +0200, Daniel Gruss wrote:
> After several recent works [1,2,3] KASLR on x86_64 was basically considered
> dead by many researchers. We have been working on an efficient but effective
> fix for this problem and found that not mapping the kernel space when
> running in user mode is the solution to this problem [4] (the corresponding
> paper [5] will be presented at ESSoS17).
I'll try to read the paper. In the meantime: how different is your
approach from then one here?
https://lwn.net/Articles/39283/
and how different is the performance impact?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 15:47 ` Christoph Hellwig
@ 2017-05-05 7:40 ` Daniel Gruss
2017-05-07 20:20 ` [kernel-hardening] " Richard Weinberger
0 siblings, 1 reply; 28+ messages in thread
From: Daniel Gruss @ 2017-05-05 7:40 UTC (permalink / raw)
To: Christoph Hellwig
Cc: kernel list, kernel-hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, kirill.shutemov, Ingo Molnar,
anders.fogh
On 04.05.2017 17:47, Christoph Hellwig wrote:
> I'll try to read the paper. In the meantime: how different is your
> approach from then one here?
>
> https://lwn.net/Articles/39283/
>
> and how different is the performance impact?
The approach sounds very similar, but we have fewer changes because we
don't want to change memory allocation but only split the virtual memory
- everything can stay where it is.
We found that the CR3 switch seems to be significantly improved in
modern microarchitectures (we performed our performance tests on a
Skylake i7-6700K). We think the TLB maybe uses the full CR3 base address
as a tag, relaxing the necessity of flushing the entire TLB upon CR3
updates a bit.
Direct runtime overhead is switching the CR3, but that's it.
Indirectly, we're potentially increasing the number of TLB entries that
are required on one or the other level of the TLB. For TLB-intense tasks
this might lead to more significant performance penalties.
I'm sure the overhead on older systems is larger than on recent systems.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 15:28 ` [kernel-hardening] " Thomas Garnier
@ 2017-05-05 8:23 ` Daniel Gruss
2017-05-05 15:47 ` Thomas Garnier
2017-05-08 13:23 ` Daniel Gruss
0 siblings, 2 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-05 8:23 UTC (permalink / raw)
To: Thomas Garnier
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On 04.05.2017 17:28, Thomas Garnier wrote:
> Please read the documentation on submitting patches [1] and coding style [2].
I will have a closer look at that.
> - How this approach prevent the hardware attacks you mentioned? You
> still have to keep a part of _text in the pagetable and an attacker
> could discover it no? (and deduce the kernel base address).
These parts are moved to a different section (.user_mapped) which is at
a possibly predictable location - the location of the randomized parts
of the kernel is independent of the location of .user_mapped.
The code/data footprint for .user_mapped is quite small, helping to
reduce or eliminate the attack surface...
> You also need to make it clear that btb attacks are still possible.
By just increasing the KASLR randomization range, btb attacks can be
mitigated (for free).
> - What is the perf impact?
It will vary for different machines. We have promising results (<1%) for
an i7-6700K with representative benchmarks. However, for older systems
or for workloads with a lot of pressure on some TLB levels, the
performance may be much worse.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 8:23 ` Daniel Gruss
@ 2017-05-05 15:47 ` Thomas Garnier
2017-05-06 4:02 ` David Gens
2017-05-08 13:23 ` Daniel Gruss
1 sibling, 1 reply; 28+ messages in thread
From: Thomas Garnier @ 2017-05-05 15:47 UTC (permalink / raw)
To: Daniel Gruss
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On Fri, May 5, 2017 at 1:23 AM, Daniel Gruss
<daniel.gruss@iaik.tugraz.at> wrote:
>
> On 04.05.2017 17:28, Thomas Garnier wrote:
>>
>> Please read the documentation on submitting patches [1] and coding style [2].
>
>
> I will have a closer look at that.
>
>> - How this approach prevent the hardware attacks you mentioned? You
>> still have to keep a part of _text in the pagetable and an attacker
>> could discover it no? (and deduce the kernel base address).
>
>
> These parts are moved to a different section (.user_mapped) which is at a possibly predictable location - the location of the randomized parts of the kernel is independent of the location of .user_mapped.
> The code/data footprint for .user_mapped is quite small, helping to reduce or eliminate the attack surface...
>
If I get it right, it means you can leak the per-cpu address instead
of the kernel. Correct? That would be a problem because you can
elevate privilege by overwriting per-cpu variables. Leaking this
address means also defeating KASLR memory randomization [3] (cf paper
in the commit).
In theory you could put the code in the fixmap but you still have the
per-cpu variables and changing that is hard.
[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=021182e52fe01c1f7b126f97fd6ba048dc4234fd
>> You also need to make it clear that btb attacks are still possible.
>
>
> By just increasing the KASLR randomization range, btb attacks can be mitigated (for free).
Correct, I hope we can do that.
>
>> - What is the perf impact?
>
>
> It will vary for different machines. We have promising results (<1%) for an i7-6700K with representative benchmarks. However, for older systems or for workloads with a lot of pressure on some TLB levels, the performance may be much worse.
I think including performance data in both cases would be useful.
--
Thomas
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-04 10:02 [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Daniel Gruss
` (2 preceding siblings ...)
2017-05-04 15:47 ` Christoph Hellwig
@ 2017-05-05 15:49 ` Jann Horn
2017-05-05 15:53 ` Jann Horn
3 siblings, 1 reply; 28+ messages in thread
From: Jann Horn @ 2017-05-05 15:49 UTC (permalink / raw)
To: Daniel Gruss
Cc: kernel list, kernel-hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, kirill.shutemov, Ingo Molnar,
anders.fogh
On Thu, May 4, 2017 at 12:02 PM, Daniel Gruss
<daniel.gruss@iaik.tugraz.at> wrote:
> After several recent works [1,2,3] KASLR on x86_64 was basically considered
> dead by many researchers. We have been working on an efficient but effective
> fix for this problem and found that not mapping the kernel space when
> running in user mode is the solution to this problem [4] (the corresponding
> paper [5] will be presented at ESSoS17).
>
> With this RFC patch we allow anybody to configure their kernel with the flag
> CONFIG_KAISER to add our defense mechanism.
>
> If there are any questions we would love to answer them.
> We also appreciate any comments!
Why do you need this SWITCH_KERNEL_CR3_NO_STACK logic? It would
make sense if the kernel stacks weren't mapped, but if they weren't mapped,
I don't see how the entry_INT80_compat entry point could work at all - the
software interrupt itself already pushes values on the kernel stack. You could
maybe work around that using some sort of trampoline stack, but I don't see
anything like that. Am I missing something?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 15:49 ` [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel " Jann Horn
@ 2017-05-05 15:53 ` Jann Horn
2017-05-06 8:28 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: Jann Horn @ 2017-05-05 15:53 UTC (permalink / raw)
To: Daniel Gruss
Cc: kernel list, kernel-hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, kirill.shutemov, Ingo Molnar,
anders.fogh
On Fri, May 5, 2017 at 5:49 PM, Jann Horn <jannh@google.com> wrote:
> On Thu, May 4, 2017 at 12:02 PM, Daniel Gruss
> <daniel.gruss@iaik.tugraz.at> wrote:
>> After several recent works [1,2,3] KASLR on x86_64 was basically considered
>> dead by many researchers. We have been working on an efficient but effective
>> fix for this problem and found that not mapping the kernel space when
>> running in user mode is the solution to this problem [4] (the corresponding
>> paper [5] will be presented at ESSoS17).
>>
>> With this RFC patch we allow anybody to configure their kernel with the flag
>> CONFIG_KAISER to add our defense mechanism.
>>
>> If there are any questions we would love to answer them.
>> We also appreciate any comments!
>
> Why do you need this SWITCH_KERNEL_CR3_NO_STACK logic? It would
> make sense if the kernel stacks weren't mapped, but if they weren't mapped,
> I don't see how the entry_INT80_compat entry point could work at all - the
> software interrupt itself already pushes values on the kernel stack. You could
> maybe work around that using some sort of trampoline stack, but I don't see
> anything like that. Am I missing something?
Ah, I think I understand. The kernel stacks are mapped, but
cpu_current_top_of_stack isn't, so you can't find the stack until after the CR3
switch in the syscall handler?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 15:47 ` Thomas Garnier
@ 2017-05-06 4:02 ` David Gens
2017-05-06 8:38 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: David Gens @ 2017-05-06 4:02 UTC (permalink / raw)
To: Thomas Garnier
Cc: Daniel Gruss, kernel list, Kernel Hardening, clementine.maurice,
moritz.lipp, Michael Schwarz, Richard Fellner,
Kirill A. Shutemov, Ingo Molnar, anders.fogh
On 2017-05-05 17:47, Thomas Garnier wrote:
> On Fri, May 5, 2017 at 1:23 AM, Daniel Gruss
> <daniel.gruss@iaik.tugraz.at> wrote:
>>
>> On 04.05.2017 17:28, Thomas Garnier wrote:
>>>
>>> Please read the documentation on submitting patches [1] and coding
>>> style [2].
>>
>>
>> I will have a closer look at that.
>>
>>> - How this approach prevent the hardware attacks you mentioned? You
>>> still have to keep a part of _text in the pagetable and an attacker
>>> could discover it no? (and deduce the kernel base address).
>>
>>
>> These parts are moved to a different section (.user_mapped) which is
>> at a possibly predictable location - the location of the randomized
>> parts of the kernel is independent of the location of .user_mapped.
>> The code/data footprint for .user_mapped is quite small, helping to
>> reduce or eliminate the attack surface...
>>
>
> If I get it right, it means you can leak the per-cpu address instead
> of the kernel. Correct? That would be a problem because you can
> elevate privilege by overwriting per-cpu variables. Leaking this
> address means also defeating KASLR memory randomization [3] (cf paper
> in the commit).
>
> In theory you could put the code in the fixmap but you still have the
> per-cpu variables and changing that is hard.
>
> [3]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=021182e52fe01c1f7b126f97fd6ba048dc4234fd
(Chiming in here, since we worked on something similar)
Assuming that their patch indeed leaks per-cpu addresses.. it might not
necessarily
be required to change it. Since an adversary has to leak the per-cpu
addresses
based on timing information you can work around that by inserting dummy
entries
into the user mappings, with the goal of creating multiple candidate
addresses
that show an identical measurement. For instance, you can create one
entry for
every possible KASLR slot.
>>> You also need to make it clear that btb attacks are still possible.
>>
>>
>> By just increasing the KASLR randomization range, btb attacks can be
>> mitigated (for free).
>
> Correct, I hope we can do that.
>
>>
>>> - What is the perf impact?
>>
>>
>> It will vary for different machines. We have promising results (<1%)
>> for an i7-6700K with representative benchmarks. However, for older
>> systems or for workloads with a lot of pressure on some TLB levels,
>> the performance may be much worse.
>
> I think including performance data in both cases would be useful.
Best,
David
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 15:53 ` Jann Horn
@ 2017-05-06 8:28 ` Daniel Gruss
0 siblings, 0 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-06 8:28 UTC (permalink / raw)
To: Jann Horn
Cc: kernel list, kernel-hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, kirill.shutemov, Ingo Molnar,
anders.fogh
On 2017-05-05 17:53, Jann Horn wrote:
> Ah, I think I understand. The kernel stacks are mapped, but
> cpu_current_top_of_stack isn't, so you can't find the stack until after the CR3
> switch in the syscall handler?
That's the idea. Only the absolute minimum that is required for a
context switch remains mapped (+ it is mapped at an offset which does
not depend on KASLR -> we do not leak the KASLR offsets).
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-06 4:02 ` David Gens
@ 2017-05-06 8:38 ` Daniel Gruss
2017-05-08 10:21 ` Mark Rutland
2017-05-08 13:53 ` Daniel Gruss
0 siblings, 2 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-06 8:38 UTC (permalink / raw)
To: David Gens, Thomas Garnier
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On 2017-05-06 06:02, David Gens wrote:
> Assuming that their patch indeed leaks per-cpu addresses.. it might not
> necessarily
> be required to change it.
I think we're not leaking them (unless we still have some bug in our
code). The basic idea is that any part that is required for the context
switch is at a fixed location (unrelated to the location of code / data
/ per-cpu data / ...) and thus does not reveal any randomized offsets.
Then the attacker cannot gain any knowledge through the side channel
anymore.
For any attack the attacker could then only use the few KBs of memory
that cannot be unmapped because of the way x86 works. Hardening these
few KBs seems like an easier task than doing the same for the entire kernel.
(The best solution would of course be Intel introducing CR3A and CR3B
just like ARM has TTBR0 and TTBR1 - on ARM this entirely prevents any
prefetch / double-fault side-channel attacks.)
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 7:40 ` Daniel Gruss
@ 2017-05-07 20:20 ` Richard Weinberger
2017-05-07 21:45 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: Richard Weinberger @ 2017-05-07 20:20 UTC (permalink / raw)
To: Daniel Gruss
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
Daniel,
On Fri, May 5, 2017 at 9:40 AM, Daniel Gruss
<daniel.gruss@iaik.tugraz.at> wrote:
> I'm sure the overhead on older systems is larger than on recent systems.
Just did a quick test on my main KVM host, a 8 core Intel(R) Xeon(R)
CPU E3-1240 V2.
KVM guests are 4.10 w/o CONFIG_KAISER and kvmconfig without CONFIG_PARAVIRT.
Building a defconfig kernel within that guests is about 10% slower
when CONFIG_KAISER
is enabled.
Is this expected?
If it helps I can redo the same test also on bare metal.
--
Thanks,
//richard
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-07 20:20 ` [kernel-hardening] " Richard Weinberger
@ 2017-05-07 21:45 ` Daniel Gruss
2017-05-07 22:02 ` Richard Weinberger
0 siblings, 1 reply; 28+ messages in thread
From: Daniel Gruss @ 2017-05-07 21:45 UTC (permalink / raw)
To: Richard Weinberger
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
> Just did a quick test on my main KVM host, a 8 core Intel(R) Xeon(R)
> CPU E3-1240 V2.
> KVM guests are 4.10 w/o CONFIG_KAISER and kvmconfig without CONFIG_PARAVIRT.
> Building a defconfig kernel within that guests is about 10% slower
> when CONFIG_KAISER
> is enabled.
Thank you for testing it! :)
> Is this expected?
It sounds plausible. First, I would expect any form of virtualization to
increase the overhead. Second, for the processor (Ivy Bridge), I would
have expected even higher performance overheads. KAISER utilizes very
recent performance improvements in Intel processors...
> If it helps I can redo the same test also on bare metal.
I'm not sure how we proceed here and if this would help, because I don't
know what everyone expects.
KAISER definitely introduces an overhead, no doubt about that. How much
overhead it is depends on the specific hardware and may be very little
on recent architectures and more on older machines.
We are not proposing to enable KAISER by default, but to provide the
config option to allow easy integration into hardened kernels where
performance overheads may be acceptable (which depends on the specific
use case and the specific hardware).
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-07 21:45 ` Daniel Gruss
@ 2017-05-07 22:02 ` Richard Weinberger
2017-05-07 22:18 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: Richard Weinberger @ 2017-05-07 22:02 UTC (permalink / raw)
To: Daniel Gruss
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
Daniel,
Am 07.05.2017 um 23:45 schrieb Daniel Gruss:
>> Just did a quick test on my main KVM host, a 8 core Intel(R) Xeon(R)
>> CPU E3-1240 V2.
>> KVM guests are 4.10 w/o CONFIG_KAISER and kvmconfig without CONFIG_PARAVIRT.
>> Building a defconfig kernel within that guests is about 10% slower
>> when CONFIG_KAISER
>> is enabled.
>
> Thank you for testing it! :)
>
>> Is this expected?
>
> It sounds plausible. First, I would expect any form of virtualization to increase the overhead. Second, for the processor (Ivy Bridge), I would have expected even higher
> performance overheads. KAISER utilizes very recent performance improvements in Intel processors...
Ahh, *very* recent is the keyword then. ;)
I was a bit confused since in your paper the overhead is less than 1%.
What platforms did you test?
i.e. how does it perform on recent AMD systems?
Thanks,
//richard
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-07 22:02 ` Richard Weinberger
@ 2017-05-07 22:18 ` Daniel Gruss
2017-05-09 14:44 ` [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel " Fogh, Anders
0 siblings, 1 reply; 28+ messages in thread
From: Daniel Gruss @ 2017-05-07 22:18 UTC (permalink / raw)
To: Richard Weinberger
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
On 2017-05-08 00:02, Richard Weinberger wrote:
> Ahh, *very* recent is the keyword then. ;)
> I was a bit confused since in your paper the overhead is less than 1%.
Yes, only for very recent platforms (Skylake). While working on the
paper we were surprised that we found overheads that small.
> What platforms did you test?
We tested it on multiple platforms for stability, but we only ran longer
performance tests on different Skylake i7-6700K systems we mentioned in
the paper.
> i.e. how does it perform on recent AMD systems?
Unfortunately, we don't have any AMD systems at hand. I'm also not sure
how AMD is affected by the issue in the first place. Although unlikely,
there is the possibility that the problem of KASLR information leakage
through microarchitectural side channels might be Intel-specific.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-06 8:38 ` Daniel Gruss
@ 2017-05-08 10:21 ` Mark Rutland
2017-05-08 10:51 ` Daniel Gruss
2017-05-08 13:53 ` Daniel Gruss
1 sibling, 1 reply; 28+ messages in thread
From: Mark Rutland @ 2017-05-08 10:21 UTC (permalink / raw)
To: Daniel Gruss
Cc: David Gens, Thomas Garnier, kernel list, Kernel Hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
Hi,
On Sat, May 06, 2017 at 10:38:23AM +0200, Daniel Gruss wrote:
> On 2017-05-06 06:02, David Gens wrote:
> >Assuming that their patch indeed leaks per-cpu addresses.. it might not
> >necessarily
> >be required to change it.
>
> I think we're not leaking them (unless we still have some bug in our code).
> The basic idea is that any part that is required for the context switch is
> at a fixed location (unrelated to the location of code / data / per-cpu data
> / ...) and thus does not reveal any randomized offsets. Then the attacker
> cannot gain any knowledge through the side channel anymore.
> For any attack the attacker could then only use the few KBs of memory that
> cannot be unmapped because of the way x86 works. Hardening these few KBs
> seems like an easier task than doing the same for the entire kernel.
>
> (The best solution would of course be Intel introducing CR3A and CR3B just
> like ARM has TTBR0 and TTBR1 - on ARM this entirely prevents any prefetch /
> double-fault side-channel attacks.)
While it may be the case that in practice ARM systems do not have such a
side channel, I think that it is erroneous to believe that the
architectural TTBR{0,1} split ensures this.
The use of TTBR0 for user and TTBR1 for kernel is entirely a SW policy,
and not an architectural requirement. It is possible to map data in
TTBR1 which is accessible to userspace, and data in TTBR0 which is only
accessible by the kernel. In either case, this is determined by the page
tables themselves.
Given this, I think that the statements in the KAISER paper regarding
the TTBRs (in section 2.1) are not quite right. Architecturally,
permission checks and lookups cannot be elided based on the TTBR used.
Having two TTBRs does make it simpler to change the user/kernel address
spaces independently, however.
Thanks,
Mark.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-08 10:21 ` Mark Rutland
@ 2017-05-08 10:51 ` Daniel Gruss
2017-05-08 13:22 ` Mark Rutland
0 siblings, 1 reply; 28+ messages in thread
From: Daniel Gruss @ 2017-05-08 10:51 UTC (permalink / raw)
To: Mark Rutland
Cc: David Gens, Thomas Garnier, kernel list, Kernel Hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
> While it may be the case that in practice ARM systems do not have such a
> side channel, I think that it is erroneous to believe that the
> architectural TTBR{0,1} split ensures this.
>
> The use of TTBR0 for user and TTBR1 for kernel is entirely a SW policy,
> and not an architectural requirement. It is possible to map data in
> TTBR1 which is accessible to userspace, and data in TTBR0 which is only
> accessible by the kernel. In either case, this is determined by the page
> tables themselves.
Absolutely right, but TTBR0 and TTBR1 are usually used in this way.
> Given this, I think that the statements in the KAISER paper regarding
> the TTBRs (in section 2.1) are not quite right. Architecturally,
> permission checks and lookups cannot be elided based on the TTBR used.
As we say in section 2.1, they are "typically" used in this way, and this prevents the attacks. Not just the presence of
a second register, but the way how the two registers are used to split the translation tables for user and kernel.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-08 10:51 ` Daniel Gruss
@ 2017-05-08 13:22 ` Mark Rutland
2017-05-08 13:43 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: Mark Rutland @ 2017-05-08 13:22 UTC (permalink / raw)
To: Daniel Gruss
Cc: David Gens, Thomas Garnier, kernel list, Kernel Hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
On Mon, May 08, 2017 at 12:51:27PM +0200, Daniel Gruss wrote:
> >While it may be the case that in practice ARM systems do not have such a
> >side channel, I think that it is erroneous to believe that the
> >architectural TTBR{0,1} split ensures this.
> >
> >The use of TTBR0 for user and TTBR1 for kernel is entirely a SW policy,
> >and not an architectural requirement. It is possible to map data in
> >TTBR1 which is accessible to userspace, and data in TTBR0 which is only
> >accessible by the kernel. In either case, this is determined by the page
> >tables themselves.
>
> Absolutely right, but TTBR0 and TTBR1 are usually used in this way.
Sure; if we consider Linux, while userspace is executing, TTBR1 will
(only) contain kernel page tables and TTBR0 will (only) contain user
page tables.
However, as this is not an architectural requirement, the CPU cannot
know that a user access that gets translated via TTBR1 will fault, and
at some point must determine the permissions from the page tables as
required by the architecture.
> >Given this, I think that the statements in the KAISER paper regarding
> >the TTBRs (in section 2.1) are not quite right. Architecturally,
> >permission checks and lookups cannot be elided based on the TTBR used.
>
> As we say in section 2.1, they are "typically" used in this way, and
> this prevents the attacks. Not just the presence of a second
> register, but the way how the two registers are used to split the
> translation tables for user and kernel.
In practice, while userspace is executing, TTBR1 still points to kernel
page tables. If a user program attempts to access an address mapped via
TTBR1, the CPU has to attempt this translation via the TTBR1 page tables
and/or associated TLB entries.
Specifically, I think this does not align with the statement in 2.1
regarding the two TTBRs:
This simplifies privilege checks and does not require any address
translation for invalid memory accesses and thus no cache lookups.
... since the use of the TTBRs is orthogonal to privilege checks and/or
the design of the TLBs.
Thanks,
Mark.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-05 8:23 ` Daniel Gruss
2017-05-05 15:47 ` Thomas Garnier
@ 2017-05-08 13:23 ` Daniel Gruss
1 sibling, 0 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-08 13:23 UTC (permalink / raw)
To: Thomas Garnier
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On 05.05.2017 10:23, Daniel Gruss wrote:
>> - How this approach prevent the hardware attacks you mentioned? You
>> still have to keep a part of _text in the pagetable and an attacker
>> could discover it no? (and deduce the kernel base address).
>
> These parts are moved to a different section (.user_mapped) which is at a possibly predictable location - the location
> of the randomized parts of the kernel is independent of the location of .user_mapped.
> The code/data footprint for .user_mapped is quite small, helping to reduce or eliminate the attack surface...
We just discussed that in our group again: although we experimented with this part, it's not yet included in the patch.
The solution we sketched is, as I wrote, we map the required (per-thread) variables in the user CR3 to a fixed location
in memory. During the context switch, only this fixed part remains mapped but not the randomized pages. This is not a
lot of work, because it's just mapping a few more pages and fixing a 1 or 2 lines in the context switch.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-08 13:22 ` Mark Rutland
@ 2017-05-08 13:43 ` Daniel Gruss
0 siblings, 0 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-08 13:43 UTC (permalink / raw)
To: Mark Rutland
Cc: David Gens, Thomas Garnier, kernel list, Kernel Hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar, anders.fogh
On 08.05.2017 15:22, Mark Rutland wrote:
> Specifically, I think this does not align with the statement in 2.1
> regarding the two TTBRs:
>
> This simplifies privilege checks and does not require any address
> translation for invalid memory accesses and thus no cache lookups.
>
> ... since the use of the TTBRs is orthogonal to privilege checks and/or
> the design of the TLBs.
Ok, this is a good point, we will try to clarify this in the paper.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-06 8:38 ` Daniel Gruss
2017-05-08 10:21 ` Mark Rutland
@ 2017-05-08 13:53 ` Daniel Gruss
2017-05-08 14:09 ` Thomas Garnier
1 sibling, 1 reply; 28+ messages in thread
From: Daniel Gruss @ 2017-05-08 13:53 UTC (permalink / raw)
To: David Gens, Thomas Garnier
Cc: kernel list, Kernel Hardening, clementine.maurice, moritz.lipp,
Michael Schwarz, Richard Fellner, Kirill A. Shutemov,
Ingo Molnar, anders.fogh
On 06.05.2017 10:38, Daniel Gruss wrote:
> On 2017-05-06 06:02, David Gens wrote:
>> Assuming that their patch indeed leaks per-cpu addresses.. it might not
>> necessarily
>> be required to change it.
>
> I think we're not leaking them (unless we still have some bug in our code).
Just to correct my answer here as well: Although we experimented with fixed mappings for per-cpu addresses, the current
patch does not incorporate this yet, so it indeed still leaks. However, it is not a severe problem. The mapping of the
required (per-cpu) variables would be at a fixed location in the user CR3, instead of the ones that are used in the kernel.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-08 13:53 ` Daniel Gruss
@ 2017-05-08 14:09 ` Thomas Garnier
2017-05-08 14:19 ` Daniel Gruss
0 siblings, 1 reply; 28+ messages in thread
From: Thomas Garnier @ 2017-05-08 14:09 UTC (permalink / raw)
To: Daniel Gruss
Cc: David Gens, kernel list, Kernel Hardening, clementine.maurice,
moritz.lipp, Michael Schwarz, Richard Fellner,
Kirill A. Shutemov, Ingo Molnar, anders.fogh
On Mon, May 8, 2017 at 6:53 AM, Daniel Gruss
<daniel.gruss@iaik.tugraz.at> wrote:
> On 06.05.2017 10:38, Daniel Gruss wrote:
>>
>> On 2017-05-06 06:02, David Gens wrote:
>>>
>>> Assuming that their patch indeed leaks per-cpu addresses.. it might not
>>> necessarily
>>> be required to change it.
>>
>>
>> I think we're not leaking them (unless we still have some bug in our
>> code).
>
>
> Just to correct my answer here as well: Although we experimented with fixed
> mappings for per-cpu addresses, the current patch does not incorporate this
> yet, so it indeed still leaks. However, it is not a severe problem. The
> mapping of the required (per-cpu) variables would be at a fixed location in
> the user CR3, instead of the ones that are used in the kernel.
Why do you think it should be at a fixed location in the user CR3? I
see that you just mirror the entries. You also mirror
__entry_text_start / __entry_text_end which is part of the binary so
will leak the base address of the kernel. Maybe I am missing
something.
--
Thomas
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
2017-05-08 14:09 ` Thomas Garnier
@ 2017-05-08 14:19 ` Daniel Gruss
0 siblings, 0 replies; 28+ messages in thread
From: Daniel Gruss @ 2017-05-08 14:19 UTC (permalink / raw)
To: Thomas Garnier
Cc: David Gens, kernel list, Kernel Hardening, clementine.maurice,
moritz.lipp, Michael Schwarz, Richard Fellner,
Kirill A. Shutemov, Ingo Molnar, anders.fogh
On 08.05.2017 16:09, Thomas Garnier wrote:
>> Just to correct my answer here as well: Although we experimented with fixed
>> mappings for per-cpu addresses, the current patch does not incorporate this
>> yet, so it indeed still leaks. However, it is not a severe problem. The
>> mapping of the required (per-cpu) variables would be at a fixed location in
>> the user CR3, instead of the ones that are used in the kernel.
>
> Why do you think it should be at a fixed location in the user CR3? I
> see that you just mirror the entries. You also mirror
> __entry_text_start / __entry_text_end which is part of the binary so
> will leak the base address of the kernel. Maybe I am missing
> something.
As I said, the current patch does not incorporate this yet, so yes, this part currently still leaks because we did not
implement it yet.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel in user mode
2017-05-07 22:18 ` Daniel Gruss
@ 2017-05-09 14:44 ` Fogh, Anders
2017-05-09 14:57 ` Richard Weinberger
0 siblings, 1 reply; 28+ messages in thread
From: Fogh, Anders @ 2017-05-09 14:44 UTC (permalink / raw)
To: Daniel Gruss, Richard Weinberger
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar
>> i.e. how does it perform on recent AMD systems?
Sorry for the latency. Recent AMD is reported by Enrique Nissem to not
be vulnerable to the prefetch attack. TSX attack doesn't apply to AMD.
Hund, Willems & Holz wrote in 2013 that AMD was vulnerable to that
attack. The BTB is almost surely working in a different manner of
fashion if at all. So AMD may or may not be vulnerable to the DPF
attack, but none of the modern attacks should work - at least out of the
box.
Best regards,
Anders
____________
Virus checked by G Data MailSecurity
Version: AVA 25.12310 dated 09.05.2017
Virus news: www.antiviruslab.com
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel in user mode
2017-05-09 14:44 ` [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel " Fogh, Anders
@ 2017-05-09 14:57 ` Richard Weinberger
2017-05-09 15:30 ` Rik van Riel
0 siblings, 1 reply; 28+ messages in thread
From: Richard Weinberger @ 2017-05-09 14:57 UTC (permalink / raw)
To: Fogh, Anders, Daniel Gruss
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar
Am 09.05.2017 um 16:44 schrieb Fogh, Anders:
>>> i.e. how does it perform on recent AMD systems?
>
> Sorry for the latency. Recent AMD is reported by Enrique Nissem to not
> be vulnerable to the prefetch attack. TSX attack doesn't apply to AMD.
> Hund, Willems & Holz wrote in 2013 that AMD was vulnerable to that
> attack. The BTB is almost surely working in a different manner of
> fashion if at all. So AMD may or may not be vulnerable to the DPF
> attack, but none of the modern attacks should work - at least out of the
> box.
But the promoted patch will also run on AMD systems, that's why I asked
for the overhead.
Thanks,
//richard
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel in user mode
2017-05-09 14:57 ` Richard Weinberger
@ 2017-05-09 15:30 ` Rik van Riel
2017-10-31 23:28 ` Dave Hansen
0 siblings, 1 reply; 28+ messages in thread
From: Rik van Riel @ 2017-05-09 15:30 UTC (permalink / raw)
To: Richard Weinberger, Fogh, Anders, Daniel Gruss
Cc: Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar
[-- Attachment #1: Type: text/plain, Size: 964 bytes --]
On Tue, 2017-05-09 at 16:57 +0200, Richard Weinberger wrote:
> Am 09.05.2017 um 16:44 schrieb Fogh, Anders:
> > > > i.e. how does it perform on recent AMD systems?
> >
> > Sorry for the latency. Recent AMD is reported by Enrique Nissem to
> > not
> > be vulnerable to the prefetch attack. TSX attack doesn't apply to
> > AMD.
> > Hund, Willems & Holz wrote in 2013 that AMD was vulnerable to that
> > attack. The BTB is almost surely working in a different manner of
> > fashion if at all. So AMD may or may not be vulnerable to the DPF
> > attack, but none of the modern attacks should work - at least out
> > of the
> > box.
>
> But the promoted patch will also run on AMD systems, that's why I
> asked
> for the overhead.
Well, if it is a compile time switch, and the
overhead is unacceptable on everything but the
very latest Intel chips, chances are the code
will not be enabled in any distribution kernel.
--
All rights reversed
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 473 bytes --]
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel in user mode
2017-05-09 15:30 ` Rik van Riel
@ 2017-10-31 23:28 ` Dave Hansen
0 siblings, 0 replies; 28+ messages in thread
From: Dave Hansen @ 2017-10-31 23:28 UTC (permalink / raw)
To: Rik van Riel
Cc: Richard Weinberger, Fogh, Anders, Daniel Gruss,
Christoph Hellwig, kernel list, kernel-hardening,
clementine.maurice, moritz.lipp, Michael Schwarz,
Richard Fellner, Kirill A. Shutemov, Ingo Molnar
Hi Folks,
I've fixed some bugs and updated the KAISER patch set on top of the
work that was done here. My new version is posted here:
https://marc.info/?l=linux-kernel&m=150948911429162&w=2
^ permalink raw reply [flat|nested] 28+ messages in thread
end of thread, other threads:[~2017-10-31 23:28 UTC | newest]
Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-04 10:02 [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Daniel Gruss
2017-05-04 12:26 ` Daniel Gruss
2017-05-04 15:28 ` [kernel-hardening] " Thomas Garnier
2017-05-05 8:23 ` Daniel Gruss
2017-05-05 15:47 ` Thomas Garnier
2017-05-06 4:02 ` David Gens
2017-05-06 8:38 ` Daniel Gruss
2017-05-08 10:21 ` Mark Rutland
2017-05-08 10:51 ` Daniel Gruss
2017-05-08 13:22 ` Mark Rutland
2017-05-08 13:43 ` Daniel Gruss
2017-05-08 13:53 ` Daniel Gruss
2017-05-08 14:09 ` Thomas Garnier
2017-05-08 14:19 ` Daniel Gruss
2017-05-08 13:23 ` Daniel Gruss
2017-05-04 15:47 ` Christoph Hellwig
2017-05-05 7:40 ` Daniel Gruss
2017-05-07 20:20 ` [kernel-hardening] " Richard Weinberger
2017-05-07 21:45 ` Daniel Gruss
2017-05-07 22:02 ` Richard Weinberger
2017-05-07 22:18 ` Daniel Gruss
2017-05-09 14:44 ` [kernel-hardening] Re: [RFC, PATCH] x86_64: KAISER - do not mapkernel " Fogh, Anders
2017-05-09 14:57 ` Richard Weinberger
2017-05-09 15:30 ` Rik van Riel
2017-10-31 23:28 ` Dave Hansen
2017-05-05 15:49 ` [kernel-hardening] [RFC, PATCH] x86_64: KAISER - do not map kernel " Jann Horn
2017-05-05 15:53 ` Jann Horn
2017-05-06 8:28 ` Daniel Gruss
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).