* [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07  7:22 UTC
  To: x86
  Cc: linux-kernel, Borislav Petkov, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra, Andy Lutomirski

I think I like this approach.  I also think it might be nice to move the
whole cpu_entry_area into this new pgd range so that we can stop mucking
around with the fixmap.

TODO:
 - It crashes in ldt_gdt_64.  Not sure why.
 - 5-level docs aren't updated and the code is untested.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |  11 ++--
 arch/x86/include/asm/mmu_context.h      |  33 +++++++++-
 arch/x86/include/asm/pgtable_64_types.h |   2 +
 arch/x86/include/asm/processor.h        |  23 +++++--
 arch/x86/kernel/ldt.c                   | 109 +++++++++++++++++++++++++++++++-
 5 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
 	 */
 	struct desc_struct *entries;
 	unsigned int nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot).
+	 */
+	int slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
 /*
  * Used for LDT copy/destruction.
  */
 int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline int init_new_context_ldt(struct task_struct *tsk,
 				       struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				clear_LDT();
+				return;
+			}
+
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 #define ESPFIX_PGD_ENTRY _AC(-2, UL)
 #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
 #define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 #else
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..a0008fb26ba2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	__flush_tlb_all();
+	load_mm_ldt(mm);
 
 	refresh_ldt_segments();
 }
@@ -90,9 +90,93 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 	}
 
 	new_ldt->nr_entries = num_entries;
+	new_ldt->slot = -1;
 	return new_ldt;
 }
 
+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool is_vmalloc;
+	int i;
+	bool awful_hack;
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return 0;
+
+	WARN_ON(ldt->slot != -1);
+
+	/* Both LDT slots are contained in a single PMD. */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+	awful_hack = pgd_none(*pgd);
+
+	p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+	if (!p4d)
+		return -ENOMEM;
+
+	pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+	if (!pud)
+		return -ENOMEM;
+	pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+	if (!pmd)
+		return -ENOMEM;
+	if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+		return -ENOMEM;
+
+	if (true) {
+		/* Awful hack -- pti_set_user_pgd() is a mess.  We can't even
+		 * use the bool awful_hack: pti_init_all_pgds() poops on our pgd. */
+		kernel_to_user_pgdp(pgd)->pgd = pgd->pgd;
+	}
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+
+		pte = pte_offset_kernel(pmd, va);
+		set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+	}
+
+	flush_tlb_mm_range(mm,
+			   (unsigned long)ldt_slot_va(slot),
+			   (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+			   0);
+
+	ldt->slot = slot;
+
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -155,8 +239,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
 	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval)
+		goto out_free;
 
 	mm->context.ldt = new_ldt;
+	goto out_unlock;
+
+out_free:
+	free_ldt_pgtables(mm);
+	free_ldt_struct(new_ldt);
+	goto out_unlock;
 
 out_unlock:
 	mutex_unlock(&old_mm->context.lock);
@@ -174,6 +267,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -285,6 +383,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
-- 
2.13.6

* Re: [PATCH] LDT improvements
From: Borislav Petkov @ 2017-12-07 12:43 UTC
  To: Andy Lutomirski
  Cc: x86, linux-kernel, Brian Gerst, David Laight, Kees Cook, Peter Zijlstra

On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> I think I like this approach.  I also think it might be nice to move the
> whole cpu_entry_area into this new pgd range so that we can stop mucking
> around with the fixmap.

Yeah, and also, I don't like the idea of sacrificing a whole PGD
only for the LDT crap which is optional, even. Frankly - and this
is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07 17:08 UTC
  To: Borislav Petkov
  Cc: Andy Lutomirski, X86 ML, linux-kernel, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra

On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>> I think I like this approach.  I also think it might be nice to move the
>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>> around with the fixmap.
>
> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> only for the LDT crap which is optional, even. Frankly - and this
> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.

The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
constant address does bother me.  We could probably get away with RO
if we trapped and handled the nasty faults, but that could be very
problematic.

The version here:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776

actually seems to work.

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-07 17:23 UTC
  To: Andy Lutomirski
  Cc: Borislav Petkov, X86 ML, linux-kernel, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra

On Thu, 7 Dec 2017, Andy Lutomirski wrote:

> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> > On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >> I think I like this approach.  I also think it might be nice to move the
> >> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >> around with the fixmap.
> >
> > Yeah, and also, I don't like the idea of sacrificing a whole PGD
> > only for the LDT crap which is optional, even. Frankly - and this
> > is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> > CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
> 
> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
> constant address does bother me.  We could probably get away with RO
> if we trapped and handled the nasty faults, but that could be very
> problematic.

Where is the problem? You can map it RO into user space with the USER bit
cleared. The kernel knows how to access the real stuff.
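
Concretely -- just a sketch of the pgprot math, not the actual patch:
__PAGE_KERNEL_RO already lacks _PAGE_USER, so even reads through the
alias fault in user mode, while the CPU's implicit descriptor fetches
still work; dropping _PAGE_GLOBAL keeps the alias per-mm:

	set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));

The kernel keeps writing descriptors through its own ldt->entries
mapping.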

> The version here:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
> 
> actually seems to work.

The approach I've taken is to create a VMA and map it into user space with
the USER bit cleared. A little bit more effort code wise, but that avoids
all the page table muck and keeps it straight attached to the process.
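
Something along these lines, modeled on the vdso's special-mapping
API -- map_ldt_vma() and ldt_fault() are made-up names for the sketch,
not necessarily what the real series does:

	static const struct vm_special_mapping ldt_mapping = {
		.name	= "[ldt]",
		.fault	= ldt_fault,	/* hypothetical: installs RO, !_PAGE_USER PTEs */
	};

	static int map_ldt_vma(struct mm_struct *mm, unsigned long len)
	{
		struct vm_area_struct *vma;
		unsigned long addr;
		int ret = 0;

		if (down_write_killable(&mm->mmap_sem))
			return -EINTR;

		addr = get_unmapped_area(NULL, 0, len, 0, 0);
		if (IS_ERR_VALUE(addr)) {
			ret = addr;
			goto out;
		}

		/* VM_READ only: not writable, not expandable. */
		vma = _install_special_mapping(mm, addr, len, VM_READ,
					       &ldt_mapping);
		if (IS_ERR(vma))
			ret = PTR_ERR(vma);
	out:
		up_write(&mm->mmap_sem);
		return ret;
	}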

Will post once in a bit.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07 18:21 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Borislav Petkov, X86 ML, linux-kernel,
	Brian Gerst, David Laight, Kees Cook, Peter Zijlstra



> On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
>> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
>> 
>>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
>>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>>>> I think I like this approach.  I also think it might be nice to move the
>>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>>>> around with the fixmap.
>>> 
>>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
>>> only for the LDT crap which is optional, even. Frankly - and this
>>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
>>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
>> 
>> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
>> constant address does bother me.  We could probably get away with RO
>> if we trapped and handled the nasty faults, but that could be very
>> problematic.
> 
> Where is the problem? You can map it RO into user space with the USER bit
> cleared. The kernel knows how to access the real stuff.

Blows up when the CPU tries to set the accessed bit.
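
The accessed flag is bit 0 of the descriptor's 4-bit type field (bit
40 of the raw 8-byte descriptor).  On a segment register load the CPU
checks it and, if still clear, writes it back into the descriptor
table itself -- a hardware write into the LDT page, which faults when
the mapping is read-only.  In struct desc_struct terms, with a made-up
helper name:

	static inline bool ldt_desc_accessed(const struct desc_struct *d)
	{
		return d->type & 1;	/* the CPU sets this on segment load */
	}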

> 
>> The version here:
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
>> 
>> actually seems to work.
> 
> The approach I've taken is to create a VMA and map it into user space with
> the USER bit cleared. A little bit more effort code wise, but that avoids
> all the page table muck and keeps it straight attached to the process.
> 
> Will post once in a bit.

I don't love mucking with user address space.  I'm also quite nervous about putting it in or near anything that could pass an access_ok check, since we're totally screwed if the bad guys can figure out how to write to it.

> 
> Thanks,
> 
>    tglx
> 

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08  7:34 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Andy Lutomirski <luto@amacapital.net> wrote:

> 
> 
> > On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> >> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
> >> 
> >>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> >>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >>>> I think I like this approach.  I also think it might be nice to move the
> >>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >>>> around with the fixmap.
> >>> 
> >>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> >>> only for the LDT crap which is optional, even. Frankly - and this
> >>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> >>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
> >> 
> >> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
> >> constant address does bother me.  We could probably get away with RO
> >> if we trapped and handled the nasty faults, but that could be very
> >> problematic.
> > 
> > Where is the problem? You can map it RO into user space with the USER bit
> > cleared. The kernel knows how to access the real stuff.
> 
> Blows up when the CPU tries to set the accessed bit.

BTW., could we force the accessed bit to be always set, without breaking the ABI?

> > The approach I've taken is to create a VMA and map it into user space with
> > the USER bit cleared. A little bit more effort code wise, but that avoids
> > all the page table muck and keeps it straight attached to the process.
> > 
> > Will post once in a bit.
> 
> I don't love mucking with user address space.  I'm also quite nervous about 
> putting it in or near anything that could pass an access_ok check, since we're 
> totally screwed if the bad guys can figure out how to write to it.

Hm, robustness of the LDT address wrt. access_ok() is a valid concern.

Can we have vmas with high addresses, in the vmalloc space for example?
IIRC the GPU code has precedents in that area.

Since this is x86-64, limitation of the vmalloc() space is not an issue.

I like Thomas's solution:

 - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
   but with the system bit set.

 - That would be an advantage even for non-PTI kernels, because mmap() is probably 
   more randomized than kmalloc().

 - It would also be a cleaner approach all around, and would avoid the fixmap
   complications and the scheduler muckery.

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08  9:34 UTC
  To: Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Andy Lutomirski <luto@amacapital.net> wrote:
> > I don't love mucking with user address space.  I'm also quite nervous about 
> > putting it in or near anything that could pass an access_ok check, since we're 
> > totally screwed if the bad guys can figure out how to write to it.
> 
> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> 
> Can we have vmas with high addresses, in the vmalloc space for example?
> IIRC the GPU code has precedents in that area.
> 
> Since this is x86-64, limitation of the vmalloc() space is not an issue.
> 
> I like Thomas's solution:
> 
>  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
>    but with the system bit set.
> 
>  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
>    more randomized than kmalloc().

Randomization is pointless as long as you can get the LDT address in user
space, i.e. w/o UMIP.

>  - It would also be a cleaner approach all around, and would avoid the fixmap
>    complications and the scheduler muckery.

The error code of such an access is always 0x03. So I added a special
handler, which checks whether the address is in the LDT map range and
verifies that the access bit in the descriptor is 0. If that's the case it
sets it and returns. If not, the thing dies. That works.
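
In sketch form, with made-up helper names for the range check and the
descriptor lookup:

	/*
	 * #PF, error code 0x03 (present + write), address inside the LDT
	 * map: the CPU trying to set a clear accessed bit through the RO
	 * alias.  Fix it up through the kernel's writable mapping.
	 */
	static bool fixup_ldt_accessed_bit(struct mm_struct *mm,
					   unsigned long address,
					   unsigned long error_code)
	{
		struct desc_struct *desc;

		if (error_code != 0x03)
			return false;
		if (!ldt_addr_in_map_range(mm, address))	/* hypothetical */
			return false;

		desc = ldt_desc_from_fault_addr(mm, address);	/* hypothetical */
		if (desc->type & 1)
			return false;		/* already set: real fault, die */

		desc->type |= 1;		/* set it and retry the insn */
		return true;
	}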

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08  9:44 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > putting it in or near anything that could pass an access_ok check, since we're 
> > > totally screwed if the bad guys can figure out how to write to it.
> > 
> > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > 
> > Can we have vmas with high addresses, in the vmalloc space for example?
> > IIRC the GPU code has precedents in that area.
> > 
> > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > 
> > I like Thomas's solution:
> > 
> >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> >    but with the system bit set.
> > 
> >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> >    more randomized than kmalloc().
> 
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.

But with UMIP unprivileged user-space won't be able to get the linear address of 
the LDT. Now it's written out in /proc/self/maps.

> >  - It would also be a cleaner approach all around, and would avoid the fixmap
> >    complications and the scheduler muckery.
> 
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.

Are SMP races possible? For example two threads both triggering the accessed bit 
fault, but only one of them succeeding in setting it. The other thread should not 
die in this case, right?

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08  9:55 UTC
  To: Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > > putting it in or near anything that could pass an access_ok check, since we're 
> > > > totally screwed if the bad guys can figure out how to write to it.
> > > 
> > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > > 
> > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > IIRC the GPU code has precedents in that area.
> > > 
> > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > > 
> > > I like Thomas's solution:
> > > 
> > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> > >    but with the system bit set.
> > > 
> > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> > >    more randomized than kmalloc().
> > 
> > Randomization is pointless as long as you can get the LDT address in user
> > space, i.e. w/o UMIP.
> 
> But with UMIP unprivileged user-space won't be able to get the linear address of 
> the LDT. Now it's written out in /proc/self/maps.

We can expose it nameless like other VMAs, but then it's 128k sized so it
can be figured out. But when it's RO then it's not really a problem, even
the kernel can't write to it.

> > >  - It would also be a cleaner approach all around, and would avoid the fixmap
> > >    complications and the scheduler muckery.
> > 
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> Are SMP races possible? For example two threads both triggering the accessed bit 
> fault, but only one of them succeeding in setting it. The other thread should not 
> die in this case, right?

Right. I'm trying to figure out whether there is a way to reliably detect
that write access bit mode.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08 11:31 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > > > putting it in or near anything that could pass an access_ok check, since we're 
> > > > > totally screwed if the bad guys can figure out how to write to it.
> > > > 
> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > > > 
> > > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > > IIRC the GPU code has precedents in that area.
> > > > 
> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > > > 
> > > > I like Thomas's solution:
> > > > 
> > > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> > > >    but with the system bit set.
> > > > 
> > > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> > > >    more randomized than kmalloc().
> > > 
> > > Randomization is pointless as long as you can get the LDT address in user
> > > space, i.e. w/o UMIP.
> > 
> > But with UMIP unprivileged user-space won't be able to get the linear address of 
> > the LDT. Now it's written out in /proc/self/maps.
> 
> We can expose it nameless like other VMAs, but then it's 128k sized so it
> can be figured out. But when it's RO then it's not really a problem, even
> the kernel can't write to it.

Yeah, ok. I don't think we should hide it - if it's in the vma space it should be 
listed in the 'maps' file, and with a descriptive name.

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 13:20 UTC
  To: Thomas Gleixner
  Cc: Ingo Molnar, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra



> On Dec 8, 2017, at 1:34 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> * Andy Lutomirski <luto@amacapital.net> wrote:
>>> I don't love mucking with user address space.  I'm also quite nervous about 
>>> putting it in or near anything that could pass an access_ok check, since we're 
>>> totally screwed if the bad guys can figure out how to write to it.
>> 
>> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>> 
>> Can we have vmas with high addresses, in the vmalloc space for example?
>> IIRC the GPU code has precedents in that area.
>> 
>> Since this is x86-64, limitation of the vmalloc() space is not an issue.
>> 
>> I like Thomas's solution:
>> 
>> - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
>>   but with the system bit set.
>> 
>> - That would be an advantage even for non-PTI kernels, because mmap() is probably 
>>   more randomized than kmalloc().
> 
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.

You only get the LDT selector, not the address.
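
All the unprivileged sldt instruction hands back is the selector:

	unsigned short sel;
	asm volatile ("sldt %0" : "=rm" (sel));	/* selector, not the base */

so the linear address stays hidden even without UMIP.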

> 
>> - It would also be a cleaner approach all around, and would avoid the fixmap
>>   complications and the scheduler muckery.
> 
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.

What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed segment?  Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?

> 
> Thanks,
> 
>    tglx

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 13:55 UTC
  To: 'Andy Lutomirski', Thomas Gleixner
  Cc: Ingo Molnar, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, Kees Cook, Peter Zijlstra

From: Andy Lutomirski
> Sent: 08 December 2017 13:20
...
> >> - It would also be a cleaner approach all around, and would avoid the fixmap
> >>   complications and the scheduler muckery.
> >
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed
> segment?
> Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?

Well, the cpu will fault in kernel on the 'pop %xs' or 'iret' instruction.
These all (probably) happen on the kernel stack with the usergs loaded.
So the fault handler has to look at the opcodes and/or %pc value, sort
out the stack (etc) and then generate SIGSEGV.

I'm not sure the kernel needs to know why the segment selector is invalid.

	David

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 14:06 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> > 
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> What if you are in kernel mode and try to return to a context with SS
> or CS pointing to a non-accessed segment?  Or what if you try to
> schedule to a context with fs or, worse, gs pointing to such a
> segment?

How would that be different from setting a 'crap' GS in modify_ldt() and
then returning from the syscall? That is something we should be able to
deal with already, no?

Is this something ldt_gdt.c already tests? The current "Test GS" is in
test_gdt_invalidation() which seems to suggest not.

Could we get a testcase for the exact situation you worry about? I'm not
sure I'd trust myself to get it right, all this LDT magic is new to me.

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 16:20 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 03:06:54PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> > > 
> > > The error code of such an access is always 0x03. So I added a special
> > > handler, which checks whether the address is in the LDT map range and
> > > verifies that the access bit in the descriptor is 0. If that's the case it
> > > sets it and returns. If not, the thing dies. That works.
> > 
> > What if you are in kernel mode and try to return to a context with SS
> > or CS pointing to a non-accessed segment?  Or what if you try to
> > schedule to a context with fs or, worse, gs pointing to such a
> > segment?
> 
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
> 
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
> 
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.

I ended up with the below; that loads something in LDT-2, sets GS to
LDT-2 and then again loads that same thing in LDT-2.

AFAIU calling modify_ldt will clear that ACCESSED bit, so this would
then trigger that on the return to user from modify_ldt, no?

Seems to work with tglx's patches.


diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 66e5ce5b91f0..d46a620c3734 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -242,6 +242,36 @@ static void fail_install(struct user_desc *desc)
 	}
 }
 
+static void do_ldt_gs_test(void)
+{
+	unsigned short prev_sel, sel = (2 << 3) | (1 << 2) | 3;
+
+	low_user_desc->entry_number = 2;
+
+	safe_modify_ldt(1, low_user_desc, sizeof(*low_user_desc));
+
+	/*
+	 * syscall (eax) 123 - modify_ldt
+	 *         (ebx)     - func
+	 *         (ecx)     - ptr
+	 *         (edx)     - bytecount
+	 */
+
+	int eax = 123;
+	int ebx = 1;
+	int ecx = (unsigned int)low_user_desc;
+	int edx = sizeof(struct user_desc);
+
+	asm volatile ("movw %%gs, %[prev_sel]\n\t"
+		      "movw %[sel], %%gs\n\t"
+		      "int $0x80\n\t"
+		      "mov %[prev_sel], %%gs"
+		      : [prev_sel] "=&R" (prev_sel), [sel] "+R" (sel), "+a" (eax)
+		      : "b" (ebx), "c" (ecx), "d" (edx)
+		      : INT80_CLOBBERS);
+
+}
+
 static void do_simple_tests(void)
 {
 	struct user_desc desc = {
@@ -919,6 +946,8 @@ int main(int argc, char **argv)
 	setup_counter_page();
 	setup_low_user_desc();
 
+	do_ldt_gs_test();
+
 	do_simple_tests();
 
 	do_multicpu_tests();

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:33 UTC
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 8, 2017 at 6:06 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
>> >
>> > The error code of such an access is always 0x03. So I added a special
>> > handler, which checks whether the address is in the LDT map range and
>> > verifies that the access bit in the descriptor is 0. If that's the case it
>> > sets it and returns. If not, the thing dies. That works.
>>
>> What if you are in kernel mode and try to return to a context with SS
>> or CS pointing to a non-accessed segment?  Or what if you try to
>> schedule to a context with fs or, worse, gs pointing to such a
>> segment?
>
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
>
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
>
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.

#GP on IRET is a failure, and we have disgusting code to handle it.
#PF on IRET would not be a failure -- it's a case where IRET should be
retried.  Our crap that fixes up #GP would get that wrong and leave us
with the wrong GSBASE.

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:38 UTC
  To: Ingo Molnar
  Cc: Thomas Gleixner, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, Dec 8, 2017 at 3:31 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > * Thomas Gleixner <tglx@linutronix.de> wrote:
>> >
>> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > > > * Andy Lutomirski <luto@amacapital.net> wrote:
>> > > > > I don't love mucking with user address space.  I'm also quite nervous about
>> > > > > putting it in or near anything that could pass an access_ok check, since we're
>> > > > > totally screwed if the bad guys can figure out how to write to it.
>> > > >
>> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>> > > >
>> > > > Can we have vmas with high addresses, in the vmalloc space for example?
>> > > > IIRC the GPU code has precedents in that area.
>> > > >
>> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
>> > > >
>> > > > I like Thomas's solution:
>> > > >
>> > > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
>> > > >    but with the system bit set.
>> > > >
>> > > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably
>> > > >    more randomized than kmalloc().
>> > >
>> > > Randomization is pointless as long as you can get the LDT address in user
>> > > space, i.e. w/o UMIP.
>> >
>> > But with UMIP unprivileged user-space won't be able to get the linear address of
>> > the LDT. Now it's written out in /proc/self/maps.
>>
>> We can expose it nameless like other VMAs, but then it's 128k sized so it
>> can be figured out. But when it's RO then it's not really a problem, even
>> the kernel can't write to it.
>
> Yeah, ok. I don't think we should hide it - if it's in the vma space it should be
> listed in the 'maps' file, and with a descriptive name.
>
> Thanks,
>
>         Ingo

Can we take a step back here?  I think there are four vaguely sane
ways to make the LDT work:

1. The way it is right now -- in vmalloc space.  The only real
downside is that it requires exposing that part of vmalloc space in
the user tables, which is a bit gross.

2. In some fixmap-like space, which is what my patch does, albeit
buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
but that's not so bad.

3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
functionally almost identical to #2, except that there's more concern
about exploits that write past TASK_SIZE_MAX.

4. In an actual vma.  I don't see the benefit of doing this at all --
it's just like #2 except way more error prone.  Hell, you have to make
sure that you can't munmap or mremap it, which isn't a consideration
at all with the other choices.

Why all the effort to make #4 work?  #1 is working fine right now, and
#2 is half-implemented.  #3 code-wise looks just like #2 except for
the choice of address and the interaction with PTI's shitty PGD
handling.

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 16:46 UTC
  To: 'Andy Lutomirski', Peter Zijlstra
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, Kees Cook

From: Andy Lutomirski
> Sent: 08 December 2017 16:34

> #GP on IRET is a failure, and we have disgusting code to handle it.

Is that the trap in kernel space when the on-stack segment registers
are invalid?
Definitely needs horrid code...

> #PF on IRET would not be a failure -- it's a case where IRET should be
> retried.  Our crap that fixes up #GP would get that wrong and leave us
> with the wrong GSBASE.

If the user code page isn't present then the fault happens after the
return to user mode, not on the IRET instruction in kernel mode.
So it is not really any different to returning to a NOP at the end
of a resident page when the page following is absent.
(Or any other invalid %ip value.)

SWAPGS is a PITA, should have been SAVEGS, LOAD_KERNEL_GS, and READ_SAVED_GS.

	David

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:47 UTC
  To: David Laight
  Cc: Andy Lutomirski, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, X86 ML, linux-kernel, Brian Gerst, Kees Cook

On Fri, Dec 8, 2017 at 8:46 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Andy Lutomirski
>> Sent: 08 December 2017 16:34
>
>> #GP on IRET is a failure, and we have disgusting code to handle it.
>
> Is that the trap in kernel space when the on-stack segment registers
> are invalid?
> Definitely needs horrid code...
>
>> #PF on IRET would not be a failure -- it's a case where IRET should be
>> retried.  Our crap that fixes up #GP would get that wrong and leave us
>> with the wrong GSBASE.
>
> If the user code page isn't present then the fault happens after the
> return to user mode, not on the IRET instruction in kernel mode.
> So it is not really any different to returning to a NOP at the end
> of a resident page when the page following is absent.
> (Or any other invalid %ip value.)

I mean: if the user CS or SS is not accessed and the LDT is RO, then
we get #PF on the IRET instruction, I think.  Dealing with that is
truly awful.

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 17:29 UTC
  To: 'Andy Lutomirski'
  Cc: Peter Zijlstra, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, Kees Cook

From: Andy Lutomirski
> Sent: 08 December 2017 16:48
...
> I mean: if the user CS or SS is not accessed and the LDT is RO, then
> we get #PF on the IRET instruction, I think.  Dealing with that is
> truly awful.

Any fault in-kernel on the IRET is horrid.
Doesn't really matter which one.
Same goes for the 'pop %ds' (etc) that tend to precede it.

	David

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08 17:37 UTC
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, X86 ML, linux-kernel, Brian Gerst,
	David Laight, Kees Cook, Peter Zijlstra

On Fri, 8 Dec 2017, Andy Lutomirski wrote:
> Can we take a step back here?  I think there are four vaguely sane
> ways to make the LDT work:
> 
> 1. The way it is right now -- in vmalloc space.  The only real
> downside is that it requires exposing that part of vmalloc space in
> the user tables, which is a bit gross.
> 
> 2. In some fixmap-like space, which is what my patch does, albeit
> buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
> but that's not so bad.
> 
> 3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
> functionally almost identical to #2, except that there's more concern
> about exploits that write past TASK_SIZE_MAX.
> 
> 4. In an actual vma.  I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone.  Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.

Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
die. You get what you asked for.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 17:42 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Ingo Molnar, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, Dec 8, 2017 at 9:37 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Fri, 8 Dec 2017, Andy Lutomirski wrote:
>> Can we take a step back here?  I think there are four vaguely sane
>> ways to make the LDT work:
>>
>> 1. The way it is right now -- in vmalloc space.  The only real
>> downside is that it requires exposing that part of vmalloc space in
>> the user tables, which is a bit gross.
>>
>> 2. In some fixmap-like space, which is what my patch does, albeit
>> buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
>> but that's not so bad.
>>
>> 3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
>> functionally almost identical to #2, except that there's more concern
>> about exploits that write past TASK_SIZE_MAX.
>>
>> 4. In an actual vma.  I don't see the benefit of doing this at all --
>> it's just like #2 except way more error prone.  Hell, you have to make
>> sure that you can't munmap or mremap it, which isn't a consideration
>> at all with the other choices.
>
> Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
> die. You get what you asked for.

But if you unmap ldt and then map something else at the same VA, you're root.

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 17:48 UTC
  To: Andy Lutomirski
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 08:38:26AM -0800, Andy Lutomirski wrote:

> 4. In an actual vma.  I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone.  Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.

mremap is trivially disabled. I've not tried munmap() yet, as long as it
just kills the process doing it we're good of course. Otherwise we need
an extra callback in do_munmap() which isn't too hard.

> Why all the effort to make #4 work?

Seemed like a sensible approach; I really dislike wasting an entire pmd
or whatever on a feature 'nobody' ever uses anyway.

> #1 is working fine right now

doesn't work for pti in its current form.
