* [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07  7:22 UTC
  To: x86
  Cc: linux-kernel, Borislav Petkov, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra, Andy Lutomirski

I think I like this approach.  I also think it might be nice to move the
whole cpu_entry_area into this new pgd range so that we can stop mucking
around with the fixmap.

TODO:
 - It crashes in ldt_gdt_64.  Not sure why.
 - 5-level docs aren't updated and the code is untested.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |  11 ++--
 arch/x86/include/asm/mmu_context.h      |  33 +++++++++-
 arch/x86/include/asm/pgtable_64_types.h |   2 +
 arch/x86/include/asm/processor.h        |  23 +++++--
 arch/x86/kernel/ldt.c                   | 109 +++++++++++++++++++++++++++++++-
 5 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
 	 */
 	struct desc_struct *entries;
 	unsigned int nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot).
+	 */
+	int slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
 /*
  * Used for LDT copy/destruction.
  */
 int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline int init_new_context_ldt(struct task_struct *tsk,
 				       struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				clear_LDT();
+				return;
+			}
+
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 #define ESPFIX_PGD_ENTRY _AC(-2, UL)
 #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
 #define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 #else
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..a0008fb26ba2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	__flush_tlb_all();
+	load_mm_ldt(mm);
 
 	refresh_ldt_segments();
 }
@@ -90,9 +90,93 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 	}
 
 	new_ldt->nr_entries = num_entries;
+	new_ldt->slot = -1;
 	return new_ldt;
 }
 
+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool is_vmalloc;
+	int i;
+	bool awful_hack;
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return 0;
+
+	WARN_ON(ldt->slot != -1);
+
+	/* Both LDT slots are contained in a single PMD. */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+	awful_hack = pgd_none(*pgd);
+
+	p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+	if (!p4d)
+		return -ENOMEM;
+
+	pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+	if (!pud)
+		return -ENOMEM;
+	pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+	if (!pmd)
+		return -ENOMEM;
+	if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+		return -ENOMEM;
+
+	if (true) {
+		/* Awful hack -- pti_set_user_pgd() is a mess.  We can't even
+		 * use the bool awful_hack: pti_init_all_pgds() poops on our pgd. */
+		kernel_to_user_pgdp(pgd)->pgd = pgd->pgd;
+	}
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+
+		pte = pte_offset_kernel(pmd, va);
+		set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+	}
+
+	flush_tlb_mm_range(mm,
+			   (unsigned long)ldt_slot_va(slot),
+			   (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+			   0);
+
+	ldt->slot = slot;
+
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -155,8 +239,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
 	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval)
+		goto out_free;
 
 	mm->context.ldt = new_ldt;
+	goto out_unlock;
+
+out_free:
+	free_ldt_pgtables(mm);
+	free_ldt_struct(new_ldt);
+	goto out_unlock;
 
 out_unlock:
 	mutex_unlock(&old_mm->context.lock);
@@ -174,6 +267,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -285,6 +383,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
-- 
2.13.6

* Re: [PATCH] LDT improvements
From: Borislav Petkov @ 2017-12-07 12:43 UTC
  To: Andy Lutomirski
  Cc: x86, linux-kernel, Brian Gerst, David Laight, Kees Cook, Peter Zijlstra

On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> I think I like this approach.  I also think it might be nice to move the
> whole cpu_entry_area into this new pgd range so that we can stop mucking
> around with the fixmap.

Yeah, and also, I don't like the idea of sacrificing a whole PGD
only for the LDT crap which is optional, even. Frankly - and this
is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07 17:08 UTC
  To: Borislav Petkov
  Cc: Andy Lutomirski, X86 ML, linux-kernel, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra

On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>> I think I like this approach.  I also think it might be nice to move the
>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>> around with the fixmap.
>
> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> only for the LDT crap which is optional, even. Frankly - and this
> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.

The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
constant address does bother me.  We could probably get away with RO
if we trapped and handled the nasty faults, but that could be very
problematic.

The version here:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776

actually seems to work.

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-07 17:23 UTC
  To: Andy Lutomirski
  Cc: Borislav Petkov, X86 ML, linux-kernel, Brian Gerst, David Laight,
	Kees Cook, Peter Zijlstra

On Thu, 7 Dec 2017, Andy Lutomirski wrote:

> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> > On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >> I think I like this approach.  I also think it might be nice to move the
> >> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >> around with the fixmap.
> >
> > Yeah, and also, I don't like the idea of sacrificing a whole PGD
> > only for the LDT crap which is optional, even. Frankly - and this
> > is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> > CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
> 
> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
> constant address does bother me.  We could probably get away with RO
> if we trapped and handled the nasty faults, but that could be very
> problematic.

Where is the problem? You can map it RO into user space with the USER bit
cleared. The kernel knows how to access the real stuff.
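
Concretely -- just a sketch of the pgprot math, not the actual patch:
__PAGE_KERNEL_RO already lacks _PAGE_USER, so even reads through the
alias fault in user mode, while the CPU's implicit descriptor fetches
still work; dropping _PAGE_GLOBAL keeps the alias per-mm:

	set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));

The kernel keeps writing descriptors through its own ldt->entries
mapping.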

> The version here:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
> 
> actually seems to work.

The approach I've taken is to create a VMA and map it into user space with
the USER bit cleared. A little bit more effort code wise, but that avoids
all the page table muck and keeps it straight attached to the process.
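
Something along these lines, modeled on the vdso's special-mapping
API -- map_ldt_vma() and ldt_fault() are made-up names for the sketch,
not necessarily what the real series does:

	static const struct vm_special_mapping ldt_mapping = {
		.name	= "[ldt]",
		.fault	= ldt_fault,	/* hypothetical: installs RO, !_PAGE_USER PTEs */
	};

	static int map_ldt_vma(struct mm_struct *mm, unsigned long len)
	{
		struct vm_area_struct *vma;
		unsigned long addr;
		int ret = 0;

		if (down_write_killable(&mm->mmap_sem))
			return -EINTR;

		addr = get_unmapped_area(NULL, 0, len, 0, 0);
		if (IS_ERR_VALUE(addr)) {
			ret = addr;
			goto out;
		}

		/* VM_READ only: not writable, not expandable. */
		vma = _install_special_mapping(mm, addr, len, VM_READ,
					       &ldt_mapping);
		if (IS_ERR(vma))
			ret = PTR_ERR(vma);
	out:
		up_write(&mm->mmap_sem);
		return ret;
	}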

Will post once in a bit.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-07 18:21 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Borislav Petkov, X86 ML, linux-kernel,
	Brian Gerst, David Laight, Kees Cook, Peter Zijlstra



> On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
>> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
>> 
>>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
>>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>>>> I think I like this approach.  I also think it might be nice to move the
>>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>>>> around with the fixmap.
>>> 
>>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
>>> only for the LDT crap which is optional, even. Frankly - and this
>>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
>>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
>> 
>> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
>> constant address does bother me.  We could probably get away with RO
>> if we trapped and handled the nasty faults, but that could be very
>> problematic.
> 
> Where is the problem? You can map it RO into user space with the USER bit
> cleared. The kernel knows how to access the real stuff.

Blows up when the CPU tries to set the accessed bit.
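
The accessed flag is bit 0 of the descriptor's 4-bit type field (bit
40 of the raw 8-byte descriptor).  On a segment register load the CPU
checks it and, if still clear, writes it back into the descriptor
table itself -- a hardware write into the LDT page, which faults when
the mapping is read-only.  In struct desc_struct terms, with a made-up
helper name:

	static inline bool ldt_desc_accessed(const struct desc_struct *d)
	{
		return d->type & 1;	/* the CPU sets this on segment load */
	}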

> 
>> The version here:
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
>> 
>> actually seems to work.
> 
> The approach I've taken is to create a VMA and map it into user space with
> the USER bit cleared. A little bit more effort code wise, but that avoids
> all the page table muck and keeps it straight attached to the process.
> 
> Will post once in a bit.

I don't love mucking with user address space.  I'm also quite nervous about putting it in or near anything that could pass an access_ok check, since we're totally screwed if the bad guys can figure out how to write to it.

> 
> Thanks,
> 
>    tglx
> 

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08  7:34 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Andy Lutomirski <luto@amacapital.net> wrote:

> 
> 
> > On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> >> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
> >> 
> >>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <bp@alien8.de> wrote:
> >>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >>>> I think I like this approach.  I also think it might be nice to move the
> >>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >>>> around with the fixmap.
> >>> 
> >>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> >>> only for the LDT crap which is optional, even. Frankly - and this
> >>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> >>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
> >> 
> >> The PGD sacrifice doesn't bother me.  Putting a writable LDT map at a
> >> constant address does bother me.  We could probably get away with RO
> >> if we trapped and handled the nasty faults, but that could be very
> >> problematic.
> > 
> > Where is the problem? You can map it RO into user space with the USER bit
> > cleared. The kernel knows how to access the real stuff.
> 
> Blows up when the CPU tries to set the accessed bit.

BTW., could we force the accessed bit to be always set, without breaking the ABI?

> > The approach I've taken is to create a VMA and map it into user space with
> > the USER bit cleared. A little bit more effort code wise, but that avoids
> > all the page table muck and keeps it straight attached to the process.
> > 
> > Will post once in a bit.
> 
> I don't love mucking with user address space.  I'm also quite nervous about 
> putting it in or near anything that could pass an access_ok check, since we're 
> totally screwed if the bad guys can figure out how to write to it.

Hm, robustness of the LDT address wrt. access_ok() is a valid concern.

Can we have vmas with high addresses, in the vmalloc space for example?
IIRC the GPU code has precedents in that area.

Since this is x86-64, limitation of the vmalloc() space is not an issue.

I like Thomas's solution:

 - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
   but with the system bit set.

 - That would be an advantage even for non-PTI kernels, because mmap() is probably 
   more randomized than kmalloc().

 - It would also be a cleaner approach all around, and would avoid the fixmap
   complications and the scheduler muckery.

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08  9:34 UTC
  To: Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Andy Lutomirski <luto@amacapital.net> wrote:
> > I don't love mucking with user address space.  I'm also quite nervous about 
> > putting it in or near anything that could pass an access_ok check, since we're 
> > totally screwed if the bad guys can figure out how to write to it.
> 
> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> 
> Can we have vmas with high addresses, in the vmalloc space for example?
> IIRC the GPU code has precedents in that area.
> 
> Since this is x86-64, limitation of the vmalloc() space is not an issue.
> 
> I like Thomas's solution:
> 
>  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
>    but with the system bit set.
> 
>  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
>    more randomized than kmalloc().

Randomization is pointless as long as you can get the LDT address in user
space, i.e. w/o UMIP.

>  - It would also be a cleaner approach all around, and would avoid the fixmap
>    complications and the scheduler muckery.

The error code of such an access is always 0x03. So I added a special
handler, which checks whether the address is in the LDT map range and
verifies that the access bit in the descriptor is 0. If that's the case it
sets it and returns. If not, the thing dies. That works.
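
In sketch form, with made-up helper names for the range check and the
descriptor lookup:

	/*
	 * #PF, error code 0x03 (present + write), address inside the LDT
	 * map: the CPU trying to set a clear accessed bit through the RO
	 * alias.  Fix it up through the kernel's writable mapping.
	 */
	static bool fixup_ldt_accessed_bit(struct mm_struct *mm,
					   unsigned long address,
					   unsigned long error_code)
	{
		struct desc_struct *desc;

		if (error_code != 0x03)
			return false;
		if (!ldt_addr_in_map_range(mm, address))	/* hypothetical */
			return false;

		desc = ldt_desc_from_fault_addr(mm, address);	/* hypothetical */
		if (desc->type & 1)
			return false;		/* already set: real fault, die */

		desc->type |= 1;		/* set it and retry the insn */
		return true;
	}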

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08  9:44 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > putting it in or near anything that could pass an access_ok check, since we're 
> > > totally screwed if the bad guys can figure out how to write to it.
> > 
> > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > 
> > Can we have vmas with high addresses, in the vmalloc space for example?
> > IIRC the GPU code has precedents in that area.
> > 
> > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > 
> > I like Thomas's solution:
> > 
> >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> >    but with the system bit set.
> > 
> >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> >    more randomized than kmalloc().
> 
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.

But with UMIP unprivileged user-space won't be able to get the linear address of 
the LDT. Now it's written out in /proc/self/maps.

> >  - It would also be a cleaner approach all around, and would avoid the fixmap
> >    complications and the scheduler muckery.
> 
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.

Are SMP races possible? For example two threads both triggering the accessed bit 
fault, but only one of them succeeding in setting it. The other thread should not 
die in this case, right?

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08  9:55 UTC
  To: Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > > putting it in or near anything that could pass an access_ok check, since we're 
> > > > totally screwed if the bad guys can figure out how to write to it.
> > > 
> > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > > 
> > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > IIRC the GPU code has precedents in that area.
> > > 
> > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > > 
> > > I like Thomas's solution:
> > > 
> > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> > >    but with the system bit set.
> > > 
> > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> > >    more randomized than kmalloc().
> > 
> > Randomization is pointless as long as you can get the LDT address in user
> > space, i.e. w/o UMIP.
> 
> But with UMIP unprivileged user-space won't be able to get the linear address of 
> the LDT. Now it's written out in /proc/self/maps.

We can expose it nameless like other VMAs, but then it's 128k sized so it
can be figured out. But when it's RO then it's not really a problem, even
the kernel can't write to it.

> > >  - It would also be a cleaner approach all around, and would avoid the fixmap
> > >    complications and the scheduler muckery.
> > 
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> Are SMP races possible? For example two threads both triggering the accessed bit 
> fault, but only one of them succeeding in setting it. The other thread should not 
> die in this case, right?

Right. I'm trying to figure out whether there is a way to reliably detect
that write access bit mode.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Ingo Molnar @ 2017-12-08 11:31 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > > * Andy Lutomirski <luto@amacapital.net> wrote:
> > > > > I don't love mucking with user address space.  I'm also quite nervous about 
> > > > > putting it in or near anything that could pass an access_ok check, since we're 
> > > > > totally screwed if the bad guys can figure out how to write to it.
> > > > 
> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > > > 
> > > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > > IIRC the GPU code has precedents in that area.
> > > > 
> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > > > 
> > > > I like Thomas's solution:
> > > > 
> > > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
> > > >    but with the system bit set.
> > > > 
> > > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably 
> > > >    more randomized than kmalloc().
> > > 
> > > Randomization is pointless as long as you can get the LDT address in user
> > > space, i.e. w/o UMIP.
> > 
> > But with UMIP unprivileged user-space won't be able to get the linear address of 
> > the LDT. Now it's written out in /proc/self/maps.
> 
> We can expose it nameless like other VMAs, but then it's 128k sized so it
> can be figured out. But when it's RO then it's not really a problem, even
> the kernel can't write to it.

Yeah, ok. I don't think we should hide it - if it's in the vma space it should be 
listed in the 'maps' file, and with a descriptive name.

Thanks,

	Ingo

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 13:20 UTC
  To: Thomas Gleixner
  Cc: Ingo Molnar, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra



> On Dec 8, 2017, at 1:34 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> * Andy Lutomirski <luto@amacapital.net> wrote:
>>> I don't love mucking with user address space.  I'm also quite nervous about 
>>> putting it in or near anything that could pass an access_ok check, since we're 
>>> totally screwed if the bad guys can figure out how to write to it.
>> 
>> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>> 
>> Can we have vmas with high addresses, in the vmalloc space for example?
>> IIRC the GPU code has precedents in that area.
>> 
>> Since this is x86-64, limitation of the vmalloc() space is not an issue.
>> 
>> I like Thomas's solution:
>> 
>> - have the LDT in a regular mmap space vma (hence per process ASLR randomized), 
>>   but with the system bit set.
>> 
>> - That would be an advantage even for non-PTI kernels, because mmap() is probably 
>>   more randomized than kmalloc().
> 
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.

You only get the LDT selector, not the address.
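
All the unprivileged sldt instruction hands back is the selector:

	unsigned short sel;
	asm volatile ("sldt %0" : "=rm" (sel));	/* selector, not the base */

so the linear address stays hidden even without UMIP.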

> 
>> - It would also be a cleaner approach all around, and would avoid the fixmap
>>   complications and the scheduler muckery.
> 
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.

What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed segment?  Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?

> 
> Thanks,
> 
>    tglx

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 13:55 UTC
  To: 'Andy Lutomirski', Thomas Gleixner
  Cc: Ingo Molnar, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, Kees Cook, Peter Zijlstra

From: Andy Lutomirski
> Sent: 08 December 2017 13:20
...
> >> - It would also be a cleaner approach all around, and would avoid the fixmap
> >>   complications and the scheduler muckery.
> >
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed
> segment?
> Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?

Well, the cpu will fault in kernel on the 'pop %xs' or 'iret' instruction.
These all (probably) happen on the kernel stack with the usergs loaded.
So the fault handler has to look at the opcodes and/or %pc value, sort
out the stack (etc) and then generate SIGSEGV.

I'm not sure the kernel needs to know why the segment selector is invalid.

	David

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 14:06 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> > 
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
> 
> What if you are in kernel mode and try to return to a context with SS
> or CS pointing to a non-accessed segment?  Or what if you try to
> schedule to a context with fs or, worse, gs pointing to such a
> segment?

How would that be different from setting a 'crap' GS in modify_ldt() and
then returning from the syscall? That is something we should be able to
deal with already, no?

Is this something ldt_gdt.c already tests? The current "Test GS" is in
test_gdt_invalidation() which seems to suggest not.

Could we get a testcase for the exact situation you worry about? I'm not
sure I'd trust myself to get it right, all this LDT magic is new to me.

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 16:20 UTC
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 03:06:54PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> > > 
> > > The error code of such an access is always 0x03. So I added a special
> > > handler, which checks whether the address is in the LDT map range and
> > > verifies that the access bit in the descriptor is 0. If that's the case it
> > > sets it and returns. If not, the thing dies. That works.
> > 
> > What if you are in kernel mode and try to return to a context with SS
> > or CS pointing to a non-accessed segment?  Or what if you try to
> > schedule to a context with fs or, worse, gs pointing to such a
> > segment?
> 
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
> 
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
> 
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.

I ended up with the below; that loads something in LDT-2, sets GS to
LDT-2 and then again loads that same thing in LDT-2.

AFAIU calling modify_ldt will clear that ACCESSED bit, so this would
then trigger that on the return to user from modify_ldt, no?

Seems to work with tglx's patches.


diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 66e5ce5b91f0..d46a620c3734 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -242,6 +242,36 @@ static void fail_install(struct user_desc *desc)
 	}
 }
 
+static void do_ldt_gs_test(void)
+{
+	unsigned short prev_sel, sel = (2 << 3) | (1 << 2) | 3;
+
+	low_user_desc->entry_number = 2;
+
+	safe_modify_ldt(1, low_user_desc, sizeof(*low_user_desc));
+
+	/*
+	 * syscall (eax) 123 - modify_ldt
+	 *         (ebx)     - func
+	 *         (ecx)     - ptr
+	 *         (edx)     - bytecount
+	 */
+
+	int eax = 123;
+	int ebx = 1;
+	int ecx = (unsigned int)low_user_desc;
+	int edx = sizeof(struct user_desc);
+
+	asm volatile ("movw %%gs, %[prev_sel]\n\t"
+		      "movw %[sel], %%gs\n\t"
+		      "int $0x80\n\t"
+		      "mov %[prev_sel], %%gs"
+		      : [prev_sel] "=&R" (prev_sel), [sel] "+R" (sel), "+a" (eax)
+		      : "b" (ebx), "c" (ecx), "d" (edx)
+		      : INT80_CLOBBERS);
+
+}
+
 static void do_simple_tests(void)
 {
 	struct user_desc desc = {
@@ -919,6 +946,8 @@ int main(int argc, char **argv)
 	setup_counter_page();
 	setup_low_user_desc();
 
+	do_ldt_gs_test();
+
 	do_simple_tests();
 
 	do_multicpu_tests();

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:33 UTC
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 8, 2017 at 6:06 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
>> >
>> > The error code of such an access is always 0x03. So I added a special
>> > handler, which checks whether the address is in the LDT map range and
>> > verifies that the access bit in the descriptor is 0. If that's the case it
>> > sets it and returns. If not, the thing dies. That works.
>>
>> What if you are in kernel mode and try to return to a context with SS
>> or CS pointing to a non-accessed segment?  Or what if you try to
>> schedule to a context with fs or, worse, gs pointing to such a
>> segment?
>
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
>
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
>
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.

#GP on IRET is a failure, and we have disgusting code to handle it.
#PF on IRET would not be a failure -- it's a case where IRET should be
retried.  Our crap that fixes up #GP would get that wrong and leave us
with the wrong GSBASE.

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:38 UTC
  To: Ingo Molnar
  Cc: Thomas Gleixner, Andy Lutomirski, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, Dec 8, 2017 at 3:31 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > * Thomas Gleixner <tglx@linutronix.de> wrote:
>> >
>> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > > > * Andy Lutomirski <luto@amacapital.net> wrote:
>> > > > > I don't love mucking with user address space.  I'm also quite nervous about
>> > > > > putting it in or near anything that could pass an access_ok check, since we're
>> > > > > totally screwed if the bad guys can figure out how to write to it.
>> > > >
>> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>> > > >
>> > > > Can we have vmas with high addresses, in the vmalloc space for example?
>> > > > IIRC the GPU code has precedents in that area.
>> > > >
>> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
>> > > >
>> > > > I like Thomas's solution:
>> > > >
>> > > >  - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
>> > > >    but with the system bit set.
>> > > >
>> > > >  - That would be an advantage even for non-PTI kernels, because mmap() is probably
>> > > >    more randomized than kmalloc().
>> > >
>> > > Randomization is pointless as long as you can get the LDT address in user
>> > > space, i.e. w/o UMIP.
>> >
>> > But with UMIP unprivileged user-space won't be able to get the linear address of
>> > the LDT. Now it's written out in /proc/self/maps.
>>
>> We can expose it nameless like other VMAs, but then it's 128k sized so it
>> can be figured out. But when it's RO then it's not really a problem, even
>> the kernel can't write to it.
>
> Yeah, ok. I don't think we should hide it - if it's in the vma space it should be
> listed in the 'maps' file, and with a descriptive name.
>
> Thanks,
>
>         Ingo

Can we take a step back here?  I think there are four vaguely sane
ways to make the LDT work:

1. The way it is right now -- in vmalloc space.  The only real
downside is that it requires exposing that part of vmalloc space in
the user tables, which is a bit gross.

2. In some fixmap-like space, which is what my patch does, albeit
buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
but that's not so bad.

3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
functionally almost identical to #2, except that there's more concern
about exploits that write past TASK_SIZE_MAX.

4. In an actual vma.  I don't see the benefit of doing this at all --
it's just like #2 except way more error prone.  Hell, you have to make
sure that you can't munmap or mremap it, which isn't a consideration
at all with the other choices.

Why all the effort to make #4 work?  #1 is working fine right now, and
#2 is half-implemented.  #3 code-wise looks just like #2 except for
the choice of address and the interaction with PTI's shitty PGD
handling.

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 16:46 UTC
  To: 'Andy Lutomirski', Peter Zijlstra
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, Kees Cook

From: Andy Lutomirski
> Sent: 08 December 2017 16:34

> #GP on IRET is a failure, and we have disgusting code to handle it.

Is that the trap in kernel space when the on-stack segment registers
are invalid?
Definitely needs horrid code...

> #PF on IRET would not be a failure -- it's a case where IRET should be
> retried.  Our crap that fixes up #GP would get that wrong and leave us
> with the wrong GSBASE.

If the user code page isn't present then the fault happens after the
return to user mode, not on the IRET instruction in kernel mode.
So it is not really any different to returning to a NOP at the end
of a resident page when the page following is absent.
(Or any other invalid %ip value.)

SWAPGS is a PITA, should have been SAVEGS, LOAD_KERNEL_GS, and READ_SAVED_GS.

	David

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 16:47 UTC
  To: David Laight
  Cc: Andy Lutomirski, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, X86 ML, linux-kernel, Brian Gerst, Kees Cook

On Fri, Dec 8, 2017 at 8:46 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Andy Lutomirski
>> Sent: 08 December 2017 16:34
>
>> #GP on IRET is a failure, and we have disgusting code to handle it.
>
> Is that the trap in kernel space when the on-stack segment registers
> are invalid?
> Definitely needs horrid code...
>
>> #PF on IRET would not be a failure -- it's a case where IRET should be
>> retried.  Our crap that fixes up #GP would get that wrong and leave us
>> with the wrong GSBASE.
>
> If the user code page isn't present then the fault happens after the
> return to user mode, not on the IRET instruction in kernel mode.
> So it is not really any different to returning to a NOP at the end
> of a resident page when the page following is absent.
> (Or any other invalid %ip value.)

I mean: if the user CS or SS is not accessed and the LDT is RO, then
we get #PF on the IRET instruction, I think.  Dealing with that is
truly awful.

* RE: [PATCH] LDT improvements
From: David Laight @ 2017-12-08 17:29 UTC
  To: 'Andy Lutomirski'
  Cc: Peter Zijlstra, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	X86 ML, linux-kernel, Brian Gerst, Kees Cook

From: Andy Lutomirski
> Sent: 08 December 2017 16:48
...
> I mean: if the user CS or SS is not accessed and the LDT is RO, then
> we get #PF on the IRET instruction, I think.  Dealing with that is
> truly awful.

Any fault in-kernel on the IRET is horrid.
Doesn't really matter which one.
Same goes for the 'pop %ds' (etc) that tend to precede it.

	David

* Re: [PATCH] LDT improvements
From: Thomas Gleixner @ 2017-12-08 17:37 UTC
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, X86 ML, linux-kernel, Brian Gerst,
	David Laight, Kees Cook, Peter Zijlstra

On Fri, 8 Dec 2017, Andy Lutomirski wrote:
> Can we take a step back here?  I think there are four vaguely sane
> ways to make the LDT work:
> 
> 1. The way it is right now -- in vmalloc space.  The only real
> downside is that it requires exposing that part of vmalloc space in
> the user tables, which is a bit gross.
> 
> 2. In some fixmap-like space, which is what my patch does, albeit
> buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
> but that's not so bad.
> 
> 3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
> functionally almost identical to #2, except that there's more concern
> about exploits that write past TASK_SIZE_MAX.
> 
> 4. In an actual vma.  I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone.  Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.

Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
die. You get what you asked for.

Thanks,

	tglx

* Re: [PATCH] LDT improvements
From: Andy Lutomirski @ 2017-12-08 17:42 UTC
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Ingo Molnar, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook,
	Peter Zijlstra

On Fri, Dec 8, 2017 at 9:37 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Fri, 8 Dec 2017, Andy Lutomirski wrote:
>> Can we take a step back here?  I think there are four vaguely sane
>> ways to make the LDT work:
>>
>> 1. The way it is right now -- in vmalloc space.  The only real
>> downside is that it requires exposing that part of vmalloc space in
>> the user tables, which is a bit gross.
>>
>> 2. In some fixmap-like space, which is what my patch does, albeit
>> buggily.  This requires a PGD that we treat as per-mm, not per-cpu,
>> but that's not so bad.
>>
>> 3. In one of the user PGDs but above TASK_SIZE_MAX.  This is
>> functionally almost identical to #2, except that there's more concern
>> about exploits that write past TASK_SIZE_MAX.
>>
>> 4. In an actual vma.  I don't see the benefit of doing this at all --
>> it's just like #2 except way more error prone.  Hell, you have to make
>> sure that you can't munmap or mremap it, which isn't a consideration
>> at all with the other choices.
>
> Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
> die. You get what you asked for.

But if you unmap ldt and then map something else at the same VA, you're root.

* Re: [PATCH] LDT improvements
From: Peter Zijlstra @ 2017-12-08 17:48 UTC
  To: Andy Lutomirski
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, X86 ML,
	linux-kernel, Brian Gerst, David Laight, Kees Cook

On Fri, Dec 08, 2017 at 08:38:26AM -0800, Andy Lutomirski wrote:

> 4. In an actual vma.  I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone.  Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.

mremap is trivially disabled. I've not tried munmap() yet, as long as it
just kills the process doing it we're good of course. Otherwise we need
an extra callback in do_munmap() which isn't too hard.

> Why all the effort to make #4 work?

Seemed like a sensible approach; I really dislike wasting an entire pmd
or whatever on a feature 'nobody' ever uses anyway.

> #1 is working fine right now

doesn't work for pti in its current form.
