* [MODERATED] [PATCH 1/6] Patch 1
@ 2018-04-25  3:29 Andi Kleen
  2018-04-25 15:51 ` [MODERATED] " Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25  3:29 UTC (permalink / raw)
  To: speck; +Cc: Andi Kleen

Intel CPUs can speculatively reference the address in a page table
entry with the present bit clear. This can allow data in the L1 cache
to be accessed using a gadget similar to Spectre v2.

Linux has three cases where PTEs can be non-present:
- Empty page table entries (referencing the 4K page at phys address 0).
  This page doesn't contain interesting data, so it is not mitigated.
  The same applies to page table entries that are temporarily cleared
  to prevent races.
- The page is currently swapped out or being migrated/poisoned.
- The virtual address range is set to PROT_NONE using mprotect.

This patch addresses the second case. The page is swapped out and
the PTE has been replaced with a swap entry.  It could also
contain a migration or poison entry, which have the same format.

The swap file offset, interpreted as a physical address, could point to
real memory that happens to be in the L1 cache and would then be open to
this side channel.

Fill all bits from MAX_PA-1 upwards with ones. This forces the CPU to
reference an unpopulated but architecturally supported memory area,
which stops the speculation. In principle only the MAX_PA-1 bit is
needed, but filling all higher bits keeps the trick that swapon() uses
to determine the maximum swap file size from __swp_entry() working.
It's also slightly safer if a VM reports an incorrect MAX_PA to
a guest.
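
For illustration, and assuming a CPU that reports 46 physical address
bits, the mask set up below works out to

	__swp_stop_mask = -1ULL << (46 - 1);	/* 0xffffe00000000000 */

i.e. bits 45..63 of an unmapped PTE are forced to one.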

This limits the maximum size of swap files to 3.5TB.

The workaround is only possible on 64-bit and on 32-bit with PAE. On
non-PAE kernels it would require limiting memory to less than 2GB, which
is likely not practical. So systems without PAE remain vulnerable.

There are no user options to enable/disable the workaround because it
has no noticeable performance impact. However, it is automatically
disabled if the system has more than 46 physical address bits or
reports RDCL_NO.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/pgtable-3level.h |  5 ++--
 arch/x86/include/asm/pgtable.h        |  3 +++
 arch/x86/include/asm/pgtable_64.h     | 21 ++++++++++++----
 arch/x86/kernel/cpu/bugs.c            | 47 +++++++++++++++++++++++++++++++++++
 arch/x86/mm/init.c                    |  3 +++
 5 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index f24df59c40b2..d247cdca105d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -243,8 +243,9 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
-#define __swp_offset(x)			((x).val >> 5)
-#define __swp_entry(type, offset)	((swp_entry_t){(type) | (offset) << 5})
+#define __swp_offset(x)			(((x).val & ~__swp_stop_mask) >> 5)
+#define __swp_entry(type, offset)	((swp_entry_t){(type) | (offset) << 5 |\
+					 __swp_stop_mask})
 #define __pte_to_swp_entry(pte)		((swp_entry_t){ (pte).pte_high })
 #define __swp_entry_to_pte(x)		((pte_t){ { .pte_high = (x).val } })
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5f49b4ff0c24..9f1280bb7e20 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -185,6 +185,8 @@ static inline int pte_special(pte_t pte)
 	return pte_flags(pte) & _PAGE_SPECIAL;
 }
 
+extern u64 __swp_stop_mask;
+
 static inline unsigned long pte_pfn(pte_t pte)
 {
 	return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -635,6 +637,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
 
 pmd_t *populate_extra_pmd(unsigned long vaddr);
 pte_t *populate_extra_pte(unsigned long vaddr);
+
 #endif	/* __ASSEMBLY__ */
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27718ae..76ed3ef49f53 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -271,9 +271,17 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /*
  * Encode and de-code a swap entry
  *
- * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
- * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * |     ...					   | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * |     ...					   |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+ * |MAXPA+2->63|SAVED12|1| OFFSET (14->MAXPA-2) | TYPE (9-13)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * MAXPA is the highest PA bit reported by CPUID
+ * We set a 1 stop bit at the highest MAXPA bit to prevent speculation.
+ * Also PS(bit 7) must be always 0.
+ *
+ * SAVED12 is a copy of the original value of the MAXPA-1 stop bit
+ * and a marker bit that the saved copy contains valid data.
+ * The bits above are filled with ones.
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -296,10 +304,13 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 #define __swp_type(x)			(((x).val >> (SWP_TYPE_FIRST_BIT)) \
 					 & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)			((x).val >> SWP_OFFSET_FIRST_BIT)
+
+#define __swp_offset(x)			(((x).val & ~__swp_stop_mask) \
+					  >> SWP_OFFSET_FIRST_BIT)
 #define __swp_entry(type, offset)	((swp_entry_t) { \
 					 ((type) << (SWP_TYPE_FIRST_BIT)) \
-					 | ((offset) << SWP_OFFSET_FIRST_BIT) })
+					 | ((offset) << SWP_OFFSET_FIRST_BIT) \
+					 | __swp_stop_mask})
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
 #define __pmd_to_swp_entry(pmd)		((swp_entry_t) { pmd_val((pmd)) })
 #define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0a1f319c69fe..6aaee4ce8842 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -29,6 +29,7 @@
 static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
 static void __init spec_ctrl_save_msr(void);
+static void __init l1tf_mitigation(struct cpuinfo_x86 *c);
 
 void __init check_bugs(void)
 {
@@ -85,6 +86,8 @@ void __init check_bugs(void)
 	if (!direct_gbpages)
 		set_memory_4k((unsigned long)__va(0), 1);
 #endif
+
+	l1tf_mitigation(&boot_cpu_data);
 }
 
 /* The kernel command line selection */
@@ -569,3 +572,47 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
 	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
 }
 #endif
+
+/*
+ * Note there is not a lot of motivation to disable the L1TF
+ * workaround, as it is very cheap. But there are a few
+ * corner cases where it can be disabled, so disable
+ * it also when not needed.
+ */
+static bool cpu_needs_l1tf(struct cpuinfo_x86 *c)
+{
+	u64 ia32_cap = 0;
+
+	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+	if (ia32_cap & ARCH_CAP_RDCL_NO)
+		return false;
+
+	if (c->x86_phys_bits > 46)
+		return false;
+
+	/* Add a check for MKTME here */
+
+	return true;
+}
+
+/*
+ * Workaround for L1 terminal fault speculation
+ * (CVE-2018-3620)
+ *
+ * For unmapped PTEs set all bits from MAX_PA-1 to top to stop
+ * speculation
+ *
+ * We only really need the MAX_PA-1 bit to address the L1
+ * terminal fault, but if we set all above too the swap file
+ * size check in swapon() limits the swap size correctly.
+ *
+ * Note this overwrites NX, which may need to be restored
+ * later.
+ */
+static __init void l1tf_mitigation(struct cpuinfo_x86 *c)
+{
+	if (cpu_needs_l1tf(c))
+		__swp_stop_mask = (-1ULL) << (c->x86_phys_bits - 1);
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fec82b577c18..e4a10bbdc53a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -71,6 +71,9 @@ static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
 static unsigned long __initdata pgt_buf_top;
 
+u64 __swp_stop_mask __read_mostly;
+EXPORT_SYMBOL(__swp_stop_mask);
+
 static unsigned long min_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
-- 
2.15.0

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25  3:29 [MODERATED] [PATCH 1/6] Patch 1 Andi Kleen
@ 2018-04-25 15:51 ` Linus Torvalds
  2018-04-25 16:06   ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 15:51 UTC (permalink / raw)
  To: speck



On Tue, 24 Apr 2018, speck for Andi Kleen wrote:
> 
> This patch addresses the second case. The page is swapped out and
> the PTE has been replaced with a swap entry.  It could also
> contain a migration or poison entry, which have the same format.

NAK NAK NAK.

Why is this doing the idiotic __swp_stop_mask, when I told multiple people 
not to do that, and when I've already seen the patch (from Michal Hocko?) 
that did the much simpler approach of unconditionally just inverting all 
the 'offset' bits.

So all you do is add a single bit-flip in the offset encoding/decoding:

  #define __swp_offset(x) (~(x).val >> SWP_OFFSET_FIRST_BIT)
                          ^^^

and

  #define __swp_entry(type, offset) ((swp_entry_t) { \
    ..
                 | (~(offset) << SWP_OFFSET_FIRST_BIT) })
                   ^^^


or something very close to that. None of this garbage "different values 
for different uarchitectures and cases"

I have already seen the correct patch on this list, why is this stupid 
garbage patch still floating around? Especially since I already mentioned 
the unconditional approach at the meeting at Intel originally.

There is absolutely zero reason to do anything more complicated and 
fragile afaik.

                Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 15:51 ` [MODERATED] " Linus Torvalds
@ 2018-04-25 16:06   ` Andi Kleen
  2018-04-25 17:25     ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 16:06 UTC (permalink / raw)
  To: speck

On Wed, Apr 25, 2018 at 08:51:09AM -0700, speck for Linus Torvalds wrote:
> 
> 
> On Tue, 24 Apr 2018, speck for Andi Kleen wrote:
> > 
> > This patch addresses the second case. The page is swapped out and
> > the PTE has been replaced with a swap entry.  It could also
> > contain a migration or poison entry, which have the same format.
> 
> NAK NAK NAK.
> 
> Why is this doing the idiotic __swp_stop_mask, when I told multiple people 
> not to do that, and when I've already seen the patch (from Michal Hocko?) 
> that did the much simpler approach of unconditionally just inverting all 
> the 'offset' bits.

We looked at it, but your invert approach is broken if there is any MMIO
space between MAX_PA/2 ... MAX_PA that is ever mapped to ring 3. 

And we cannot rule that out.

In this case the inverted bit would start pointing to valid memory, 
so everything would become attackable.

So yes the more complicated patches are needed.

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 16:06   ` Andi Kleen
@ 2018-04-25 17:25     ` Linus Torvalds
  2018-04-25 17:36       ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 17:25 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> We looked at it, but your invert approach is broken if there is any MMIO
> space between MAX_PA/2 ... MAX_PA that is ever mapped to ring 3. 
> 
> And we cannot rule that out.

What?

That's complete garbage. MMIO space is irrelevant, since it's not even in 
the cache. 

And even if some crazy platform does make it cacheable (I assume people 
are thinking nvdimms or something), your patch is no better, since it has 
the exact same issue. It sets that EXACT SAME __swp_stop_mask bit, at 
MAX_PA/2.

Christ. Stop this idiocy.

                Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 17:25     ` Linus Torvalds
@ 2018-04-25 17:36       ` Andi Kleen
  2018-04-25 18:00         ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 17:36 UTC (permalink / raw)
  To: speck

On Wed, Apr 25, 2018 at 10:25:59AM -0700, speck for Linus Torvalds wrote:
> 
> 
> On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > 
> > We looked at it, but your invert approach is broken if there is any MMIO
> > space between MAX_PA/2 ... MAX_PA that is ever mapped to ring 3. 
> > 
> > And we cannot rule that out.
> 
> What?
> 
> That's complete garbage. MMIO space is irrelevant, since it's not even in 
> the cache. 

If the MAX_PA-1 bit is inverted the address is not pointing to MMIO space anymore,
but likely to some real cached memory.

Let's say you have MMIO at (1ULL<<45) + 10MB. You invert the bits and the PA
points to phys 10MB
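
In hex, assuming MAX_PA = 46 so the top physical bit is bit 45:

	(1ULL << 45) + (10 << 20)	/* = 0x2000_00a0_0000, the MMIO PA */
	flip bit 45			/* = 0x0000_00a0_0000, 10MB of RAM */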

BTW there are ways to make it work, but it would likely require
forbidding PROT_NONE on MMIO space. If you're ok with a change like
this, which could potentially break existing applications, it's
possible. But it does carry that risk.

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 17:36       ` Andi Kleen
@ 2018-04-25 18:00         ` Linus Torvalds
  2018-04-25 18:11           ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 18:00 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> If the MAX_PA-1 bit is inverted the address is not pointing to MMIO space anymore,
> but likely to some real cached memory.

What the hell are you blathering about?

This is all only about the SWAP ENTRY. 

> Let's say you have MMIO at (1ULL<<45) + 10MB. You invert the bits and the PA
> points to phys 10MB

No.

The swap entry has absolutely NOTHING to do with any MMIO physical 
address. We do not touch those AT ALL.

The only thing it affects is the "offset" of a swap entry. And honestly, 
if you have offsets with the high bits set, you're already broken, since 
it's not even guaranteed to fit in the architecture-specific swap entry.

So the swap offset will have the high bits clear, and we'll invert them 
when creating the arch-specific entry, and thus the PTE will have the high 
bits set.
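
To make that concrete (ignoring the type bits, and assuming
SWP_OFFSET_FIRST_BIT is 14): an offset of 5 gets stored as

	~5ul << 14	/* == 0xfffffffffffe8000 */

so the high physical address bits all end up set, pointing well above
any real memory, and ~(x).val >> 14 gives the 5 back when decoding.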

And maybe architecturally x86 doesn't _require_ that to be MMIO, but the 
PC platform sure as hell does in practice. So it won't be cached now. 
Unless you have some really really odd nvdimm setup or similar, at which 
point YOUR UNNECESSARILY COMPLEX PATCH HAS THE SAME EXACT ISSUE.

              Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 18:00         ` Linus Torvalds
@ 2018-04-25 18:11           ` Andi Kleen
  2018-04-25 18:26             ` Thomas Gleixner
  2018-04-25 18:30             ` [MODERATED] " Linus Torvalds
  0 siblings, 2 replies; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 18:11 UTC (permalink / raw)
  To: speck

On Wed, Apr 25, 2018 at 11:00:57AM -0700, speck for Linus Torvalds wrote:
> 
> 
> On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > 
> > If the MAX_PA-1 bit is inverted the address is not pointing to MMIO space anymore,
> > but likely to some real cached memory.
> 
> What the hell are you blathering about?
> 
> This is all only about the SWAP ENTRY. 

Ok. My patchkit handles both mprotect and swap entries. You're right 
we don't need it for swap entries and could use the inversion
there.

I was talking about mprotect though.

We need to handle mprotect (and potentially also non-lazy-fault mmap
PROT_NONE), because if we're in a guest, setting something to PROT_NONE
allows EPT to be bypassed temporarily, so you could suddenly see the
values of some other guest pages.

What's your opinion on that case?

-Andi

* Re: [PATCH 1/6] Patch 1
  2018-04-25 18:11           ` Andi Kleen
@ 2018-04-25 18:26             ` Thomas Gleixner
  2018-04-25 18:30             ` [MODERATED] " Linus Torvalds
  1 sibling, 0 replies; 21+ messages in thread
From: Thomas Gleixner @ 2018-04-25 18:26 UTC (permalink / raw)
  To: speck

On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> On Wed, Apr 25, 2018 at 11:00:57AM -0700, speck for Linus Torvalds wrote:
> > On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > > 
> > > If the MAX_PA-1 bit is inverted the address is not pointing to MMIO space anymore,
> > > but likely to some real cached memory.
> > 
> > What the hell are you blathering about?
> > 
> > This is all only about the SWAP ENTRY. 
> 
> Ok. My patchkit handles both mprotect and swap entries. You're right 
> we don't need it for swap entries and could use the inversion
> there.
> 
> I was talking about mprotect though.
> 
> We need to handle mprotect (and potentially also non lazy fault mmap
> PROT_NONE) because if we're in a guest setting something to PROT_NONE
> allows to bypass EPT temporarily, so you could suddenly see the values of
> some other guest pages.
>
> What's your opinion on that case?

The guest case is helpless anyway because rogue guests can set their PTEs
to whatever they want. You need to fix that at the host level, i.e. no HT
and L1D flush on vmenter.

Thanks,

	tglx

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 18:11           ` Andi Kleen
  2018-04-25 18:26             ` Thomas Gleixner
@ 2018-04-25 18:30             ` Linus Torvalds
  2018-04-25 18:51               ` Andi Kleen
  1 sibling, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 18:30 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> Ok. My patchkit handles both mprotect and swap entries. You're right 
> we don't need it for swap entries and could use the inversion
> there.
> 
> I was talking about mprotect though.

So I really want that to be handled separately. 

I also am not convinced that it *can* be handled. If a user has 
permissions to mprotect high MMIO, then we're simply fundamentally out of 
bits that the hardware cares about!

So please handle the swap case separately, because the swap case is easy 
and obvious, and has no downsides.

The mprotect case is *completely* different.

In particular, for mprotect, we have several different situations:

 - on native hardware we simply don't care. Whoever has the mmap already 
   could just access it without that PROT_NONE.

 - in a virtual environment, the host simply needs to flush its caches 
   before entering vmx mode, and now there is nothing sensitive for the 
   guest to access. All it can read is its own caches.

So the mprotect case is simply fundamentally not very interesting for 99% 
of all people. Somebody like Amazon doesn't care about leaking data 
_inside_ the VM - plus it's hard to do anyway because the guest needs to 
also try to figure out the virtual mapping and has only physically mapped 
cached data to go by.

Now, can we make it *harder* for a guest to get to that cached data? Yes 
we can. But honestly, if you need root access (inside the guest) to then 
use PROT_NONE on some MMIO mapping to access some particular physical 
addresses that *may* be cached (in the guest), then what the hell is the 
leak? You already control the guest at that point.

So the PROT_NONE case just doesn't seem to be a security issue. It's a 
security issue *RIGHT NOW* because of the lack of cache flush at VM entry, 
but that's a separate issue. Once the cache flush is there, the only thing 
that leaks is guest data anyway, and it only leaks to the already trusted 
user in the guest.

Now, that said, there are other things we can do to just make it harder 
to mis-use this. In particular, we could fairly easily say:

 - PROT_NONE on a shared mapping means that we just flush and unmap all 
   the pages *entirely* (we leave the vma alone, and just rely on faulting 
   them back in after you set it back to something that isn't PROT_NONE)

Does that perhaps need some extra work? Yeah, maybe. But it would be a 
clean solution. And notice that it's not really a big security issue due 
to the above, but maybe it would make it more palatable to then say "ok, 
we won't flush the L1 cache on VM entry, because we trust the guest OS to 
have special PROT_NONE logic".

That might be a big deal to Amazon, for example. That's assuming Amazon 
controls the guest OS in the first place? I don't know their setup.

And at no point do we need that special bit in the PROT_NONE mapping.

See?

(And yes, I might be missing some detail, but I _really_ hate that 
"special bit" patch. It fundamentally should not be needed).

                  Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 18:30             ` [MODERATED] " Linus Torvalds
@ 2018-04-25 18:51               ` Andi Kleen
  2018-04-25 20:15                 ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 18:51 UTC (permalink / raw)
  To: speck

>  - in a virtual environment, the host simply needs to flush its caches 
>    before entering vmx mode, and now there is nothing sensitive for the 
>    guest to access. All it can read is its own caches.

This doesn't help unfortunately because it could also leak data inside
the guest. If skipping EPT causes the PA to point to some other page
inside the same guest you can leak that data. And that other page might
be owned by the kernel or by some other process.

If a guest uses most of the memory in the system, that's quite likely.

There was also another case, brought up by Google ChromeOS, where they
control/trust the guest kernel and want to rely on it not accessing
data outside the current guest, so that flush mitigations in the VMM are
not needed.
That one is a bit more dubious, but I guess it's also not completely broken.

> Now, that said, we can have other things we can do to just make it harder 
> to mis-use this. In particular, we could fairly easily say:
> 
>  - PROT_NONE on a shared mapping means that we just flush and unmap all 
>    the pages *entirely* (we leave the vma alone, and just rely on faulting 
>    them back in after you set it back to something that isn't PROT_NONE)

It doesn't need to be a shared mapping; this can happen with any mapping.

> 
> (And yes, I might be missing some detail, but I _really_ hate that 
> "special bit" patch. It fundamentally should not be needed).

I don't think anyone likes it; we just couldn't find better solutions.

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 18:51               ` Andi Kleen
@ 2018-04-25 20:15                 ` Linus Torvalds
  2018-04-25 21:19                   ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 20:15 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> This doesn't help unfortunately because it could also leak data inside
> the guest. If skipping EPT causes the PA to point to some other page
> inside the same guest you can leak that data. And that other page might
> be owned by the kernel or by some other process.

Do you even read what I write?

THAT'S EXACTLY WHAT I TALKED ABOUT IN THE REST OF THE EMAIL.

> There was also another case, brought up by Google ChromeOS, where they
> control/trust the guest kernel and want to rely on it from accessing
> data outside the current guest, so not needing flush mitigations in the VMM. 
> That one is a bit more dubious, but I guess it's also not completely broken.

And I mentioned this exact case too. But pointed out that there is NO WAY 
IN HELL that your patch will fix it either but that there are other 
possible alternatives to mitigate things if you trust the guest OS.

Of course, if you trust the guest, then why the hell are you even doing 
virtualization in the first place? 

                Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 20:15                 ` Linus Torvalds
@ 2018-04-25 21:19                   ` Andi Kleen
  2018-04-25 22:35                     ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 21:19 UTC (permalink / raw)
  To: speck

On Wed, Apr 25, 2018 at 01:15:35PM -0700, speck for Linus Torvalds wrote:
> On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > 
> > This doesn't help unfortunately because it could also leak data inside
> > the guest. If skipping EPT causes the PA to point to some other page
> > inside the same guest you can leak that data. And that other page might
> > be owned by the kernel or by some other process.
> 
> Do you even read what I write?
> 
> THAT'S EXACTLY WHAT I TALKED ABOUT IN THE REST OF THE EMAIL.

Ok so I reread what you wrote. I think you're saying it's not a problem
because it's too hard to know what the other page is?

I can think of various ways around this:

Assume the attacker process owns most of the memory and
the attacked process is very small. It fills its own memory
with a known pattern. Then it checks against that pattern.
If it's not the pattern, it's someone else's.

Or it does the mprotect attack on a lot of different pages
that it cycles through, and tries on each of them, looking
for some known pattern?

Or the attacker uses memory pressure to force another 
process to cycle through a lot of memory, and always
retries in between until it sees some known pattern.

Considering all these cases, do you still say that mprotect does
not need to be mitigated?

It would seem very risky to me.

BTW people like Amazon actually care a lot about
security inside their guests.

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 21:19                   ` Andi Kleen
@ 2018-04-25 22:35                     ` Linus Torvalds
  2018-04-25 23:12                       ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 22:35 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> Ok so I reread what you wrote. I think you're saying it's not a problem
> because it's too hard to know what the other page is?

I do think that's one big issue.

But no, the real issue is that 

 (a) somebody who runs virtual guests can't generally trust them *anyway*, 
so anything we do is pointless for that case. The host is protected, and 
the guest protections are separate.

 (b) if we want to protect guests against leaks, we need other models than 
the one you had anyway.

> Considering all these cases, do you still say that mprotect does
> not need to be mitigated?

I'm saying that your patch is *not* the way to mitigate it anyway. 

The mprotect mitigation has absolutely *nothing* to do with the swap entry 
mitigation, and we can - and should - do it not just separately but 
entirely differently.

So first off, split that up, and do the TRIVIAL TWO-LINER that was already 
posted (by Hocko?) for swap entry mitigation. Nothing else. 

Then, start looking at PROT_NONE. There are two cases:

 - actual real file mappings

   We can use the same trick for PROT_NONE as we did for swap entries: 
   just invert all the high bits. None of this "how many bits do I have".

 - the /dev/mem kind of mappings (not necessarily through /dev/mem, they 
   may well be created via remap_pfn_range() in other ways)

   These are always shared. It's not entirely clear that anybody even uses 
   PROT_NONE on them. We may be able to just disallow PROT_NONE entirely, 
   or we can just zap the mapping instead of trying to "save" it, since 
   there really isn't anything to save.

See? 

I absolutely detest the "let's pick one bit and treat it specially". It's 
fragile garbage, and it generates bad code to boot. We likely have much 
better options available to us.

               Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 22:35                     ` Linus Torvalds
@ 2018-04-25 23:12                       ` Andi Kleen
  2018-04-25 23:21                         ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 23:12 UTC (permalink / raw)
  To: speck

> So first off, split that up, and do the TRIVIAL TWO-LINER that was already 
> posted (by Hocko?) for swap entry mitigation. Nothing else. 

I haven't seen Hocko's patch, but I assume it just inverts the complete 
swap entry except P while it is stored.

Two more corner cases:

- How about the case when you have more than MAX_PA/2 physical
memory? For this case we would need a flag to disable the
invert (and clear the reporting), otherwise it will be attackable.

Frankly this is fairly unlikely, because systems usually have far more
MAX_PA than they can populate with DIMMs, except possibly for SGI UV.
It would still be better to handle that case.

- And how about someone adding a swap file that is larger than
MAX_PA/2? This would also be attackable with the invert approach.
I can re-add a check to swapon() for that.

My patch kit also made sure that swapon() rejects a file that is too
large, otherwise it would be attackable even when inverted.
This also needs to be re-added, which will require checking phys_bits
again. Is that ok with you?

> 
> Then, start looking at PROT_NONE. There are two cases:
> 
>  - actual real file mappings
> 
>    We can use the same trick for PROT_NONE as we did for swap entries: 
>    just invert all the high bits. None of this "how many bits do I have".

Ok.

> 
>  - the /dev/mem kind of mappings (not necessarily through /dev/mem, they 
>    may well be remap_pfn_range() other ways
> 
>    These are always shared. It's not entirely clear that anybody even uses 
>    PROT_NONE on them. We may be able to just disallow PROT_NONE entirely, 
>    or we can just zap the mapping instead of trying to "save" it, since 
>    there really isn't anythign to save.

Ok. So disallow PROT_NONE until someone complains.

I assume we want to disallow PROT_NONE for any VM_PFNMAP vma. 

I don't think it makes sense for any other shared mapping?
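
Something like this in the mprotect path, purely as a sketch (where
exactly to hook it is still open, and the check is untested):

	/* reject PROT_NONE on pfn-remapped vmas */
	if ((vma->vm_flags & VM_PFNMAP) && prot == PROT_NONE)
		return -EACCES;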

BTW there's also the case of a device driver that does not use lazy
faulting but calls remap_pfn_range() in its ->mmap function, where the
mmap is done with PROT_NONE and it remaps memory, not MMIO. I still
need to audit the tree to see if that can happen. It may be safer to
forbid that case somewhere in the VM as well.

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 23:12                       ` Andi Kleen
@ 2018-04-25 23:21                         ` Linus Torvalds
  2018-04-25 23:39                           ` Andi Kleen
  2018-04-26 13:59                           ` Michal Hocko
  0 siblings, 2 replies; 21+ messages in thread
From: Linus Torvalds @ 2018-04-25 23:21 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> I haven't see Hocko's patch, but I assume it just inverts the complete 
> swap entry except P while it is stored.

No, it only inverted the offset. That's the easiest thing.

> Two more corner cases:
> 
> - How about the case when you have more than MAX_PA/2 physical
> memory? For this case we would need a flag to disable the
> invert (and clear the reporting), otherwise it will be attackable.

No. In some random theoretical situation that doesn't matter, maybe. 

But it's completely not attackable, because you need to swap out literally 
terabytes of memory, and you have no control over what the allocations 
will even be.

So forget about it. We're not going to add any complexity over an attack 
that is not realistic.

> >    These are always shared. It's not entirely clear that anybody even uses 
> >    PROT_NONE on them. We may be able to just disallow PROT_NONE entirely, 
> >    or we can just zap the mapping instead of trying to "save" it, since 
> >    there really isn't anythign to save.
> 
> Ok. So disallow PROT_NONE until someone complains.

It's an option. I actually would prefer the "just zap the mapping and 
populate it again" model, but the "just disallow" might be simpler and 
might be acceptable. But it _does_ have the potential of people finding it 
a regression. Maybe people play games with PROT_NONE. User space often 
does really really odd things.
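
The "zap" variant would be roughly (sketch only, the exact spot in the
mprotect path is to be worked out):

	/* PROT_NONE on a pfn mapping: drop the ptes, refault them later */
	if (vma->vm_flags & VM_PFNMAP)
		zap_page_range(vma, start, end - start);

relying on faults to repopulate things once the protection is changed
back to something accessible, as above.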

                Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 23:21                         ` Linus Torvalds
@ 2018-04-25 23:39                           ` Andi Kleen
  2018-04-26  3:22                             ` Linus Torvalds
  2018-04-26 13:59                           ` Michal Hocko
  1 sibling, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2018-04-25 23:39 UTC (permalink / raw)
  To: speck

On Wed, Apr 25, 2018 at 04:21:24PM -0700, speck for Linus Torvalds wrote:
> 
> 
> On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > 
> > I haven't see Hocko's patch, but I assume it just inverts the complete 
> > swap entry except P while it is stored.
> 
> No, it only inverted offset. That's the easiest thing.
> 
> > Two more corner cases:
> > 
> > - How about the case when you have more than MAX_PA/2 physical
> > memory? For this case we would need a flag to disable the
> > invert (and clear the reporting), otherwise it will be attackable.
> 
> No. In some random theoretical situation that doesn't matter, maybe. 
> 
> But it's completely not attackable, because you need to swap out literally 
> terabytes of memory, and you have no control over what the allocations 
> will even be.

For the >MAX_PA/2 case it would be any swapout, right?
Because an inverted offset still points to valid memory then.

We could actually not have any check for this in the VM code
(because it's vulnerable anyway), but only invert the BUG bit so that
sysfs reports the vulnerability. 

That would be localized purely in the initialization code.
Would that be ok?
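
Roughly, and assuming we add an X86_BUG_L1TF flag for the sysfs
reporting (the name is made up, this is just a sketch):

	/* more RAM than MAX_PA/2: the stop bit / invert trick can't cover it */
	if (max_pfn > (1UL << (boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT)))
		setup_force_cpu_bug(X86_BUG_L1TF);
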
> 
> So forget about it. We're not going to add any complexity over an attack 
> that is not realistic.

That's for the >3.5TB swapfile case? So you don't want the
check in swapon, correct?
> 
> > >    These are always shared. It's not entirely clear that anybody even uses 
> > >    PROT_NONE on them. We may be able to just disallow PROT_NONE entirely, 
> > >    or we can just zap the mapping instead of trying to "save" it, since 
> > >    there really isn't anythign to save.
> > 
> > Ok. So disallow PROT_NONE until someone complains.
> 
> It's an option. I actually would prefer the "just zap the mapping and 
> populate it again" model, but the "just disallow" might be simpler and 

Ok. I will see if that's easily implementable. Otherwise forbid PROT_NONE.

> might be acceptable. But it _does_ have the potential of people finding it 
> a regression. Maybe people play games with PROT_NONE. User space often 
> does really really odd things.

Thanks,

-Andi

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 23:39                           ` Andi Kleen
@ 2018-04-26  3:22                             ` Linus Torvalds
  2018-04-26  3:39                               ` Jon Masters
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-26  3:22 UTC (permalink / raw)
  To: speck



On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> 
> For the >MAX_PA/2 case it would be any swapout, right?

It would have to be a pretty damn big offset, at the very least. You hit 
it when you hit swap offset MAX_PA >> SWP_OFFSET_FIRST_BIT.

I think SWP_OFFSET_FIRST_BIT is 14. It could be smaller, but sadly we 
can't use the Dirty/Accessed bit in not-present entries because of other 
errata, so we avoid some of the low bits that would otherwise be useful.

So if we have Xeon with 46 bits of physical addressing, we're talking 
about hitting it when you hit an offset on the order of (1 << 30). Even 
with desktop chips (what, 40 bits physical?), you have to have offsets 
on the order of (1 << 25).

So (plus 12 bits for the page size) your swap file has to be 1 << 37 bytes 
in size, so you have to swap out 128 GB of data before you can even hit 
that last bit.

Maybe I am off by one or two orders-of-2 or so, but it should be in the 
ballpark. You literally have to page out tens or hundreds of gigabytes of 
memory to hit an interesting swap entry, and then as an attacker you won't 
even have control over how it's done, so you are going to have to really 
work at it to _use_ those things. 
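
Back of the envelope for the desktop case, assuming 40 physical bits,
SWP_OFFSET_FIRST_BIT == 14 and 4K pages:

	offset needed = 1UL << (40 - 1 - 14)	/* top PA bit of the pte goes clear */
	swap in use   = offset << 12		/* 1 << 37 bytes = 128 GB */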

And that's on a _desktop_ chip. On a Xeon, the physical address space is 
what - 44 bits? 48 bits? ARK is being singularly unhelpful here. Then 
we're talking terabytes of swap space. 

Note: not terabytes allocated. Terabytes _used_.

But yes, we could perhaps limit swap space size or something. Or maybe 
just warn people.

And if we really care (I obviously don't think we should), then we could 
just move the *type* to the high bits of the page table entry, so we'd start 
the offset bits much earlier (at bit 9). So now you'd get 5 extra bits 
before you even hit that MAX_PA case, so on a desktop chip you'd already 
hit the "you have to have a terabyte of swap in use to even get there".

That sounds really trivial to do, in fact. So another 2 lines of code or 
so (and the code generation shouldn't really change - it's just switching 
how you shift the "offset" vs the "type" bits around).

I don't know. Maybe I'm missing something. But it *already* sounds 
impossible to use in practice,  and I pretty much guarantee that if you 
need terabytes of swap space in use on a desktop (and another 6-8 bits of 
physical addressing on the Xeons?) there is absolutely no way people will 
have swap offsets big enough to ever hit the MAX_PA/2 bit.

             Linus

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-26  3:22                             ` Linus Torvalds
@ 2018-04-26  3:39                               ` Jon Masters
  0 siblings, 0 replies; 21+ messages in thread
From: Jon Masters @ 2018-04-26  3:39 UTC (permalink / raw)
  To: speck

On 04/25/2018 11:22 PM, speck for Linus Torvalds wrote:

> I don't know. Maybe I'm missing something. But it *already* sounds 
> impossible to use in practice,  and I pretty much guarantee that if you 
> need terabytes of swap space in use on a desktop (and another 6-8 bits of 
> physical addressing on the Xeons?) there is absolutely no way people will 
> have swap offsets big enough to ever hit the MAX_PA/2 bit.

A bunch of us (including Amazon) discussed this limitation (MAX_PA/2)
and it seemed somewhat academic that you'd be able to pull off an attack
at that point reliably. There doesn't seem to be any downside to adding
a vulnerable warning in sysfs in that case, which was also suggested.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop


* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-25 23:21                         ` Linus Torvalds
  2018-04-25 23:39                           ` Andi Kleen
@ 2018-04-26 13:59                           ` Michal Hocko
  2018-04-26 17:14                             ` Linus Torvalds
  1 sibling, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2018-04-26 13:59 UTC (permalink / raw)
  To: speck

[Sorry for being late in this discussion. I was conferencing and still
on the way back at the airport.]

On Wed 25-04-18 16:21:24, speck for Linus Torvalds wrote:
> 
> 
> On Wed, 25 Apr 2018, speck for Andi Kleen wrote:
> > 
> > I haven't see Hocko's patch, but I assume it just inverts the complete 
> > swap entry except P while it is stored.
> 
> No, it only inverted offset. That's the easiest thing.

Here is the patch for your reference. I posted it to this mailing
list earlier. I will try to wrap my head around the mprotect part and
help review whatever you come up with.
---
From 7b03455455e1152988b2a295a917c0641f531fb0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 10 Apr 2018 14:10:42 +0200
Subject: [PATCH] mm, swap, x86: make sure high bits of the swap offset are set

Intel platforms have a bug where L1 cache contents can speculatively
be used to load the content referenced by !present entries. This allows
certain side channel attacks. We have several different classes of
!present pages. Unmapped memory clears the whole pte, so it is a
non-issue. mprotect and NUMA hints refer to an existing pfn which cannot
be tweaked by an attacker into a different privilege domain. So we are
left with swap entries, which encode the swap offset and might therefore
conflict with an existing pfn. Obfuscate those entries by inverting the
bits in the swap offset, which sets all the high bits and _should_ stop
the speculation, as the result then refers to the maximum addressable
memory on all Intel platforms.

Well, this doesn't solve the problem for very large offsets (1<<30 on
uarchs with 44b addressing), but those should be out of any practical
attack space.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 arch/x86/include/asm/pgtable_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1149d2112b2e..213c15b2e168 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -299,10 +299,10 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 #define __swp_type(x)			(((x).val >> (SWP_TYPE_FIRST_BIT)) \
 					 & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)			((x).val >> SWP_OFFSET_FIRST_BIT)
+#define __swp_offset(x)			(~(x).val >> SWP_OFFSET_FIRST_BIT)
 #define __swp_entry(type, offset)	((swp_entry_t) { \
 					 ((type) << (SWP_TYPE_FIRST_BIT)) \
-					 | ((offset) << SWP_OFFSET_FIRST_BIT) })
+					 | (~(offset) << SWP_OFFSET_FIRST_BIT) })
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
 #define __pmd_to_swp_entry(pmd)		((swp_entry_t) { pmd_val((pmd)) })
 #define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })
-- 
2.16.3

-- 
Michal Hocko
SUSE Labs

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-26 13:59                           ` Michal Hocko
@ 2018-04-26 17:14                             ` Linus Torvalds
  2018-04-27  0:05                               ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2018-04-26 17:14 UTC (permalink / raw)
  To: speck



On Thu, 26 Apr 2018, speck for Michal Hocko wrote:
> 
> Here is the patch for your reference. 

So here's a _slightly_ larger patch that switches the order of "type" and 
"offset" in the x86-64 encoding, in addition to doing the binary 'not' on 
the offset.

That means that now the offset is bits 9-58 in the page table, and that 
the offset is in the bits that hardware generally doesn't care about.

That, in turn, means that if you have a desktop chip with only 40 bits of 
physical addressing, now that the offset starts at bit 9, you still have 
to have 30 bits of offset actually *in use* until bit 39 ends up being 
clear.

So that's 4 terabytes of swap space (because the offset is counted in 
pages, so 30 bits of offset is 42 bits of actual coverage). With bigger 
physical addressing, that obviously grows further, until you hit the limit 
of the offset (at 50 bits of offset - 62 bits of actual swap file 
coverage).

NOTE NOTE NOTE! This all built for me, but maybe I got the shifting wrong 
and maybe it's completely broken due to some silly mistake on my part. 
Think of it as an RFC patch that needs testing and looking at. The changes 
to the comment are probably the most important part.

Anybody willing to test and/or double-check my math/logic?

                    Linus
---

 arch/x86/include/asm/pgtable_64.h | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27718ae..3e4584ba5231 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -273,7 +273,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) |  OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -286,20 +286,30 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
  *
  * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
  * but also L and G.
+ *
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
  */
-#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
-#define SWP_TYPE_BITS 5
-/* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
+#define SWP_TYPE_BITS		5
+
+#define SWP_OFFSET_FIRST_BIT	(_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT	(SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
-#define __swp_type(x)			(((x).val >> (SWP_TYPE_FIRST_BIT)) \
-					 & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)			((x).val >> SWP_OFFSET_FIRST_BIT)
-#define __swp_entry(type, offset)	((swp_entry_t) { \
-					 ((type) << (SWP_TYPE_FIRST_BIT)) \
-					 | ((offset) << SWP_OFFSET_FIRST_BIT) })
+/* Extract the high bits for type */
+#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+/* Shift up (to get rid of type), then down to get value */
+#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+/* Shift the offset up "too far" by TYPE bits, then down again */
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+	(~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+	| ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
 #define __pmd_to_swp_entry(pmd)		((swp_entry_t) { pmd_val((pmd)) })
 #define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })

* [MODERATED] Re: [PATCH 1/6] Patch 1
  2018-04-26 17:14                             ` Linus Torvalds
@ 2018-04-27  0:05                               ` Andi Kleen
  0 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2018-04-27  0:05 UTC (permalink / raw)
  To: speck

> That, in turn, means that if you have a desktop chip with only 40 bits of 
> physical addressing, now that the offset starts at bit 9, you still have 
> to have 30 bits of offset actually *in use* until bit 39 ends up being 
> clear.

Here are the cases for modern Intel CPUs:

    CPU                                             MAX_PA  MAX_PA/2
    Nehalem Client                                  39      >=0.25TB
    Nehalem Server                                  44      >=8TB
    SandyBridge/IvyBridge/Haswell/Broadwell/Skylake 46      >=32TB

In general anything newer has 46 bits.

> Anybody willing to test and/or double-check my math/logic?

I did some tests and it works and seems to do the right thing. I'll
use this patch in the next version of the patchkit. Thanks.

-Andi
