From 2e4a4617d484fd3b44abdc77013d559a02c1ed10 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 25 Jul 2018 12:10:19 +0000 Subject: [PATCH] x86/spec-ctrl: Calculate safe PTE addresses for L1TF mitigations Safe PTE addresses for L1TF mitigations are ones which are within the L1D address width (may be wider than reported in CPUID), and above the highest cacheable RAM/NVDIMM/BAR/etc. All logic here is best-effort heuristics, which should in practice be fine for most hardware. Future work will see about disentangling the SRAT handling further, as well as having L0 pass this information down to lower levels when virtualised. This is part of XSA-273 / CVE-2018-3620. Signed-off-by: Andrew Cooper Signed-off-by: Jan Beulich --- v2: * New v3: * Adjust heuristics to be mostly safe for the high end server case, and heterogeneous migration cases. Extend the comments a lot to explain what is going on. v4: * Fold EFI and SRAT adjustments. * Expand safety comment. --- xen/arch/x86/setup.c | 12 ++++ xen/arch/x86/spec_ctrl.c | 154 ++++++++++++++++++++++++++++++++++++++++ xen/arch/x86/srat.c | 8 ++- xen/common/efi/boot.c | 12 ++++ xen/include/asm-x86/spec_ctrl.h | 7 ++ 5 files changed, 191 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 8301de8..7c86b9a 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -912,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p) /* Sanitise the raw E820 map to produce a final clean version. */ max_page = raw_max_page = init_e820(memmap_type, &e820_raw); + if ( !efi_enabled(EFI_BOOT) ) + { + /* + * Supplement the heuristics in l1tf_calculations() by assuming that + * anything referenced in the E820 may be cacheable. + */ + l1tf_safe_maddr = + max(l1tf_safe_maddr, + ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr + + e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE)); + } + /* Create a temporary copy of the E820 map. */ memcpy(&boot_e820, &e820, sizeof(e820)); diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c index 32a4ea6..abe3785 100644 --- a/xen/arch/x86/spec_ctrl.c +++ b/xen/arch/x86/spec_ctrl.c @@ -50,6 +50,10 @@ bool __initdata bsp_delay_spec_ctrl; uint8_t __read_mostly default_xen_spec_ctrl; uint8_t __read_mostly default_spec_ctrl_flags; +paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr; +static bool __initdata cpu_has_bug_l1tf; +static unsigned int __initdata l1d_maxphysaddr; + static int __init parse_bti(const char *s) { const char *ss; @@ -420,6 +424,154 @@ static bool __init should_use_eager_fpu(void) } } +/* Calculate whether this CPU is vulnerable to L1TF. */ +static __init void l1tf_calculations(uint64_t caps) +{ + bool hit_default = false; + + l1d_maxphysaddr = paddr_bits; + + /* L1TF is only known to affect Intel Family 6 processors at this time. */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 ) + { + switch ( boot_cpu_data.x86_model ) + { + /* + * Core processors since at least Penryn are vulnerable. + */ + case 0x17: /* Penryn */ + case 0x1d: /* Dunnington */ + cpu_has_bug_l1tf = true; + break; + + case 0x1f: /* Auburndale / Havendale */ + case 0x1e: /* Nehalem */ + case 0x1a: /* Nehalem EP */ + case 0x2e: /* Nehalem EX */ + case 0x25: /* Westmere */ + case 0x2c: /* Westmere EP */ + case 0x2f: /* Westmere EX */ + cpu_has_bug_l1tf = true; + l1d_maxphysaddr = 44; + break; + + case 0x2a: /* SandyBridge */ + case 0x2d: /* SandyBridge EP/EX */ + case 0x3a: /* IvyBridge */ + case 0x3e: /* IvyBridge EP/EX */ + case 0x3c: /* Haswell */ + case 0x3f: /* Haswell EX/EP */ + case 0x45: /* Haswell D */ + case 0x46: /* Haswell H */ + case 0x3d: /* Broadwell */ + case 0x47: /* Broadwell H */ + case 0x4f: /* Broadwell EP/EX */ + case 0x56: /* Broadwell D */ + case 0x4e: /* Skylake M */ + case 0x55: /* Skylake X */ + case 0x5e: /* Skylake D */ + case 0x66: /* Cannonlake */ + case 0x67: /* Cannonlake? */ + case 0x8e: /* Kabylake M */ + case 0x9e: /* Kabylake D */ + cpu_has_bug_l1tf = true; + l1d_maxphysaddr = 46; + break; + + /* + * Atom processors are not vulnerable. + */ + case 0x1c: /* Pineview */ + case 0x26: /* Lincroft */ + case 0x27: /* Penwell */ + case 0x35: /* Cloverview */ + case 0x36: /* Cedarview */ + case 0x37: /* Baytrail / Valleyview (Silvermont) */ + case 0x4d: /* Avaton / Rangely (Silvermont) */ + case 0x4c: /* Cherrytrail / Brasswell */ + case 0x4a: /* Merrifield */ + case 0x5a: /* Moorefield */ + case 0x5c: /* Goldmont */ + case 0x5f: /* Denverton */ + case 0x7a: /* Gemini Lake */ + break; + + /* + * Knights processors are not vulnerable. + */ + case 0x57: /* Knights Landing */ + case 0x85: /* Knights Mill */ + break; + + default: + /* Defer printk() until we've accounted for RDCL_NO. */ + hit_default = true; + cpu_has_bug_l1tf = true; + break; + } + } + + /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */ + if ( caps & ARCH_CAPABILITIES_RDCL_NO ) + cpu_has_bug_l1tf = false; + + if ( cpu_has_bug_l1tf && hit_default ) + printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n", + boot_cpu_data.x86_model); + + /* + * L1TF safe address heuristics. These apply to the real hardware we are + * running on, and are best-effort-only if Xen is virtualised. + * + * The address mask which the L1D cache uses, which might be wider than + * the CPUID-reported maxphysaddr. + */ + l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK; + + /* + * To be safe, l1tf_safe_maddr must be above the highest cacheable entity + * in system physical address space. However, to preserve space for + * paged-out metadata, it should be as low as possible above the highest + * cacheable address, so as to require fewer high-order bits being set. + * + * These heuristics are based on some guesswork to improve the likelihood + * of safety in the common case, accounting for the fact that Linux's + * behaviour of mitigating L1TF by inverting all address bits in a + * non-present PTE. + * + * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end + * server), setting any address bit beyond CPUID maxphysaddr guarantees + * to make the PTE safe. This case doesn't require all the high-order + * bits being set, and doesn't require any other source of information + * for safety. + * + * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we + * must sacrifice high order bits from the real address space for + * safety. Therefore, make a blind guess that there is nothing + * cacheable in the top quarter of physical address space. + * + * It is exceedingly unlikely for machines to be populated with this + * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on + * Sandybridge and later) due to the sheer volume of DIMMs this would + * actually take. + * + * However, it is possible to find machines this large, so the "top + * quarter" guess is supplemented to push the limit higher if references + * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top + * quarter boundary. + * + * Finally, this top quarter guess gives us a good chance of being safe + * when running virtualised (and the CPUID maxphysaddr hasn't been + * levelled for heterogeneous migration safety), where the safety + * consideration is still in terms of host details, but all E820/etc + * information is in terms of guest physical layout. + */ + l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits) + ? (1ul << paddr_bits) + : (3ul << (paddr_bits - 2)))); +} + #define OPT_XPTI_DEFAULT 0xff uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT; @@ -626,6 +778,8 @@ void __init init_speculation_mitigations(void) else setup_clear_cpu_cap(X86_FEATURE_NO_XPTI); + l1tf_calculations(caps); + print_details(thunk, caps); /* diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c index 166eb44..2d70b45 100644 --- a/xen/arch/x86/srat.c +++ b/xen/arch/x86/srat.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct acpi_table_slit *__read_mostly acpi_slit; @@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return; + start = ma->base_address; + end = start + ma->length; + /* Supplement the heuristics in l1tf_calculations(). */ + l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE)); + if (num_node_memblks >= NR_NODE_MEMBLKS) { dprintk(XENLOG_WARNING, @@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) return; } - start = ma->base_address; - end = start + ma->length; pxm = ma->proximity_domain; if (srat_rev < 2) pxm &= 0xff; diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c index 1464520..2f49731 100644 --- a/xen/common/efi/boot.c +++ b/xen/common/efi/boot.c @@ -1382,6 +1382,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) #ifndef CONFIG_ARM /* TODO - runtime service support */ +#include + static bool __initdata efi_map_uc; static int __init parse_efi_param(const char *s) @@ -1497,6 +1499,16 @@ void __init efi_init_memory(void) desc->PhysicalStart, desc->PhysicalStart + len - 1, desc->Type, desc->Attribute); + if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) || + (efi_bs_revision >= EFI_REVISION(2, 5) && + (desc->Attribute & EFI_MEMORY_WP)) ) + { + /* Supplement the heuristics in l1tf_calculations(). */ + l1tf_safe_maddr = + max(l1tf_safe_maddr, + ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE)); + } + if ( !efi_enabled(EFI_RS) || (!(desc->Attribute & EFI_MEMORY_RUNTIME) && (!map_bs || diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h index 5b40afb..872b494 100644 --- a/xen/include/asm-x86/spec_ctrl.h +++ b/xen/include/asm-x86/spec_ctrl.h @@ -38,6 +38,13 @@ extern uint8_t opt_xpti; #define OPT_XPTI_DOM0 0x01 #define OPT_XPTI_DOMU 0x02 +/* + * The L1D address mask, which might be wider than reported in CPUID, and the + * system physical address above which there are believed to be no cacheable + * memory regions, thus unable to leak data via the L1TF vulnerability. + */ +extern paddr_t l1tf_addr_mask, l1tf_safe_maddr; + static inline void init_shadow_spec_ctrl_state(void) { struct cpu_info *info = get_cpu_info(); -- 2.1.4