From: Andrew Cooper <andrew.cooper3@citrix.com>
To: speck@linutronix.de
Subject: [MODERATED] Re: [PATCH] SPTE masking
Date: Thu, 9 Aug 2018 10:33:38 +0100	[thread overview]
Message-ID: <b6b405c4-05cf-a827-1028-47586796d96a@citrix.com> (raw)
In-Reply-To: <d64e7abd-7591-d81d-61ef-900203033c07@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 1740 bytes --]

On 09/08/18 10:25, speck for Paolo Bonzini wrote:
> On 09/08/2018 01:21, speck for Jim Mattson wrote:
>> [PATCH] kvm: x86: Set highest physical address bit in non-present/reserved SPTEs
>>
>> Always set the upper-most supported physical address bit to 1 for SPTEs
>> that are marked as non-present or reserved, to make them unusable for
>> L1TF attacks from the guest. Currently, this just applies to MMIO SPTEs.
>> (We do not need to mark PTEs that are completely 0 as physical page 0
>> is already reserved.)
>>
>> This allows mitigation of L1TF without disabling hyper-threading by using
>> shadow paging mode instead of EPT.
> I don't understand why the big patch is needed.  MMIO SPTEs already have a mask
> applied that includes the top bit on all processors that have MAXPHYADDR<52.
> I would hope that all processors with MAXPHYADDR=52 will have the bug fixed
> (and AFAIK none are being sold right now), but in any case something like
>
>         if (maxphyaddr == 52) {
>                 kvm_mmu_set_mmio_spte_mask((1ull << 51) | 1, 1ull << 51);
> 		return;
>         }
>
> in kvm_set_mmio_spte_mask should do, or alternatively the nicer patch after
> my signature (untested and unthought).

Setting bit 51 doesn't mitigate L1TF on any current processor.

You need to set an address bit which is inside the L1D maxphysaddr, and
which isn't cacheable on the current system.
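
Concretely, as a standalone sketch (not code from the patch below - the
helper name and constants here are invented for illustration), making a
not-present PTE safe amounts to pointing its address bits, within the L1D's
address width, above everything cacheable:

static inline intpte_t stuff_safe_addr(intpte_t pte)
{
    /* Illustrative constants only; the real values are calculated at boot. */
    const unsigned int l1d_maxphysaddr = 46;
    const paddr_t addr_mask = ((1ULL << l1d_maxphysaddr) - 1) & PAGE_MASK;

    /* Assume nothing cacheable lives in the top quarter of address space. */
    const paddr_t safe_maddr = 3ULL << (l1d_maxphysaddr - 2);

    /* Point the address bits at something the L1D cannot have cached. */
    return (pte & ~addr_mask) | (safe_maddr & addr_mask);
}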

Attached is my patch for doing this generally in Xen, along with some
safety heuristics for nesting.  In Xen, we need to audit each PTE a PV
guest tries to write, and the bottom line safety check for that is:

static inline bool is_l1tf_safe_maddr(intpte_t pte)
{
    paddr_t maddr = pte & l1tf_addr_mask;

    return maddr == 0 || maddr >= l1tf_safe_maddr;
}
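
The audit path then uses that check roughly as follows (a sketch only - the
wrapper and its error handling are invented here rather than lifted from the
real PV pagetable code):

static int audit_guest_pte(intpte_t new_pte)
{
    /* Present PTEs are validated by the normal type/refcount machinery. */
    if ( new_pte & _PAGE_PRESENT )
        return 0;

    /*
     * Not-present PTEs still feed their address bits to the L1D under
     * speculation, so they must point at nothing cacheable (or be zero).
     */
    if ( !is_l1tf_safe_maddr(new_pte) )
        return -ERANGE; /* or e.g. force the guest into shadow mode */

    return 0;
}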

~Andrew

[-- Attachment #2: 0001-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch --]
[-- Type: text/x-patch, Size: 11798 bytes --]

From 2e4a4617d484fd3b44abdc77013d559a02c1ed10 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Wed, 25 Jul 2018 12:10:19 +0000
Subject: [PATCH] x86/spec-ctrl: Calculate safe PTE addresses for L1TF
 mitigations

Safe PTE addresses for L1TF mitigations are ones which are within the L1D
address width (may be wider than reported in CPUID), and above the highest
cacheable RAM/NVDIMM/BAR/etc.

All logic here is best-effort heuristics, which should in practice be fine for
most hardware.  Future work will see about disentangling the SRAT handling
further, as well as having L0 pass this information down to lower levels when
virtualised.

This is part of XSA-273 / CVE-2018-3620.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2:
 * New
v3:
 * Adjust heuristics to be mostly safe for the high end server case, and
   heterogeneous migration cases.  Extend the comments a lot to explain what
   is going on.
v4:
 * Fold EFI and SRAT adjustments.
 * Expand safety comment.
---
 xen/arch/x86/setup.c            |  12 ++++
 xen/arch/x86/spec_ctrl.c        | 154 ++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/srat.c             |   8 ++-
 xen/common/efi/boot.c           |  12 ++++
 xen/include/asm-x86/spec_ctrl.h |   7 ++
 5 files changed, 191 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 8301de8..7c86b9a 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -912,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     /* Sanitise the raw E820 map to produce a final clean version. */
     max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
 
+    if ( !efi_enabled(EFI_BOOT) )
+    {
+        /*
+         * Supplement the heuristics in l1tf_calculations() by assuming that
+         * anything referenced in the E820 may be cacheable.
+         */
+        l1tf_safe_maddr =
+            max(l1tf_safe_maddr,
+                ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
+                        e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
+    }
+
     /* Create a temporary copy of the E820 map. */
     memcpy(&boot_e820, &e820, sizeof(e820));
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 32a4ea6..abe3785 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -50,6 +50,10 @@ bool __initdata bsp_delay_spec_ctrl;
 uint8_t __read_mostly default_xen_spec_ctrl;
 uint8_t __read_mostly default_spec_ctrl_flags;
 
+paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
+static bool __initdata cpu_has_bug_l1tf;
+static unsigned int __initdata l1d_maxphysaddr;
+
 static int __init parse_bti(const char *s)
 {
     const char *ss;
@@ -420,6 +424,154 @@ static bool __init should_use_eager_fpu(void)
     }
 }
 
+/* Calculate whether this CPU is vulnerable to L1TF. */
+static __init void l1tf_calculations(uint64_t caps)
+{
+    bool hit_default = false;
+
+    l1d_maxphysaddr = paddr_bits;
+
+    /* L1TF is only known to affect Intel Family 6 processors at this time. */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 )
+    {
+        switch ( boot_cpu_data.x86_model )
+        {
+            /*
+             * Core processors since at least Penryn are vulnerable.
+             */
+        case 0x17: /* Penryn */
+        case 0x1d: /* Dunnington */
+            cpu_has_bug_l1tf = true;
+            break;
+
+        case 0x1f: /* Auburndale / Havendale */
+        case 0x1e: /* Nehalem */
+        case 0x1a: /* Nehalem EP */
+        case 0x2e: /* Nehalem EX */
+        case 0x25: /* Westmere */
+        case 0x2c: /* Westmere EP */
+        case 0x2f: /* Westmere EX */
+            cpu_has_bug_l1tf = true;
+            l1d_maxphysaddr = 44;
+            break;
+
+        case 0x2a: /* SandyBridge */
+        case 0x2d: /* SandyBridge EP/EX */
+        case 0x3a: /* IvyBridge */
+        case 0x3e: /* IvyBridge EP/EX */
+        case 0x3c: /* Haswell */
+        case 0x3f: /* Haswell EX/EP */
+        case 0x45: /* Haswell D */
+        case 0x46: /* Haswell H */
+        case 0x3d: /* Broadwell */
+        case 0x47: /* Broadwell H */
+        case 0x4f: /* Broadwell EP/EX */
+        case 0x56: /* Broadwell D */
+        case 0x4e: /* Skylake M */
+        case 0x55: /* Skylake X */
+        case 0x5e: /* Skylake D */
+        case 0x66: /* Cannonlake */
+        case 0x67: /* Cannonlake? */
+        case 0x8e: /* Kabylake M */
+        case 0x9e: /* Kabylake D */
+            cpu_has_bug_l1tf = true;
+            l1d_maxphysaddr = 46;
+            break;
+
+            /*
+             * Atom processors are not vulnerable.
+             */
+        case 0x1c: /* Pineview */
+        case 0x26: /* Lincroft */
+        case 0x27: /* Penwell */
+        case 0x35: /* Cloverview */
+        case 0x36: /* Cedarview */
+        case 0x37: /* Baytrail / Valleyview (Silvermont) */
+        case 0x4d: /* Avaton / Rangely (Silvermont) */
+        case 0x4c: /* Cherrytrail / Brasswell */
+        case 0x4a: /* Merrifield */
+        case 0x5a: /* Moorefield */
+        case 0x5c: /* Goldmont */
+        case 0x5f: /* Denverton */
+        case 0x7a: /* Gemini Lake */
+            break;
+
+            /*
+             * Knights processors are not vulnerable.
+             */
+        case 0x57: /* Knights Landing */
+        case 0x85: /* Knights Mill */
+            break;
+
+        default:
+            /* Defer printk() until we've accounted for RDCL_NO. */
+            hit_default = true;
+            cpu_has_bug_l1tf = true;
+            break;
+        }
+    }
+
+    /* Any processor advertising RDCL_NO should not be vulnerable to L1TF. */
+    if ( caps & ARCH_CAPABILITIES_RDCL_NO )
+        cpu_has_bug_l1tf = false;
+
+    if ( cpu_has_bug_l1tf && hit_default )
+        printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
+               boot_cpu_data.x86_model);
+
+    /*
+     * L1TF safe address heuristics.  These apply to the real hardware we are
+     * running on, and are best-effort-only if Xen is virtualised.
+     *
+     * The address mask which the L1D cache uses, which might be wider than
+     * the CPUID-reported maxphysaddr.
+     */
+    l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
+
+    /*
+     * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
+     * in system physical address space.  However, to preserve space for
+     * paged-out metadata, it should be as low as possible above the highest
+     * cacheable address, so as to require fewer high-order bits being set.
+     *
+     * These heuristics are based on some guesswork to improve the likelihood
+     * of safety in the common case.  In particular, they account for the
+     * fact that Linux mitigates L1TF by inverting all address bits in a
+     * non-present PTE.
+     *
+     * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
+     *   server), setting any address bit beyond CPUID maxphysaddr guarantees
+     *   to make the PTE safe.  This case doesn't require all the high-order
+     *   bits being set, and doesn't require any other source of information
+     *   for safety.
+     *
+     * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
+     *   must sacrifice high order bits from the real address space for
+     *   safety.  Therefore, make a blind guess that there is nothing
+     *   cacheable in the top quarter of physical address space.
+     *
+     *   It is exceedingly unlikely for machines to be populated with this
+     *   much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
+     *   Sandybridge and later) due to the sheer volume of DIMMs this would
+     *   actually take.
+     *
+     *   However, it is possible to find machines this large, so the "top
+     *   quarter" guess is supplemented to push the limit higher if references
+     *   to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
+     *   quarter boundary.
+     *
+     *   Finally, this top quarter guess gives us a good chance of being safe
+     *   when running virtualised (and the CPUID maxphysaddr hasn't been
+     *   levelled for heterogeneous migration safety), where the safety
+     *   consideration is still in terms of host details, but all E820/etc
+     *   information is in terms of guest physical layout.
+     */
+    l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
+                                            ? (1ul << paddr_bits)
+                                            : (3ul << (paddr_bits - 2))));
+}
+
 #define OPT_XPTI_DEFAULT  0xff
 uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
 
@@ -626,6 +778,8 @@ void __init init_speculation_mitigations(void)
     else
         setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
 
+    l1tf_calculations(caps);
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 166eb44..2d70b45 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -20,6 +20,7 @@
 #include <xen/pfn.h>
 #include <asm/e820.h>
 #include <asm/page.h>
+#include <asm/spec_ctrl.h>
 
 static struct acpi_table_slit *__read_mostly acpi_slit;
 
@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
 	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
 		return;
 
+	start = ma->base_address;
+	end = start + ma->length;
+	/* Supplement the heuristics in l1tf_calculations(). */
+	l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
+
 	if (num_node_memblks >= NR_NODE_MEMBLKS)
 	{
 		dprintk(XENLOG_WARNING,
@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
-	start = ma->base_address;
-	end = start + ma->length;
 	pxm = ma->proximity_domain;
 	if (srat_rev < 2)
 		pxm &= 0xff;
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 1464520..2f49731 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1382,6 +1382,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
 
 #ifndef CONFIG_ARM /* TODO - runtime service support */
 
+#include <asm/spec_ctrl.h>
+
 static bool __initdata efi_map_uc;
 
 static int __init parse_efi_param(const char *s)
@@ -1497,6 +1499,16 @@ void __init efi_init_memory(void)
                desc->PhysicalStart, desc->PhysicalStart + len - 1,
                desc->Type, desc->Attribute);
 
+        if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
+             (efi_bs_revision >= EFI_REVISION(2, 5) &&
+              (desc->Attribute & EFI_MEMORY_WP)) )
+        {
+            /* Supplement the heuristics in l1tf_calculations(). */
+            l1tf_safe_maddr =
+                max(l1tf_safe_maddr,
+                    ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
+        }
+
         if ( !efi_enabled(EFI_RS) ||
              (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
               (!map_bs ||
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index 5b40afb..872b494 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -38,6 +38,13 @@ extern uint8_t opt_xpti;
 #define OPT_XPTI_DOM0  0x01
 #define OPT_XPTI_DOMU  0x02
 
+/*
+ * The L1D address mask, which might be wider than reported in CPUID, and the
+ * system physical address above which there are believed to be no cacheable
+ * memory regions, thus unable to leak data via the L1TF vulnerability.
+ */
+extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
+
 static inline void init_shadow_spec_ctrl_state(void)
 {
     struct cpu_info *info = get_cpu_info();
-- 
2.1.4


Thread overview: 22+ messages
2018-08-08 23:21 [MODERATED] [PATCH] SPTE masking Jim Mattson
2018-08-09  2:57 ` [MODERATED] " Andi Kleen
2018-08-09  9:24   ` Paolo Bonzini
2018-08-09 17:43     ` Andi Kleen
2018-08-10  7:55       ` Paolo Bonzini
2018-08-10 15:59         ` Jim Mattson
2018-08-10 17:23         ` Andi Kleen
2018-08-10 17:32           ` Linus Torvalds
2018-08-10 17:45             ` Andi Kleen
2018-08-10 18:37               ` Paolo Bonzini
2018-08-10 19:17                 ` Andi Kleen
2018-08-12 10:57                   ` Paolo Bonzini
2018-08-09  9:25 ` Paolo Bonzini
2018-08-09  9:33   ` Andrew Cooper [this message]
2018-08-09 10:01     ` Paolo Bonzini
2018-08-09 10:47       ` Andrew Cooper
2018-08-09 11:13         ` Paolo Bonzini
2018-08-09 11:46           ` Andrew Cooper
2018-08-09 11:54             ` Paolo Bonzini
2018-08-09 14:01               ` Andrew Cooper
2018-08-09 15:00                 ` Paolo Bonzini
2018-08-09 20:14       ` Jim Mattson
