* [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
@ 2018-01-12 10:19 Jan Beulich
  2018-01-12 15:23 ` George Dunlap
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Jan Beulich @ 2018-01-12 10:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, security

This is a very simplistic change limiting the amount of memory a running
64-bit PV guest has mapped (and hence available for attacking): Only the
mappings of stack, IDT, and TSS are being cloned from the direct map
into per-CPU page tables. Guest controlled parts of the page tables are
being copied into those per-CPU page tables upon entry into the guest.
Cross-vCPU synchronization of top level page table entry changes is
being effected by forcing other active vCPU-s of the guest into the
hypervisor.

The change to context_switch() isn't strictly necessary, but there's no
reason to keep switching page tables once a PV guest is being scheduled
out.

There is certainly much room for improvement, especially of performance,
here - first and foremost suppressing all the negative effects on AMD
systems. But in the interest of backportability (including to really old
hypervisors, which may not even have alternative patching) any such is
being left out here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: Is forcing an event check interrupt for synchronization purposes
enough? It may be necessary to actually wait for remote vCPU-s to have
touched into the hypervisor, in which case a function-call-IPI should be
sent, with an empty handler (a flush-IPI with zero operation mask would
also do). Otoh, if the vCPU isn't already in hypervisor context,
delivery of the IPI should be almost instant (as interrupts are always
enabled while in guest mode).
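
For illustration only (not part of the patch), a minimal sketch of the
wait-for-remote-vCPU-s variant, assuming the existing on_selected_cpus()
infrastructure and invented helper names:

static void l4_sync_noop(void *unused)
{
    /* Entering the hypervisor is all the synchronization needed. */
}

static void sync_l4_users(const cpumask_t *mask)
{
    /* wait == 1: don't return before all targeted CPUs ran the handler. */
    on_selected_cpus(mask, l4_sync_noop, NULL, 1);
}
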
---
Backporting notes:
- This needs f9eb74789a ("x86/entry: Remove support for partial
  cpu_user_regs frames") as a prereq, due to the uses of %r14 and %r15.
  But that's intended to be backported anyway (for Spectre/SP2).
- The use of "root" instead of "l4" here is mainly to not make 5-level
  page table additions any harder. In backports "l4" should probably be
  preferred.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1511,6 +1511,9 @@ void paravirt_ctxt_switch_to(struct vcpu
 {
     unsigned long cr4;
 
+    this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW);
+
     cr4 = pv_guest_cr4_to_real_cr4(v);
     if ( unlikely(cr4 != read_cr4()) )
         write_cr4(cr4);
@@ -1682,6 +1685,8 @@ void context_switch(struct vcpu *prev, s
 
     ASSERT(local_irq_is_enabled());
 
+    get_cpu_info()->xen_cr3 = 0;
+
     cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
     /* Allow at most one CPU at a time to be dirty. */
     ASSERT(cpumask_weight(&dirty_mask) <= 1);
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3683,6 +3683,20 @@ long do_mmu_update(
                         break;
                     rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+                    if ( !rc )
+                    {
+                        /*
+                         * Force other vCPU-s of the affected guest to pick up
+                         * the change (if any).
+                         */
+                        unsigned int cpu = smp_processor_id();
+                        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+
+                        cpumask_andnot(mask, pt_owner->domain_dirty_cpumask,
+                                       cpumask_of(cpu));
+                        if ( !cpumask_empty(mask) )
+                            smp_send_event_check_mask(mask);
+                    }
                     break;
 
                 case PGT_writable_page:
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -327,6 +327,9 @@ void start_secondary(void *unused)
      */
     spin_debug_disable();
 
+    get_cpu_info()->xen_cr3 = 0;
+    get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt));
+
     load_system_tables();
 
     /* Full exception support from here on in. */
@@ -633,6 +636,181 @@ void cpu_exit_clear(unsigned int cpu)
     set_cpu_state(CPU_STATE_DEAD);
 }
 
+static bool clone_mapping(const void *ptr, root_pgentry_t *rpt)
+{
+    unsigned long linear = (unsigned long)ptr, pfn;
+    unsigned int flags;
+    l3_pgentry_t *l3t = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]);
+    l2_pgentry_t *l2t;
+    l1_pgentry_t *l1t;
+
+    if ( linear < DIRECTMAP_VIRT_START )
+        return true;
+
+    flags = l3e_get_flags(l3t[l3_table_offset(linear)]);
+    ASSERT(flags & _PAGE_PRESENT);
+    if ( flags & _PAGE_PSE )
+    {
+        pfn = (l3e_get_pfn(l3t[l3_table_offset(linear)]) &
+               ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
+              (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
+        flags &= ~_PAGE_PSE;
+    }
+    else
+    {
+        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
+        flags = l2e_get_flags(l2t[l2_table_offset(linear)]);
+        ASSERT(flags & _PAGE_PRESENT);
+        if ( flags & _PAGE_PSE )
+        {
+            pfn = (l2e_get_pfn(l2t[l2_table_offset(linear)]) &
+                   ~((1UL << PAGETABLE_ORDER) - 1)) |
+                  (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
+            flags &= ~_PAGE_PSE;
+        }
+        else
+        {
+            l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
+            flags = l1e_get_flags(l1t[l1_table_offset(linear)]);
+            if ( !(flags & _PAGE_PRESENT) )
+                return true;
+            pfn = l1e_get_pfn(l1t[l1_table_offset(linear)]);
+        }
+    }
+
+    if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
+    {
+        l3t = alloc_xen_pagetable();
+        if ( !l3t )
+            return false;
+        clear_page(l3t);
+        l4e_write(&rpt[root_table_offset(linear)],
+                  l4e_from_paddr(__pa(l3t), __PAGE_HYPERVISOR));
+    }
+    else
+        l3t = l4e_to_l3e(rpt[root_table_offset(linear)]);
+
+    if ( !(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PRESENT) )
+    {
+        l2t = alloc_xen_pagetable();
+        if ( !l2t )
+            return false;
+        clear_page(l2t);
+        l3e_write(&l3t[l3_table_offset(linear)],
+                  l3e_from_paddr(__pa(l2t), __PAGE_HYPERVISOR));
+    }
+    else
+    {
+        ASSERT(!(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PSE));
+        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
+    }
+
+    if ( !(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PRESENT) )
+    {
+        l1t = alloc_xen_pagetable();
+        if ( !l1t )
+            return false;
+        clear_page(l1t);
+        l2e_write(&l2t[l2_table_offset(linear)],
+                  l2e_from_paddr(__pa(l1t), __PAGE_HYPERVISOR));
+    }
+    else
+    {
+        ASSERT(!(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PSE));
+        l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
+    }
+
+    if ( l1e_get_flags(l1t[l1_table_offset(linear)]) & _PAGE_PRESENT )
+    {
+        ASSERT(l1e_get_pfn(l1t[l1_table_offset(linear)]) == pfn);
+        ASSERT(l1e_get_flags(l1t[l1_table_offset(linear)]) == flags);
+    }
+    else
+        l1e_write(&l1t[l1_table_offset(linear)], l1e_from_pfn(pfn, flags));
+
+    return true;
+}
+
+DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
+
+static bool setup_cpu_root_pgt(unsigned int cpu)
+{
+    root_pgentry_t *rpt = alloc_xen_pagetable();
+    unsigned int off;
+
+    if ( !rpt )
+        return false;
+
+    clear_page(rpt);
+    per_cpu(root_pgt, cpu) = rpt;
+
+    rpt[root_table_offset(RO_MPT_VIRT_START)] =
+        idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
+    /* SH_LINEAR_PT inserted together with guest mappings. */
+    /* PERDOMAIN inserted during context switch. */
+    rpt[root_table_offset(XEN_VIRT_START)] =
+        idle_pg_table[root_table_offset(XEN_VIRT_START)];
+
+    /* Install direct map page table entries for stack, IDT, and TSS. */
+    for ( off = 0; off < STACK_SIZE; off += PAGE_SIZE )
+        if ( !clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt) )
+            break;
+
+    return off == STACK_SIZE &&
+           clone_mapping(idt_tables[cpu], rpt) &&
+           clone_mapping(&per_cpu(init_tss, cpu), rpt);
+}
+
+static void cleanup_cpu_root_pgt(unsigned int cpu)
+{
+    root_pgentry_t *rpt = per_cpu(root_pgt, cpu);
+    unsigned int r;
+
+    if ( !rpt )
+        return;
+
+    per_cpu(root_pgt, cpu) = NULL;
+
+    for ( r = root_table_offset(DIRECTMAP_VIRT_START);
+          r < root_table_offset(HYPERVISOR_VIRT_END); ++r )
+    {
+        l3_pgentry_t *l3t;
+        unsigned int i3;
+
+        if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) )
+            continue;
+
+        l3t = l4e_to_l3e(rpt[r]);
+
+        for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 )
+        {
+            l2_pgentry_t *l2t;
+            unsigned int i2;
+
+            if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) )
+                continue;
+
+            ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE));
+            l2t = l3e_to_l2e(l3t[i3]);
+
+            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 )
+            {
+                if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) )
+                    continue;
+
+                ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE));
+                free_xen_pagetable(l2e_to_l1e(l2t[i2]));
+            }
+
+            free_xen_pagetable(l2t);
+        }
+
+        free_xen_pagetable(l3t);
+    }
+
+    free_xen_pagetable(rpt);
+}
+
 static void cpu_smpboot_free(unsigned int cpu)
 {
     unsigned int order, socket = cpu_to_socket(cpu);
@@ -671,6 +849,8 @@ static void cpu_smpboot_free(unsigned in
             free_domheap_page(mfn_to_page(mfn));
     }
 
+    cleanup_cpu_root_pgt(cpu);
+
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     free_xenheap_pages(per_cpu(gdt_table, cpu), order);
 
@@ -727,6 +907,9 @@ static int cpu_smpboot_alloc(unsigned in
     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
 
+    if ( !setup_cpu_root_pgt(cpu) )
+        goto oom;
+
     for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
           i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
         if ( cpu_online(i) && cpu_to_node(i) == node )
@@ -799,6 +982,10 @@ void __init smp_prepare_cpus(unsigned in
 
     stack_base[0] = stack_start;
 
+    if ( !setup_cpu_root_pgt(0) )
+        panic("No memory for root page table\n");
+    get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
+
     set_nr_sockets();
 
     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
@@ -867,6 +1054,8 @@ void __init smp_prepare_boot_cpu(void)
 #if NR_CPUS > 2 * BITS_PER_LONG
     per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
 #endif
+
+    get_cpu_info()->xen_cr3 = 0;
 }
 
 static void
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -137,6 +137,8 @@ void __dummy__(void)
     OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id);
     OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu);
     OFFSET(CPUINFO_cr4, struct cpu_info, cr4);
+    OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3);
+    OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3);
     DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
     BLANK();
 
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -199,6 +199,17 @@ ENTRY(cstar_enter)
         pushq $0
         movl  $TRAP_syscall, 4(%rsp)
         SAVE_ALL
+
+        GET_STACK_END(bx)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+        neg   %rcx
+UNLIKELY_START(nz, cstar_cr3)
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+        neg   %rcx
+        write_cr3 rcx, rdi, rsi
+        movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+UNLIKELY_END(cstar_cr3)
+
         GET_CURRENT(bx)
         movq  VCPU_domain(%rbx),%rcx
         cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -37,6 +37,32 @@ ENTRY(switch_to_kernel)
 /* %rbx: struct vcpu, interrupts disabled */
 restore_all_guest:
         ASSERT_INTERRUPTS_DISABLED
+
+        /* Copy guest mappings and switch to per-CPU root page table. */
+        mov   %cr3, %r9
+        GET_STACK_END(dx)
+        mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
+        movabs $PADDR_MASK & PAGE_MASK, %rsi
+        movabs $DIRECTMAP_VIRT_START, %rcx
+        mov   %rdi, %rax
+        and   %rsi, %rdi
+        and   %r9, %rsi
+        add   %rcx, %rdi
+        add   %rcx, %rsi
+        mov   $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
+        mov   root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
+        mov   %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)
+        rep movsq
+        mov   $ROOT_PAGETABLE_ENTRIES - \
+               ROOT_PAGETABLE_LAST_XEN_SLOT - 1, %ecx
+        add   $(ROOT_PAGETABLE_LAST_XEN_SLOT + 1 - \
+                ROOT_PAGETABLE_FIRST_XEN_SLOT) * 8, %rsi
+        add   $(ROOT_PAGETABLE_LAST_XEN_SLOT + 1 - \
+                ROOT_PAGETABLE_FIRST_XEN_SLOT) * 8, %rdi
+        rep movsq
+        mov   %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
+        write_cr3 rax, rdi, rsi
+
         RESTORE_ALL
         testw $TRAP_syscall,4(%rsp)
         jz    iret_exit_to_guest
@@ -71,6 +97,18 @@ iret_exit_to_guest:
         ALIGN
 /* No special register assumptions. */
 restore_all_xen:
+        /*
+         * Check whether we need to switch to the per-CPU page tables, in
+         * case we return to late PV exit code (from an NMI or #MC).
+         */
+        GET_STACK_END(ax)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rax), %rdx
+        mov   STACK_CPUINFO_FIELD(pv_cr3)(%rax), %rax
+        test  %rdx, %rdx
+UNLIKELY_START(g, exit_cr3)
+        write_cr3 rax, rdi, rsi
+UNLIKELY_END(exit_cr3)
+
         RESTORE_ALL adj=8
         iretq
 
@@ -100,7 +138,18 @@ ENTRY(lstar_enter)
         pushq $0
         movl  $TRAP_syscall, 4(%rsp)
         SAVE_ALL
-        GET_CURRENT(bx)
+
+        GET_STACK_END(bx)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+        neg   %rcx
+        jz    .Llstar_cr3_okay
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+        neg   %rcx
+        write_cr3 rcx, rdi, rsi
+        movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+.Llstar_cr3_okay:
+
+        __GET_CURRENT(bx)
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
@@ -192,7 +241,18 @@ GLOBAL(sysenter_eflags_saved)
         pushq $0
         movl  $TRAP_syscall, 4(%rsp)
         SAVE_ALL
-        GET_CURRENT(bx)
+
+        GET_STACK_END(bx)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+        neg   %rcx
+        jz    .Lsyse_cr3_okay
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+        neg   %rcx
+        write_cr3 rcx, rdi, rsi
+        movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+.Lsyse_cr3_okay:
+
+        __GET_CURRENT(bx)
         cmpb  $0,VCPU_sysenter_disables_events(%rbx)
         movq  VCPU_sysenter_addr(%rbx),%rax
         setne %cl
@@ -228,13 +288,23 @@ ENTRY(int80_direct_trap)
         movl  $0x80, 4(%rsp)
         SAVE_ALL
 
+        GET_STACK_END(bx)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+        neg   %rcx
+UNLIKELY_START(nz, int80_cr3)
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+        neg   %rcx
+        write_cr3 rcx, rdi, rsi
+        movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+UNLIKELY_END(int80_cr3)
+
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
         movl  $0x80,%edi
         call  check_for_unexpected_msi
 UNLIKELY_END(msi_check)
 
-        GET_CURRENT(bx)
+        __GET_CURRENT(bx)
 
         /* Check that the callback is non-null. */
         leaq  VCPU_int80_bounce(%rbx),%rdx
@@ -391,9 +461,27 @@ ENTRY(dom_crash_sync_extable)
 
 ENTRY(common_interrupt)
         SAVE_ALL CLAC
+
+        GET_STACK_END(14)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
+        mov   %rcx, %r15
+        neg   %rcx
+        jz    .Lintr_cr3_okay
+        jns   .Lintr_cr3_load
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+        neg   %rcx
+.Lintr_cr3_load:
+        write_cr3 rcx, rdi, rsi
+        xor   %ecx, %ecx
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+        testb $3, UREGS_cs(%rsp)
+        cmovnz %rcx, %r15
+.Lintr_cr3_okay:
+
         CR4_PV32_RESTORE
         movq %rsp,%rdi
         callq do_IRQ
+        mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         jmp ret_from_intr
 
 /* No special register assumptions. */
@@ -411,6 +499,23 @@ ENTRY(page_fault)
 /* No special register assumptions. */
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
+
+        GET_STACK_END(14)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
+        mov   %rcx, %r15
+        neg   %rcx
+        jz    .Lxcpt_cr3_okay
+        jns   .Lxcpt_cr3_load
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+        neg   %rcx
+.Lxcpt_cr3_load:
+        write_cr3 rcx, rdi, rsi
+        xor   %ecx, %ecx
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+        testb $3, UREGS_cs(%rsp)
+        cmovnz %rcx, %r15
+.Lxcpt_cr3_okay:
+
 handle_exception_saved:
         GET_CURRENT(bx)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
@@ -475,6 +580,7 @@ handle_exception_saved:
         leaq  exception_table(%rip),%rdx
         PERFC_INCR(exceptions, %rax, %rbx)
         callq *(%rdx,%rax,8)
+        mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
         leaq  VCPU_trap_bounce(%rbx),%rdx
@@ -507,6 +613,7 @@ exception_with_ints_disabled:
         rep;  movsq                     # make room for ec/ev
 1:      movq  UREGS_error_code(%rsp),%rax # ec/ev
         movq  %rax,UREGS_kernel_sizeof(%rsp)
+        mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         jmp   restore_all_xen           # return to fixup code
 
 /* No special register assumptions. */
@@ -585,6 +692,17 @@ ENTRY(double_fault)
         movl  $TRAP_double_fault,4(%rsp)
         /* Set AC to reduce chance of further SMAP faults */
         SAVE_ALL STAC
+
+        GET_STACK_END(bx)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rbx
+        test  %rbx, %rbx
+        jz    .Ldblf_cr3_okay
+        jns   .Ldblf_cr3_load
+        neg   %rbx
+.Ldblf_cr3_load:
+        write_cr3 rbx, rdi, rsi
+.Ldblf_cr3_okay:
+
         movq  %rsp,%rdi
         call  do_double_fault
         BUG   /* do_double_fault() shouldn't return. */
@@ -603,10 +721,28 @@ ENTRY(nmi)
         movl  $TRAP_nmi,4(%rsp)
 handle_ist_exception:
         SAVE_ALL CLAC
+
+        GET_STACK_END(14)
+        mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
+        mov   %rcx, %r15
+        neg   %rcx
+        jz    .List_cr3_okay
+        jns   .List_cr3_load
+        mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+        neg   %rcx
+.List_cr3_load:
+        write_cr3 rcx, rdi, rsi
+        movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+.List_cr3_okay:
+
         CR4_PV32_RESTORE
         testb $3,UREGS_cs(%rsp)
         jz    1f
-        /* Interrupted guest context. Copy the context to stack bottom. */
+        /*
+         * Interrupted guest context. Clear the restore value for xen_cr3
+         * and copy the context to stack bottom.
+         */
+        xor   %r15, %r15
         GET_CPUINFO_FIELD(guest_cpu_user_regs,di)
         movq  %rsp,%rsi
         movl  $UREGS_kernel_sizeof/8,%ecx
@@ -616,6 +752,7 @@ handle_ist_exception:
         movzbl UREGS_entry_vector(%rsp),%eax
         leaq  exception_table(%rip),%rdx
         callq *(%rdx,%rax,8)
+        mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         cmpb  $TRAP_nmi,UREGS_entry_vector(%rsp)
         jne   ret_from_intr
 
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -93,9 +93,29 @@ void ret_from_intr(void);
         UNLIKELY_DONE(mp, tag);   \
         __UNLIKELY_END(tag)
 
+        .equ .Lax, -1
+        .equ .Lcx, -1
+        .equ .Ldx, -1
+        .equ .Lbx, -1
+        .equ .Lbp, -1
+        .equ .Lsi, -1
+        .equ .Ldi, -1
+        .equ .L8, 8
+        .equ .L9, 9
+        .equ .L10, 10
+        .equ .L11, 11
+        .equ .L12, 12
+        .equ .L13, 13
+        .equ .L14, 14
+        .equ .L15, 15
+
 #define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field)
 #define GET_STACK_END(reg)                        \
+        .if .L##reg > 0;                          \
+        movq $STACK_SIZE-1, %r##reg;              \
+        .else;                                    \
         movl $STACK_SIZE-1, %e##reg;              \
+        .endif;                                   \
         orq  %rsp, %r##reg
 
 #define GET_CPUINFO_FIELD(field, reg)             \
@@ -177,6 +197,15 @@ void ret_from_intr(void);
 #define ASM_STAC ASM_AC(STAC)
 #define ASM_CLAC ASM_AC(CLAC)
 
+.macro write_cr3 val:req, tmp1:req, tmp2:req
+        mov   %cr4, %\tmp1
+        mov   %\tmp1, %\tmp2
+        and   $~X86_CR4_PGE, %\tmp1
+        mov   %\tmp1, %cr4
+        mov   %\val, %cr3
+        mov   %\tmp2, %cr4
+.endm
+
 #define CR4_PV32_RESTORE                                           \
         667: ASM_NOP5;                                             \
         .pushsection .altinstr_replacement, "ax";                  \
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -41,6 +41,8 @@ struct cpu_info {
     struct vcpu *current_vcpu;
     unsigned long per_cpu_offset;
     unsigned long cr4;
+    unsigned long xen_cr3;
+    unsigned long pv_cr3;
     /* get_stack_bottom() must be 16-byte aligned */
 };
 
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -462,6 +462,7 @@ extern idt_entry_t idt_table[];
 extern idt_entry_t *idt_tables[];
 
 DECLARE_PER_CPU(struct tss_struct, init_tss);
+DECLARE_PER_CPU(root_pgentry_t *, root_pgt);
 
 extern void init_int80_direct_trap(struct vcpu *v);
 
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -24,8 +24,8 @@
 /* These are architectural limits. Current CPUs support only 40-bit phys. */
 #define PADDR_BITS              52
 #define VADDR_BITS              48
-#define PADDR_MASK              ((1UL << PADDR_BITS)-1)
-#define VADDR_MASK              ((1UL << VADDR_BITS)-1)
+#define PADDR_MASK              ((_AC(1,UL) << PADDR_BITS)-1)
+#define VADDR_MASK              ((_AC(1,UL) << VADDR_BITS)-1)
 
 #define VADDR_TOP_BIT           (1UL << (VADDR_BITS - 1))
 #define CANONICAL_MASK          (~0UL & ~VADDR_MASK)
@@ -107,6 +107,7 @@ typedef l4_pgentry_t root_pgentry_t;
       : (((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT) ||  \
          ((_s) > ROOT_PAGETABLE_LAST_XEN_SLOT)))
 
+#define root_table_offset         l4_table_offset
 #define root_get_pfn              l4e_get_pfn
 #define root_get_flags            l4e_get_flags
 #define root_get_intpte           l4e_get_intpte




* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
@ 2018-01-12 15:23 ` George Dunlap
  2018-01-12 15:53 ` Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: George Dunlap @ 2018-01-12 15:23 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, security, Andrew Cooper

On Fri, Jan 12, 2018 at 10:19 AM, Jan Beulich <JBeulich@suse.com> wrote:
> This is a very simplistic change limiting the amount of memory a running
> 64-bit PV guest has mapped (and hence available for attacking): Only the
> mappings of stack, IDT, and TSS are being cloned from the direct map
> into per-CPU page tables. Guest controlled parts of the page tables are
> being copied into those per-CPU page tables upon entry into the guest.
> Cross-vCPU synchronization of top level page table entry changes is
> being effected by forcing other active vCPU-s of the guest into the
> hypervisor.
>
> The change to context_switch() isn't strictly necessary, but there's no
> reason to keep switching page tables once a PV guest is being scheduled
> out.
>
> There is certainly much room for improvement, especially of performance,
> here - first and foremost suppressing all the negative effects on AMD
> systems. But in the interest of backportability (including to really old
> hypervisors, which may not even have alternative patching) any such is
> being left out here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I did some quick tests of a Xen hypervisor build, comparing the
different options:  PVH guest, PV guest (unpatched), PV guest
(patched), and PV under Vixen (in HVM mode).  Same guest kernel (Linux
4.14), CentOS 6 host, guest with 2 vcpus and 512MiB of RAM.  This
should be a worst-case for overheads.

Quick results:
* PVH: 52s
* PV unmodified: 68s
* PV under Vixen: 90s
* PV with this patch: 93s

So at least in this particular case, the performance of this patch
is on par with the Vixen "pvshim" approach.  (Haven't tried with
Comet.)

 -George


* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
  2018-01-12 15:23 ` George Dunlap
@ 2018-01-12 15:53 ` Jan Beulich
  2018-01-12 17:02 ` Roger Pau Monné
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Jan Beulich @ 2018-01-12 15:53 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, security

>>> On 12.01.18 at 11:19, <JBeulich@suse.com> wrote:
> @@ -727,6 +907,9 @@ static int cpu_smpboot_alloc(unsigned in
>      set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
>      set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
>  
> +    if ( !setup_cpu_root_pgt(cpu) )
> +        goto oom;

As George has noticed, having done this work on a meanwhile pretty
old code base, I didn't notice before submitting that the label is now
named "out".

Jan



* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
  2018-01-12 15:23 ` George Dunlap
  2018-01-12 15:53 ` Jan Beulich
@ 2018-01-12 17:02 ` Roger Pau Monné
  2018-01-12 17:12   ` Jan Beulich
  2018-01-13 17:48 ` Andrew Cooper
  2018-01-15 16:54 ` Rich Persaud
  4 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2018-01-12 17:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, security, Andrew Cooper

On Fri, Jan 12, 2018 at 03:19:38AM -0700, Jan Beulich wrote:
> This is a very simplistic change limiting the amount of memory a running
> 64-bit PV guest has mapped (and hence available for attacking): Only the
> mappings of stack, IDT, and TSS are being cloned from the direct map
> into per-CPU page tables. Guest controlled parts of the page tables are
> being copied into those per-CPU page tables upon entry into the guest.
> Cross-vCPU synchronization of top level page table entry changes is
> being effected by forcing other active vCPU-s of the guest into the
> hypervisor.
> 
> The change to context_switch() isn't strictly necessary, but there's no
> reason to keep switching page tables once a PV guest is being scheduled
> out.
> 
> There is certainly much room for improvement, especially of performance,
> here - first and foremost suppressing all the negative effects on AMD
> systems. But in the interest of backportability (including to really old
> hypervisors, which may not even have alternative patching) any such is
> being left out here.

Thanks. I bet many people will be very grateful for this.

Not many comments since I'm not familiar with the code.

> @@ -799,6 +982,10 @@ void __init smp_prepare_cpus(unsigned in
>  
>      stack_base[0] = stack_start;
>  
> +    if ( !setup_cpu_root_pgt(0) )
> +        panic("No memory for root page table\n");
> +    get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));

Wouldn't it be helpful to have a command line option to decide whether
to enable this feature or not?

Roger.


* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 17:02 ` Roger Pau Monné
@ 2018-01-12 17:12   ` Jan Beulich
  2018-01-12 17:34     ` Stefano Stabellini
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-01-12 17:12 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: Andrew Cooper, security, xen-devel

>>> On 12.01.18 at 18:02, <roger.pau@citrix.com> wrote:
> On Fri, Jan 12, 2018 at 03:19:38AM -0700, Jan Beulich wrote:
>> @@ -799,6 +982,10 @@ void __init smp_prepare_cpus(unsigned in
>>  
>>      stack_base[0] = stack_start;
>>  
>> +    if ( !setup_cpu_root_pgt(0) )
>> +        panic("No memory for root page table\n");
>> +    get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
> 
> Wouldn't it be helpful to have a command line option to decide whether
> to enable this feature or not?

Well, that would be an option, but it falls into the optimization
category. Once the basic concept has been proven by a 2nd party to
have no obvious flaws, working on such improvements, along with
backporting some of them, would be my goal, but that comes behind
looking at the Spectre patches (i.e. I first want to get all fixes
sorted, and then deal with improvements).
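
For reference, a sketch of what such a toggle could look like (purely
illustrative, the option and variable names are invented here, and the
entry/exit paths would additionally need to cope with the per-CPU page
tables being absent):

static bool __read_mostly opt_pv_pt_isolation = true;
boolean_param("pv-pt-isolation", opt_pv_pt_isolation);

    /* e.g. in smp_prepare_cpus(): */
    if ( opt_pv_pt_isolation && !setup_cpu_root_pgt(0) )
        panic("No memory for root page table\n");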

Jan



* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 17:12   ` Jan Beulich
@ 2018-01-12 17:34     ` Stefano Stabellini
  2018-01-12 17:39       ` Andrew Cooper
  0 siblings, 1 reply; 12+ messages in thread
From: Stefano Stabellini @ 2018-01-12 17:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, security, xen-devel, Roger Pau Monné

On Fri, 12 Jan 2018, Jan Beulich wrote:
> >>> On 12.01.18 at 18:02, <roger.pau@citrix.com> wrote:
> > On Fri, Jan 12, 2018 at 03:19:38AM -0700, Jan Beulich wrote:
> >> @@ -799,6 +982,10 @@ void __init smp_prepare_cpus(unsigned in
> >>  
> >>      stack_base[0] = stack_start;
> >>  
> >> +    if ( !setup_cpu_root_pgt(0) )
> >> +        panic("No memory for root page table\n");
> >> +    get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
> > 
> > Wouldn't it be helpful to have a command line option to decide whether
> > to enable this feature or not?
> 
> Well, that would be an option, but it falls into the optimization
> category. Once the basic concept has been proven by a 2nd party to
> have no obvious flaws, working on such improvements, along with
> backporting some of them, would be my goal, but that comes behind
> looking at the Spectre patches (i.e. I first want to get all fixes
> sorted, and then deal with improvements).

That's amazing work, Jan. Thank you! Finally, a fix I can deploy. As
soon as this patch is properly verified, I think we should update all
references to Meltdown in our docs and advisories to point to this fix.


* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 17:34     ` Stefano Stabellini
@ 2018-01-12 17:39       ` Andrew Cooper
  0 siblings, 0 replies; 12+ messages in thread
From: Andrew Cooper @ 2018-01-12 17:39 UTC (permalink / raw)
  To: Stefano Stabellini, Jan Beulich; +Cc: xen-devel, security, Roger Pau Monné

On 12/01/18 17:34, Stefano Stabellini wrote:
> On Fri, 12 Jan 2018, Jan Beulich wrote:
>>>>> On 12.01.18 at 18:02, <roger.pau@citrix.com> wrote:
>>> On Fri, Jan 12, 2018 at 03:19:38AM -0700, Jan Beulich wrote:
>>>> @@ -799,6 +982,10 @@ void __init smp_prepare_cpus(unsigned in
>>>>  
>>>>      stack_base[0] = stack_start;
>>>>  
>>>> +    if ( !setup_cpu_root_pgt(0) )
>>>> +        panic("No memory for root page table\n");
>>>> +    get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
>>> Wouldn't it be helpful to have a command line option to decide whether
>>> to enable this feature or not?
>> Well, that would be an option, but it falls into the optimization
>> category. Once the basic concept has been proven by a 2nd party to
>> have no obvious flaws, working on such improvements, along with
>> backporting some of them, would be my goal, but that comes behind
>> looking at the Spectre patches (i.e. I first want to get all fixes
>> sorted, and then deal with improvements).
> That's amazing work, Jan. Thank you! Finally, a fix I can deploy. As
> soon as this patch is properly verified, I think we should update all
> references to Meltdown in our docs and advisories to point to this fix.

This is far from a complete fix, but if it works, it is a good start.

I'm currently reviewing/investigating and seeing how easy it might be to
merge with my KAISER series (which is several months closer towards a
complete fix).

~Andrew


* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
                   ` (2 preceding siblings ...)
  2018-01-12 17:02 ` Roger Pau Monné
@ 2018-01-13 17:48 ` Andrew Cooper
  2018-01-15  7:59   ` Jan Beulich
  2018-01-15 16:54 ` Rich Persaud
  4 siblings, 1 reply; 12+ messages in thread
From: Andrew Cooper @ 2018-01-13 17:48 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: security

On 12/01/18 10:19, Jan Beulich wrote:
> This is a very simplistic change limiting the amount of memory a running
> 64-bit PV guest has mapped (and hence available for attacking): Only the
> mappings of stack, IDT, and TSS are being cloned from the direct map
> into per-CPU page tables. Guest controlled parts of the page tables are
> being copied into those per-CPU page tables upon entry into the guest.
> Cross-vCPU synchronization of top level page table entry changes is
> being effected by forcing other active vCPU-s of the guest into the
> hypervisor.
>
> The change to context_switch() isn't strictly necessary, but there's no
> reason to keep switching page tables once a PV guest is being scheduled
> out.
>
> There is certainly much room for improvement, especially of performance,
> here - first and foremost suppressing all the negative effects on AMD
> systems. But in the interest of backportability (including to really old
> hypervisors, which may not even have alternative patching) any such is
> being left out here.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I could do with the answer to my question at the end before completing a
review.  In the meantime, some observations.

> ---
> TBD: Is forcing an event check interrupt for synchronization purposes
> enough? It may be necessary to actually wait for remote vCPU-s to have
> touched into the hypervisor, in which case a function-call-IPI should be
> sent, with an empty handler (a flush-IPI with zero operation mask would
> also do). Otoh, if the vCPU isn't already in hypervisor context,
> delivery of the IPI should be almost instant (as interrupts are always
> enabled while in guest mode).

From a vcpu consistency point of view, once the hypercall making this
change returns, no other vcpus should have executed an instruction with
a stale view of the L4.

Therefore, I think you need to wait until the IPI has at least called
into hypervisor context before releasing the current vcpu, safe in the
knowledge that the update will be picked up on the way back out.

> ---
> Backporting notes:
> - This needs f9eb74789a ("x86/entry: Remove support for partial
>   cpu_user_regs frames") as a prereq, due to the uses of %r14 and %r15.
>   But that's intended to be backported anyway (for Spectre/SP2).
> - The use of "root" instead of "l4" here is mainly to not make 5-level
>   page table additions any harder. In backports "l4" should probably be
>   preferred.
>
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -3683,6 +3683,20 @@ long do_mmu_update(
>                          break;
>                      rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
>                                        cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
> +                    if ( !rc )

Perhaps && (d->max_vcpus > 1) ?

> +                    {
> +                        /*
> +                         * Force other vCPU-s of the affected guest to pick up
> +                         * the change (if any).
> +                         */
> +                        unsigned int cpu = smp_processor_id();
> +                        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
> +
> +                        cpumask_andnot(mask, pt_owner->domain_dirty_cpumask,
> +                                       cpumask_of(cpu));

cpumask_copy() and __clear_bit(, cpu) is probably faster?
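
For illustration, that alternative would be roughly (sketch):

    cpumask_copy(mask, pt_owner->domain_dirty_cpumask);
    cpumask_clear_cpu(cpu, mask); /* or the non-atomic __clear_bit() form */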

> +                        if ( !cpumask_empty(mask) )
> +                            smp_send_event_check_mask(mask);
> +                    }

In terms of performance, if this shadowing/sync algorithm is correct,
then it would be better to defer the IPI until after the update loop. 
We only need to force other vcpus once per mmu_update hypercall if there is
an L4 update, rather than for each L4 update in the batch.
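
For illustration, a rough sketch of that restructuring (details assumed,
not the submitted code):

    /* In the PGT_l4_page_table case, only record that an L4 changed: */
    if ( !rc )
        sync_guest = true;   /* bool declared ahead of the request loop */

    /* Once, after the request loop has finished: */
    if ( sync_guest )
    {
        unsigned int cpu = smp_processor_id();
        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);

        cpumask_andnot(mask, pt_owner->domain_dirty_cpumask,
                       cpumask_of(cpu));
        if ( !cpumask_empty(mask) )
            smp_send_event_check_mask(mask);
    }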

>                      break;
>  
>                  case PGT_writable_page:
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -327,6 +327,9 @@ void start_secondary(void *unused)
>       */
>      spin_debug_disable();
>  
> +    get_cpu_info()->xen_cr3 = 0;
> +    get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt));
> +
>      load_system_tables();
>  
>      /* Full exception support from here on in. */
> @@ -633,6 +636,181 @@ void cpu_exit_clear(unsigned int cpu)
>      set_cpu_state(CPU_STATE_DEAD);
>  }
>  
> +static bool clone_mapping(const void *ptr, root_pgentry_t *rpt)

Could we introduce these functions with ints and use -ENOMEM?

> +{
> +    unsigned long linear = (unsigned long)ptr, pfn;
> +    unsigned int flags;
> +    l3_pgentry_t *l3t = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]);
> +    l2_pgentry_t *l2t;
> +    l1_pgentry_t *l1t;
> +
> +    if ( linear < DIRECTMAP_VIRT_START )
> +        return true;
> +
> +    flags = l3e_get_flags(l3t[l3_table_offset(linear)]);
> +    ASSERT(flags & _PAGE_PRESENT);
> +    if ( flags & _PAGE_PSE )
> +    {
> +        pfn = (l3e_get_pfn(l3t[l3_table_offset(linear)]) &
> +               ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
> +              (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));

This logic would be easier to read by having an extra

l3_pgentry_t *l3e = &l3t[l3_table_offset(linear)];

broken out.  Conversely, I can't think of a cleaner way to express the
pfn calculation, despite the fact it is very complicated.
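
For illustration, with the pointer broken out the same logic would read
(sketch):

    l3_pgentry_t *l3e = &l3t[l3_table_offset(linear)];

    flags = l3e_get_flags(*l3e);
    ASSERT(flags & _PAGE_PRESENT);
    if ( flags & _PAGE_PSE )
    {
        pfn = (l3e_get_pfn(*l3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
              (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
        flags &= ~_PAGE_PSE;
    }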

> +        flags &= ~_PAGE_PSE;

I presume we don't care for shuffling caching attributes?  This should
only really be called on WB memory.

> +    }
> +    else
> +    {
> +        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
> +        flags = l2e_get_flags(l2t[l2_table_offset(linear)]);
> +        ASSERT(flags & _PAGE_PRESENT);
> +        if ( flags & _PAGE_PSE )
> +        {
> +            pfn = (l2e_get_pfn(l2t[l2_table_offset(linear)]) &
> +                   ~((1UL << PAGETABLE_ORDER) - 1)) |
> +                  (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
> +            flags &= ~_PAGE_PSE;
> +        }
> +        else
> +        {
> +            l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
> +            flags = l1e_get_flags(l1t[l1_table_offset(linear)]);
> +            if ( !(flags & _PAGE_PRESENT) )
> +                return true;
> +            pfn = l1e_get_pfn(l1t[l1_table_offset(linear)]);
> +        }
> +    }
> +
> +    if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
> +    {
> +        l3t = alloc_xen_pagetable();
> +        if ( !l3t )
> +            return false;
> +        clear_page(l3t);
> +        l4e_write(&rpt[root_table_offset(linear)],
> +                  l4e_from_paddr(__pa(l3t), __PAGE_HYPERVISOR));
> +    }
> +    else
> +        l3t = l4e_to_l3e(rpt[root_table_offset(linear)]);
> +
> +    if ( !(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PRESENT) )
> +    {
> +        l2t = alloc_xen_pagetable();
> +        if ( !l2t )
> +            return false;
> +        clear_page(l2t);
> +        l3e_write(&l3t[l3_table_offset(linear)],
> +                  l3e_from_paddr(__pa(l2t), __PAGE_HYPERVISOR));
> +    }
> +    else
> +    {
> +        ASSERT(!(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PSE));
> +        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
> +    }
> +
> +    if ( !(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PRESENT) )
> +    {
> +        l1t = alloc_xen_pagetable();
> +        if ( !l1t )
> +            return false;
> +        clear_page(l1t);
> +        l2e_write(&l2t[l2_table_offset(linear)],
> +                  l2e_from_paddr(__pa(l1t), __PAGE_HYPERVISOR));
> +    }
> +    else
> +    {
> +        ASSERT(!(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PSE));
> +        l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
> +    }
> +
> +    if ( l1e_get_flags(l1t[l1_table_offset(linear)]) & _PAGE_PRESENT )
> +    {
> +        ASSERT(l1e_get_pfn(l1t[l1_table_offset(linear)]) == pfn);
> +        ASSERT(l1e_get_flags(l1t[l1_table_offset(linear)]) == flags);

Calculate l1e_from_pfn(pfn, flags) first and ASSERT() that the full PTE
matches?
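
For illustration, something like (sketch):

    l1_pgentry_t pte = l1e_from_pfn(pfn, flags);

    ASSERT(l1e_get_intpte(l1t[l1_table_offset(linear)]) ==
           l1e_get_intpte(pte));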

> +    }
> +    else
> +        l1e_write(&l1t[l1_table_offset(linear)], l1e_from_pfn(pfn, flags));
> +
> +    return true;
> +}
> +
> +DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
> +
> +static bool setup_cpu_root_pgt(unsigned int cpu)
> +{
> +    root_pgentry_t *rpt = alloc_xen_pagetable();

As an observation, alloc_xen_pagetable() should zero internally.  There
is no circumstance under which we want to forget the clear_page().

Another issue which I attempted to address in my series is that
alloc_xen_pagetable() isn't numa-local.  It's not worth adjusting for
backports, but it is something we should consider moving forwards.
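
For illustration, folding the zeroing into the allocator could look
roughly like this (simplified sketch, the real allocator also has an
early-boot path):

void *alloc_xen_pagetable(void)
{
    void *pt = alloc_xenheap_page();

    if ( pt )
        clear_page(pt);

    return pt;
}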

> +    unsigned int off;
> +
> +    if ( !rpt )
> +        return false;
> +
> +    clear_page(rpt);
> +    per_cpu(root_pgt, cpu) = rpt;
> +
> +    rpt[root_table_offset(RO_MPT_VIRT_START)] =
> +        idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
> +    /* SH_LINEAR_PT inserted together with guest mappings. */
> +    /* PERDOMAIN inserted during context switch. */
> +    rpt[root_table_offset(XEN_VIRT_START)] =
> +        idle_pg_table[root_table_offset(XEN_VIRT_START)];
> +
> +    /* Install direct map page table entries for stack, IDT, and TSS. */
> +    for ( off = 0; off < STACK_SIZE; off += PAGE_SIZE )
> +        if ( !clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt) )
> +            break;
> +
> +    return off == STACK_SIZE &&
> +           clone_mapping(idt_tables[cpu], rpt) &&
> +           clone_mapping(&per_cpu(init_tss, cpu), rpt);

Can we put an outer set of brackets in, so editors retain the
indentation like this?
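
I.e. (illustration):

    return (off == STACK_SIZE &&
            clone_mapping(idt_tables[cpu], rpt) &&
            clone_mapping(&per_cpu(init_tss, cpu), rpt));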

> --- a/xen/include/asm-x86/current.h
> +++ b/xen/include/asm-x86/current.h
> @@ -41,6 +41,8 @@ struct cpu_info {
>      struct vcpu *current_vcpu;
>      unsigned long per_cpu_offset;
>      unsigned long cr4;
> +    unsigned long xen_cr3;
> +    unsigned long pv_cr3;

These definitely need more description of how they work.

As far as I've reverse engineered, pv_cr3 is the paddr_t for the per-cpu
root pagetable, and is static after a cpu is up and operational.

xen_cr3, if not 0, is a cr3 to restore on the way into the hypervisor?

~Andrew

>      /* get_stack_bottom() must be 16-byte aligned */
>  };
>  



* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-13 17:48 ` Andrew Cooper
@ 2018-01-15  7:59   ` Jan Beulich
  0 siblings, 0 replies; 12+ messages in thread
From: Jan Beulich @ 2018-01-15  7:59 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, security

>>> On 13.01.18 at 18:48, <andrew.cooper3@citrix.com> wrote:
> On 12/01/18 10:19, Jan Beulich wrote:
>> This is a very simplistic change limiting the amount of memory a running
>> 64-bit PV guest has mapped (and hence available for attacking): Only the
>> mappings of stack, IDT, and TSS are being cloned from the direct map
>> into per-CPU page tables. Guest controlled parts of the page tables are
>> being copied into those per-CPU page tables upon entry into the guest.
>> Cross-vCPU synchronization of top level page table entry changes is
>> being effected by forcing other active vCPU-s of the guest into the
>> hypervisor.
>>
>> The change to context_switch() isn't strictly necessary, but there's no
>> reason to keep switching page tables once a PV guest is being scheduled
>> out.
>>
>> There is certainly much room for improvement, especially of performance,
>> here - first and foremost suppressing all the negative effects on AMD
>> systems. But in the interest of backportability (including to really old
>> hypervisors, which may not even have alternative patching) any such is
>> being left out here.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> I could do with the answer to my question at the end before completing a
> review.  In the meantime, some observations.
> 
>> ---
>> TBD: Is forcing an event check interrupt for synchronization purposes
>> enough? It may be necessary to actually wait for remote vCPU-s to have
>> touched into the hypervisor, in which case a function-call-IPI should be
>> sent, with an empty handler (a flush-IPI with zero operation mask would
>> also do). Otoh, if the vCPU isn't already in hypervisor context,
>> delivery of the IPI should be almost instant (as interrupts are always
>> enabled while in guest mode).
> 
> From a vcpu consistency point of view, once the hypercall making this
> change returns, no other vcpus should have executed an instruction with
> a stale view of the L4.
> 
> Therefore, I think you need to wait until the IPI has at least called
> into hypervisor context before releasing the current vcpu, safe in the
> knowledge that the update will be picked up on the way back out.

Okay, I'll switch to an empty-mask flush IPI then, unless you strongly
think the call-func one would be better.

>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -3683,6 +3683,20 @@ long do_mmu_update(
>>                          break;
>>                      rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
>>                                        cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
>> +                    if ( !rc )
> 
> Perhaps && (d->max_vcpus > 1) ?

Could do, but the removal of the CPU we're running on (below) will
take care of avoiding the IPI anyway. I would prefer to not put in
redundant conditions, but if you strongly feel about this, I can add
it.

>> +                    {
>> +                        /*
>> +                         * Force other vCPU-s of the affected guest to pick up
>> +                         * the change (if any).
>> +                         */
>> +                        unsigned int cpu = smp_processor_id();
>> +                        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
>> +
>> +                        cpumask_andnot(mask, pt_owner->domain_dirty_cpumask,
>> +                                       cpumask_of(cpu));
> 
> cpumask_copy() and __clear_bit(, cpu) is probably faster?

I'm not sure, and hence I'd prefer to keep it this way. But again,
if you feel strongly, I can make the change.

>> +                        if ( !cpumask_empty(mask) )
>> +                            smp_send_event_check_mask(mask);
>> +                    }
> 
> In terms of performance, if this shadowing/sync algorithm is correct,
> then it would be better to defer the IPI until after the update loop. 
> We only need to force other vcpus once per mmu_update hypercall if there is
> an L4 update, rather than for each L4 update in the batch.

Oh, that's a very good point. Let me do that right away (together
with switching the IPI kind).

>> @@ -633,6 +636,181 @@ void cpu_exit_clear(unsigned int cpu)
>>      set_cpu_state(CPU_STATE_DEAD);
>>  }
>>  
>> +static bool clone_mapping(const void *ptr, root_pgentry_t *rpt)
> 
> Could we introduce these functions with ints and use -ENOMEM?

No problem.

>> +{
>> +    unsigned long linear = (unsigned long)ptr, pfn;
>> +    unsigned int flags;
>> +    l3_pgentry_t *l3t = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]);
>> +    l2_pgentry_t *l2t;
>> +    l1_pgentry_t *l1t;
>> +
>> +    if ( linear < DIRECTMAP_VIRT_START )
>> +        return true;
>> +
>> +    flags = l3e_get_flags(l3t[l3_table_offset(linear)]);
>> +    ASSERT(flags & _PAGE_PRESENT);
>> +    if ( flags & _PAGE_PSE )
>> +    {
>> +        pfn = (l3e_get_pfn(l3t[l3_table_offset(linear)]) &
>> +               ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
>> +              (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
> 
> This logic would be easier to read by having an extra
> 
> l3_pgentry_t *l3e = &l3t[l3_table_offset(linear)];
> 
> broken out.  Conversely, I can't think of a cleaner way to express the
> pfn calculation, despite the fact it is very complicated.

Will do.

>> +        flags &= ~_PAGE_PSE;
> 
> I presume we don't care for shuffling caching attributes?  This should
> only really be called on WB memory.

None of the mappings the function is being called for is other than
WB, and no domain (not even Dom0) can gain access to these
pages in order to fiddle with their cache attributes. In order to not
make the code less readable I've preferred to leave out respective
ASSERT()s.
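
For illustration, the kind of ASSERT() being alluded to would be along
the lines of:

    ASSERT(!(flags & (_PAGE_PWT | _PAGE_PCD)));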

>> +    }
>> +    else
>> +    {
>> +        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
>> +        flags = l2e_get_flags(l2t[l2_table_offset(linear)]);
>> +        ASSERT(flags & _PAGE_PRESENT);
>> +        if ( flags & _PAGE_PSE )
>> +        {
>> +            pfn = (l2e_get_pfn(l2t[l2_table_offset(linear)]) &
>> +                   ~((1UL << PAGETABLE_ORDER) - 1)) |
>> +                  (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
>> +            flags &= ~_PAGE_PSE;
>> +        }
>> +        else
>> +        {
>> +            l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
>> +            flags = l1e_get_flags(l1t[l1_table_offset(linear)]);
>> +            if ( !(flags & _PAGE_PRESENT) )
>> +                return true;
>> +            pfn = l1e_get_pfn(l1t[l1_table_offset(linear)]);
>> +        }
>> +    }
>> +
>> +    if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
>> +    {
>> +        l3t = alloc_xen_pagetable();
>> +        if ( !l3t )
>> +            return false;
>> +        clear_page(l3t);
>> +        l4e_write(&rpt[root_table_offset(linear)],
>> +                  l4e_from_paddr(__pa(l3t), __PAGE_HYPERVISOR));
>> +    }
>> +    else
>> +        l3t = l4e_to_l3e(rpt[root_table_offset(linear)]);
>> +
>> +    if ( !(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PRESENT) )
>> +    {
>> +        l2t = alloc_xen_pagetable();
>> +        if ( !l2t )
>> +            return false;
>> +        clear_page(l2t);
>> +        l3e_write(&l3t[l3_table_offset(linear)],
>> +                  l3e_from_paddr(__pa(l2t), __PAGE_HYPERVISOR));
>> +    }
>> +    else
>> +    {
>> +        ASSERT(!(l3e_get_flags(l3t[l3_table_offset(linear)]) & _PAGE_PSE));
>> +        l2t = l3e_to_l2e(l3t[l3_table_offset(linear)]);
>> +    }
>> +
>> +    if ( !(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PRESENT) )
>> +    {
>> +        l1t = alloc_xen_pagetable();
>> +        if ( !l1t )
>> +            return false;
>> +        clear_page(l1t);
>> +        l2e_write(&l2t[l2_table_offset(linear)],
>> +                  l2e_from_paddr(__pa(l1t), __PAGE_HYPERVISOR));
>> +    }
>> +    else
>> +    {
>> +        ASSERT(!(l2e_get_flags(l2t[l2_table_offset(linear)]) & _PAGE_PSE));
>> +        l1t = l2e_to_l1e(l2t[l2_table_offset(linear)]);
>> +    }
>> +
>> +    if ( l1e_get_flags(l1t[l1_table_offset(linear)]) & _PAGE_PRESENT )
>> +    {
>> +        ASSERT(l1e_get_pfn(l1t[l1_table_offset(linear)]) == pfn);
>> +        ASSERT(l1e_get_flags(l1t[l1_table_offset(linear)]) == flags);
> 
> Calculate l1e_from_pfn(pfn, flags) first and ASSERT() that the full PTE
> matches?

If this wasn't about ASSERT()s, I'd probably agree. It being
debugging code only, I think there's an advantage to having two
separate ASSERT() - you'll know at the first glance whether it
was the PFN or the flags that were wrong.

>> +    }
>> +    else
>> +        l1e_write(&l1t[l1_table_offset(linear)], l1e_from_pfn(pfn, flags));
>> +
>> +    return true;
>> +}
>> +
>> +DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
>> +
>> +static bool setup_cpu_root_pgt(unsigned int cpu)
>> +{
>> +    root_pgentry_t *rpt = alloc_xen_pagetable();
> 
> As an observation, alloc_xen_pagetable() should zero internally.  There
> is no circumstance under which we want to forget the clear_page().

Indeed I had noticed this too, and was planning to create a
respective follow-up patch.

> Another issue which I attempted to address in my series is that
> alloc_xen_pagetable() isn't numa-local.  It's not worth adjusting for
> backports, but it is something we should consider moving forwards.

I agree.

>> +    unsigned int off;
>> +
>> +    if ( !rpt )
>> +        return false;
>> +
>> +    clear_page(rpt);
>> +    per_cpu(root_pgt, cpu) = rpt;
>> +
>> +    rpt[root_table_offset(RO_MPT_VIRT_START)] =
>> +        idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
>> +    /* SH_LINEAR_PT inserted together with guest mappings. */
>> +    /* PERDOMAIN inserted during context switch. */
>> +    rpt[root_table_offset(XEN_VIRT_START)] =
>> +        idle_pg_table[root_table_offset(XEN_VIRT_START)];
>> +
>> +    /* Install direct map page table entries for stack, IDT, and TSS. */
>> +    for ( off = 0; off < STACK_SIZE; off += PAGE_SIZE )
>> +        if ( !clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt) )
>> +            break;
>> +
>> +    return off == STACK_SIZE &&
>> +           clone_mapping(idt_tables[cpu], rpt) &&
>> +           clone_mapping(&per_cpu(init_tss, cpu), rpt);
> 
> Can we put an outer set of brackets in, so editors retain the
> indentation like this?

Grumble, grumble - well, if you really want me to. If I saw
outer parentheses here when reviewing someone else's patch,
I'd comment exactly the other way around.

>> --- a/xen/include/asm-x86/current.h
>> +++ b/xen/include/asm-x86/current.h
>> @@ -41,6 +41,8 @@ struct cpu_info {
>>      struct vcpu *current_vcpu;
>>      unsigned long per_cpu_offset;
>>      unsigned long cr4;
>> +    unsigned long xen_cr3;
>> +    unsigned long pv_cr3;
> 
> These definitely need more description of how they work.

Will do.

> As far as I've reverse engineered, pv_cr3 is the paddr_t for the per-cpu
> root pagetable, and is static after a cpu is up and operational.

It's intentionally _not_ a paddr_t, but a raw CR3 value. As you'll
note, both low and high bits are being stripped off of it before
using it for the copying.

> xen_cr3, if not 0, is a cr3 to restore on the way into the hypervisor?

With the caveat that it may also be negative, which tells the
restore-to-Xen path to skip the restore, while still allowing
entry paths to know what value to restore (namely the negation
of what's stored). As agreed above, I'll try to put in a sensible
comment.
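
Something along these lines, perhaps (wording still to be refined):

    /*
     * CR3 to restore on the next entry into Xen, if non-zero. A negative
     * value tells the restore-to-Xen path to skip the restore, while
     * entry paths can still recover the intended value by negating it.
     */
    unsigned long xen_cr3;
    /*
     * Raw CR3 value (not a paddr_t) referencing this CPU's root page
     * table; static once the CPU is up, with the low and high control
     * bits getting stripped off before it is used for the copying.
     */
    unsigned long pv_cr3;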

Jan


* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
                   ` (3 preceding siblings ...)
  2018-01-13 17:48 ` Andrew Cooper
@ 2018-01-15 16:54 ` Rich Persaud
  2018-01-16  7:12   ` Jan Beulich
  4 siblings, 1 reply; 12+ messages in thread
From: Rich Persaud @ 2018-01-15 16:54 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, security, Andrew Cooper

On Jan 12, 2018, at 05:19, Jan Beulich <JBeulich@suse.com> wrote:
> 
> This is a very simplistic change limiting the amount of memory a running
> 64-bit PV guest has mapped (and hence available for attacking): Only the
> mappings of stack, IDT, and TSS are being cloned from the direct map
> into per-CPU page tables. Guest controlled parts of the page tables are
> being copied into those per-CPU page tables upon entry into the guest.
> Cross-vCPU synchronization of top level page table entry changes is
> being effected by forcing other active vCPU-s of the guest into the
> hypervisor.
> 
> The change to context_switch() isn't strictly necessary, but there's no
> reason to keep switching page tables once a PV guest is being scheduled
> out.
> 
> There is certainly much room for improvement, especially of performance,
> here - first and foremost suppressing all the negative effects on AMD
> systems. But in the interest of backportability (including to really old
> hypervisors, which may not even have alternative patching) any such is
> being left out here.

Thanks for releasing this patch to support use cases not covered by the previous mitigations.  Is there a name or acronym we can use to reference this patch in the FAQ, XSA and other support documents?

Rich

* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-15 16:54 ` Rich Persaud
@ 2018-01-16  7:12   ` Jan Beulich
  2018-01-16 10:11     ` George Dunlap
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-01-16  7:12 UTC (permalink / raw)
  To: Rich Persaud; +Cc: Andrew Cooper, security, xen-devel

>>> On 15.01.18 at 17:54, <persaur@gmail.com> wrote:
> Thanks for releasing this patch to support use cases not covered by the 
> previous mitigations.  Is there a name or acronym we can use to reference 
> this patch in the FAQ, XSA and other support documents?

I'm against any such naming, but XPTI-light would come to mind.

Jan



* Re: [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests
  2018-01-16  7:12   ` Jan Beulich
@ 2018-01-16 10:11     ` George Dunlap
  0 siblings, 0 replies; 12+ messages in thread
From: George Dunlap @ 2018-01-16 10:11 UTC (permalink / raw)
  To: Jan Beulich, Rich Persaud; +Cc: Andrew Cooper, security, xen-devel

On 01/16/2018 07:12 AM, Jan Beulich wrote:
>>>> On 15.01.18 at 17:54, <persaur@gmail.com> wrote:
>> Thanks for releasing this patch to support use cases not covered by the 
>> previous mitigations.  Is there a name or acronym we can use to reference 
>> this patch in the FAQ, XSA and other support documents?
> 
> I'm against any such naming, but XPTI-light would come to mind.

Rich, I'm afraid we tend to be a pedestrian bunch when it comes to
naming. :-)

I personally much prefer labels that contain enough information to hint
at what the thing being referred to is.  One of the things I really
dislike about, say, OpenStack, is the hundreds of projects they have,
each with a cute name that people throw around, where as an outsider
you have no idea what they're talking about and have no real way to
find out other than brute-force memorization.

The Vixen / Comet thing was a special case.  First of all, Amazon had
already named Vixen; but even so, I initially referred to the two shims
as "the HVM shim" and "the PVH shim".  But when "the HVM shim" began to
get PVH support, and "the PVH shim" successfully got HVM support, those
names started to be inaccurate.  They really were just different
versions of the same thing (even sharing a lot of the same code).  Under
those circumstances, there's not much choice but to give "meaningless"
names.

Hopefully we'll never be in a position of discussing which of two
nearly-identical XPTI patches to start future development from, nor of
presenting two nearly-identical XPTI patchsets to our downstreams.

Since it seems like we may have several iterations of an SP3 solution,
each of which adds more protection or improves performance over the
previous ones, what about adding a number?  Say, "XPTI stage 1"?

Then we can discuss which technical mitigations will be in each 'stage'
that we release.

 -George



Thread overview: 12+ messages
2018-01-12 10:19 [PATCH] x86: Meltdown band-aid against malicious 64-bit PV guests Jan Beulich
2018-01-12 15:23 ` George Dunlap
2018-01-12 15:53 ` Jan Beulich
2018-01-12 17:02 ` Roger Pau Monné
2018-01-12 17:12   ` Jan Beulich
2018-01-12 17:34     ` Stefano Stabellini
2018-01-12 17:39       ` Andrew Cooper
2018-01-13 17:48 ` Andrew Cooper
2018-01-15  7:59   ` Jan Beulich
2018-01-15 16:54 ` Rich Persaud
2018-01-16  7:12   ` Jan Beulich
2018-01-16 10:11     ` George Dunlap
