* [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2018-11-16 10:06 Alexandru Stefan ISAILA
  2018-11-16 17:04 ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-16 10:06 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, Alexandru Stefan ISAILA,
	Anshul Makkar, roger.pau

A new mechanism has been added which is able to generically re-execute
instructions, by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and what instructions
must be re-executed this way; instructions that lie in non-executable
pages and instructions that cause the setting of Accessed and/or Dirty
flags inside page tables are two examples.

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
Signed-off-by: Andrei Lutas <vlutas@bitdefender.com>
Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
---
 xen/arch/x86/domain.c         |   3 +
 xen/arch/x86/hvm/vmx/vmx.c    | 255 ++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/mem_access.c  |  20 ++-
 xen/include/asm-x86/domain.h  |  18 +++
 xen/include/asm-x86/hvm/hvm.h |   2 +
 5 files changed, 295 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 295b10c48c..b0680a76f1 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -343,6 +343,7 @@ int arch_vcpu_create(struct vcpu *v)
     int rc;
 
     v->arch.flags = TF_kernel_mode;
+    v->arch.in_host = 1;
 
     rc = mapcache_vcpu_init(v);
     if ( rc )
@@ -482,6 +483,8 @@ int arch_domain_create(struct domain *d,
     spin_lock_init(&d->arch.e820_lock);
     spin_lock_init(&d->arch.vtsc_lock);
 
+    spin_lock_init(&d->arch.rexec_lock);
+
     /* Minimal initialisation for the idle domain. */
     if ( unlikely(is_idle_domain(d)) )
     {
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 365eeb2886..84f8648fc0 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2289,6 +2289,255 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
     return true;
 }
 
+static int vmx_start_reexecute_instruction(struct vcpu *v, unsigned long gpa,
+                                           xenmem_access_t required_access)
+{
+    /*
+     * NOTE: Some required_accesses may be invalid. For example, one
+     * cannot grant only write access on a given page; read/write
+     * access must be granted instead. These inconsistencies are NOT
+     * checked here. The caller must ensure that "required_access" is
+     * an allowed combination.
+     */
+
+    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;
+    xenmem_access_t old_access, new_access;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    level = v->arch.rexec_level;
+
+    /*
+     * Step 1: Make sure someone else didn't get to start an
+     * instruction re-execution.
+     */
+    for_each_vcpu ( v->domain, a )
+    {
+        /* We're interested in pausing all the VCPUs except self/v. */
+        if ( a == v )
+            continue;
+
+        /*
+         * Check if "a" started an instruction re-execution. If so,
+         * return success, as we'll re-execute our instruction later.
+         */
+        if ( a->arch.rexec_level != 0 )
+        {
+            /* We should be paused. */
+            ret = 0;
+            leave = 1;
+            goto release_and_exit;
+        }
+    }
+
+    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+    if ( level >= REEXECUTION_MAX_DEPTH )
+    {
+        ret = -1;
+        leave = 1;
+        goto release_and_exit;
+    }
+
+    /*
+     * Step 2: Pause all the VCPUs, except self. Note that we have to do
+     * this only if we're at nesting level 0; if we're at a higher level
+     * of nested re-exec, the vcpus are already paused.
+     */
+    if ( level == 0 )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, NO SYNC! We're gonna do our own syncing. */
+            vcpu_pause_nosync(a);
+        }
+
+        /*
+         * Step 3: Wait for all the paused VCPUs to actually leave the VMX
+         * non-root realm and enter VMX root.
+         */
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, synced. */
+            while ( !a->arch.in_host )
+                cpu_relax();
+        }
+    }
+
+    /* Update the re-execution nesting level. */
+    v->arch.rexec_level++;
+
+release_and_exit:
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    /* If we've got errors so far, return. */
+    if ( leave )
+        return ret;
+
+    /*
+     * Step 4: Save the current gpa & old access rights. Also, check if this
+     * is a "double-fault" on the exact same GPA, in which case, we will
+     * promote the rights of this particular GPA, and try again.
+     */
+    for ( i = 0; i < level; i++ )
+    {
+        if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) ==
+             (gpa >> PAGE_SHIFT) )
+        {
+            /* This GPA is already in the queue. */
+            found = 1;
+
+            switch (v->arch.rexec_context[i].cur_access) {
+                case XENMEM_access_r: r = 1; break;
+                case XENMEM_access_w: w = 1; break;
+                case XENMEM_access_x: x = 1; break;
+                case XENMEM_access_rx: r = x = 1; break;
+                case XENMEM_access_wx: w = x = 1;  break;
+                case XENMEM_access_rw: r = w = 1; break;
+                case XENMEM_access_rwx: r = w = x = 1; break;
+                default: break; /* We don't care about any other case. */
+            }
+        }
+    }
+
+    /*
+     * Get the current EPT access rights. They will be restored when we're done.
+     * Note that the restoration is done in reverse-order, in order to ensure
+     * that the original access rights are restored correctly. Otherwise, we may
+     * restore whatever access rights were modified by another re-execution
+     * request, and that would be bad.
+     */
+    if ( p2m_get_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            &old_access, altp2m_idx) != 0 )
+        return -1;
+
+    v->arch.rexec_context[level].gpa = gpa;
+    v->arch.rexec_context[level].old_access = old_access;
+    v->arch.rexec_context[level].old_single_step = v->arch.hvm.single_step;
+
+    /*
+     * Step 5: Mark the GPA with the required access, so we can re-execute
+     * the instruction.
+     */
+    switch ( required_access )
+    {
+        case XENMEM_access_r: r = 1; break;
+        case XENMEM_access_w: w = 1; break;
+        case XENMEM_access_x: x = 1; break;
+        case XENMEM_access_rx: r = x = 1; break;
+        case XENMEM_access_wx: w = x = 1;  break;
+        case XENMEM_access_rw: r = w = 1; break;
+        case XENMEM_access_rwx: r = w = x = 1; break;
+        default: break; /* We don't care about any other case. */
+    }
+
+    /* Now transform our RWX values in a XENMEM_access_* constant. */
+    if ( r == 0 && w == 0 && x == 0 )
+        new_access = XENMEM_access_n;
+    else if ( r == 0 && w == 0 && x == 1 )
+        new_access = XENMEM_access_x;
+    else if ( r == 0 && w == 1 && x == 0 )
+        new_access = XENMEM_access_w;
+    else if ( r == 0 && w == 1 && x == 1 )
+        new_access = XENMEM_access_wx;
+    else if ( r == 1 && w == 0 && x == 0 )
+        new_access = XENMEM_access_r;
+    else if ( r == 1 && w == 0 && x == 1 )
+        new_access = XENMEM_access_rx;
+    else if ( r == 1 && w == 1 && x == 0 )
+        new_access = XENMEM_access_rw;
+    else if ( r == 1 && w == 1 && x == 1 )
+        new_access = XENMEM_access_rwx;
+    else
+        new_access = required_access; /* Should never get here. */
+
+    /* And save the current access rights. */
+    v->arch.rexec_context[level].cur_access = new_access;
+
+    /* Apply the changes inside the EPT. */
+    if ( p2m_set_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            1, 0, MEMOP_CMD_MASK, new_access,
+                            altp2m_idx) != 0 )
+        return -1;
+
+    /*
+     * Step 6: Reconfigure the VMCS, so it suits our needs. We want a
+     * VM-exit to be generated after the instruction has been
+     * successfully re-executed.
+     */
+    if ( level == 0 )
+        v->arch.hvm.single_step = 1;
+
+    /* Step 8: We should be done! */
+
+    return ret;
+}
+
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    if ( v->arch.rexec_level == 0 )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA. */
+    for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+    {
+        if ( v->arch.rexec_context[i].gpa != mfn_x(INVALID_MFN) &&
+             p2m_set_mem_access(v->domain,
+                                _gfn(v->arch.rexec_context[i].gpa >> PAGE_SHIFT),
+                                1, 0, MEMOP_CMD_MASK,
+                                v->arch.rexec_context[i].old_access,
+                                altp2m_idx) != 0 )
+        {
+            ret = -1;
+            return ret;
+        }
+
+        v->arch.rexec_context[i].gpa = 0;
+        v->arch.hvm.single_step = v->arch.rexec_context[i].old_single_step;
+    }
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->arch.rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /*
+     * Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit.
+     */
+
+    /* Step 5: We're done! */
+
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    return ret;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -2324,6 +2573,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .invlpg               = vmx_invlpg,
     .cpu_up               = vmx_cpu_up,
     .cpu_down             = vmx_cpu_down,
+    .start_reexecute_instruction = vmx_start_reexecute_instruction,
     .wbinvd_intercept     = vmx_wbinvd_intercept,
     .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
     .msr_read_intercept   = vmx_msr_read_intercept,
@@ -3590,6 +3840,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     unsigned int vector = 0, mode;
     struct vcpu *v = current;
 
+    v->arch.in_host = 1;
+
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
     __vmread(GUEST_RFLAGS, &regs->rflags);
@@ -4112,6 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm.vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
+        vmx_stop_reexecute_instruction(v);
         if ( v->arch.hvm.single_step )
         {
             hvm_monitor_debug(regs->rip,
@@ -4330,6 +4583,8 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(curr->arch.hvm.vmx.lbr_flags & LBR_FIXUP_MASK) )
         lbr_fixup();
 
+    curr->arch.in_host = 0;
+
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
     __vmwrite(GUEST_RIP,    regs->rip);
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index 2f1295e56a..5ae3a61b5c 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -212,10 +212,11 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
     }
     if ( vm_event_check_ring(d->vm_event_monitor) &&
          d->arch.monitor.inguest_pagefault_disabled &&
-         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+         npfec.kind != npfec_kind_with_gla &&
+         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
     {
-        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
-
+        v->arch.vm_event->emulate_flags = 0;
+        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
         return true;
     }
 
@@ -226,6 +227,7 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         *req_ptr = req;
 
         req->reason = VM_EVENT_REASON_MEM_ACCESS;
+
         req->u.mem_access.gfn = gfn_x(gfn);
         req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
 
@@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
     p2m_access_t a;
     unsigned long gfn_l;
     long rc = 0;
+    struct vcpu *v;
+    int i;
 
     /* altp2m view 0 is treated as the hostp2m */
 #ifdef CONFIG_HVM
@@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
         if ( rc )
             break;
 
+        for_each_vcpu(d, v)
+        {
+            if ( !v->arch.rexec_level )
+                continue;
+
+            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
+                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);
+        }
+
         /* Check for continuation if it's not the last iteration. */
         if ( nr > ++start && !(start & mask) && hypercall_preempt_check() )
         {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 277f99f633..dbb68f108a 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -438,6 +438,8 @@ struct arch_domain
 
     /* Emulated devices enabled bitmap. */
     uint32_t emulation_flags;
+
+    spinlock_t rexec_lock;
 } __cacheline_aligned;
 
 #ifdef CONFIG_HVM
@@ -629,6 +631,22 @@ struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
+#define REEXECUTION_MAX_DEPTH 8
+    struct rexec_context_t {
+        unsigned long gpa;
+        xenmem_access_t old_access;
+        xenmem_access_t cur_access;
+        bool_t old_single_step;
+    } rexec_context[REEXECUTION_MAX_DEPTH];
+
+    int rexec_level;
+
+    /*
+     *  Will be true when the vcpu is in VMX root,
+     * false when it is not.
+     */
+    bool_t in_host;
+
     struct arch_vm_event *vm_event;
 
     struct vcpu_msrs *msrs;
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 3d3250dff0..1f5d43a98d 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -167,6 +167,8 @@ struct hvm_function_table {
 
     int  (*cpu_up)(void);
     void (*cpu_down)(void);
+    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
+                                        xenmem_access_t required_access);
 
     /* Copy up to 15 bytes from cached instruction bytes at current rIP. */
     unsigned int (*get_insn_bytes)(struct vcpu *v, uint8_t *buf);
-- 
2.17.1


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 10:06 [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults Alexandru Stefan ISAILA
@ 2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
                     ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-16 17:04 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

On Fri, Nov 16, 2018 at 10:06:36AM +0000, Alexandru Stefan ISAILA wrote:
> A new mechanism has been added which is able to generically re-execute
> instructions, by temporarily granting permissions inside the EPT and
> re-executing the instruction with all other vcpus paused and with the
> monitor trap flag set. The mechanism is re-entrant, meaning that is
> capable of handling different violations caused by the same instruction.
> Usually, a security appliance will decide when and what instructions
> must be re-executed this way instructions that lie in non-executable
> pages and instructions that cause the setting of Accessed and/or Dirty
> flags inside page tables are two examples.
> 
> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
> Signed-off-by: Andrei Lutas <vlutas@bitdefender.com>
> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
> ---
>  xen/arch/x86/domain.c         |   3 +
>  xen/arch/x86/hvm/vmx/vmx.c    | 255 ++++++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/mem_access.c  |  20 ++-
>  xen/include/asm-x86/domain.h  |  18 +++
>  xen/include/asm-x86/hvm/hvm.h |   2 +
>  5 files changed, 295 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 295b10c48c..b0680a76f1 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -343,6 +343,7 @@ int arch_vcpu_create(struct vcpu *v)
>      int rc;
>  
>      v->arch.flags = TF_kernel_mode;
> +    v->arch.in_host = 1;

This should be a bool (as proposed below), so please use true/false
then.

>  
>      rc = mapcache_vcpu_init(v);
>      if ( rc )
> @@ -482,6 +483,8 @@ int arch_domain_create(struct domain *d,
>      spin_lock_init(&d->arch.e820_lock);
>      spin_lock_init(&d->arch.vtsc_lock);
>  

AFAICT, there's no need to add a newline here.

> +    spin_lock_init(&d->arch.rexec_lock);
> +
>      /* Minimal initialisation for the idle domain. */
>      if ( unlikely(is_idle_domain(d)) )
>      {
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 365eeb2886..84f8648fc0 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2289,6 +2289,255 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
>      return true;
>  }
>  
> +static int vmx_start_reexecute_instruction(struct vcpu *v, unsigned long gpa,
> +                                           xenmem_access_t required_access)
> +{
> +    /*
> +     * NOTE: Some required_accesses may be invalid. For example, one
> +     * cannot grant only write access on a given page; read/write
> +     * access must be granted instead. These inconsistencies are NOT
> +     * checked here. The caller must ensure that "required_access" is
> +     * an allowed combination.
> +     */
> +
> +    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;

There are a bunch of variables that need to be of different type here.

i likely wants to be unsigned, same with level.

found, r, w, x and leave should be bools.
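
I.e. something like:

    unsigned int i, level = 0;
    bool found = false, r = false, w = false, x = false, leave = false;
    int ret = 0;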

> +    xenmem_access_t old_access, new_access;
> +    struct vcpu *a;
> +    unsigned int altp2m_idx =
> +        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
> +
> +    spin_lock(&v->domain->arch.rexec_lock);
> +
> +    level = v->arch.rexec_level;
> +
> +    /*
> +     * Step 1: Make sure someone else didn't get to start an
> +     * instruction re-execution.
> +     */
> +    for_each_vcpu ( v->domain, a )
> +    {
> +        /* We're interested in pausing all the VCPUs except self/v. */

But there's no pause done here AFAICT?

> +        if ( a == v )
> +            continue;
> +
> +        /*
> +         * Check if "a" started an instruction re-execution. If so,
> +         * return success, as we'll re-execute our instruction later.
> +         */
> +        if ( a->arch.rexec_level != 0 )
> +        {
> +            /* We should be paused. */
> +            ret = 0;
> +            leave = 1;
> +            goto release_and_exit;
> +        }
> +    }
> +
> +    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
> +    if ( level >= REEXECUTION_MAX_DEPTH )
> +    {
> +        ret = -1;

Please return a proper errno value here.

> +        leave = 1;
> +        goto release_and_exit;
> +    }
> +
> +    /*
> +     * Step 2: Pause all the VCPUs, except self. Note that we have to do
> +     * this only if we're at nesting level 0; if we're at a higher level
> +     * of nested re-exec, the vcpus are already paused.
> +     */
> +    if ( level == 0 )
> +    {
> +        for_each_vcpu ( v->domain, a )
> +        {
> +            /* We're interested in pausing all the VCPUs except self/v. */
> +            if ( a == v )
> +                continue;
> +
> +            /* Pause, NO SYNC! We're gonna do our own syncing. */
> +            vcpu_pause_nosync(a);
> +        }
> +
> +        /*
> +         * Step 3: Wait for all the paused VCPUs to actually leave the VMX
> +         * non-root realm and enter VMX root.
> +         */
> +        for_each_vcpu ( v->domain, a )
> +        {
> +            /* We're interested in pausing all the VCPUs except self/v. */

It's the 3rd time this comment has been repeated.

> +            if ( a == v )
> +                continue;
> +
> +            /* Pause, synced. */
> +            while ( !a->arch.in_host )

Why not use a->is_running as a way to know whether the vCPU is
running?

I think the logic of using vcpu_pause and expecting the running vcpu
to take a vmexit and thus set in_host is wrong because a vcpu that
wasn't running when vcpu_pause_nosync is called won't get scheduled
anymore, thus not taking a vmexit, and this function will lock up.

I don't think you need the in_host boolean at all.
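
I.e. (untested, just to illustrate the idea):

            /* Wait for the remote vCPU to actually be descheduled. */
            while ( a->is_running )
                cpu_relax();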

> +                cpu_relax();

Is this really better than using vcpu_pause?

I assume this is done to avoid waiting on each vcpu, and instead doing
it here likely means less wait time?

> +        }
> +    }
> +
> +    /* Update the rexecution nexting level. */
> +    v->arch.rexec_level++;
> +
> +release_and_exit:
> +    spin_unlock(&v->domain->arch.rexec_lock);
> +
> +    /* If we've got errors so far, return. */
> +    if ( leave )
> +        return ret;
> +
> +    /*
> +     * Step 4: Save the current gpa & old access rights. Also, check if this
> +     * is a "double-fault" on the exact same GPA, in which case, we will
> +     * promote the rights of this particular GPA, and try again.
> +     */
> +    for ( i = 0; i < level; i++ )
> +    {
> +        if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) ==
> +             (gpa >> PAGE_SHIFT) )
> +        {
> +            /* This GPA is already in the queue. */
> +            found = 1;
> +
> +            switch (v->arch.rexec_context[i].cur_access) {
> +                case XENMEM_access_r: r = 1; break;
> +                case XENMEM_access_w: w = 1; break;
> +                case XENMEM_access_x: x = 1; break;
> +                case XENMEM_access_rx: r = x = 1; break;
> +                case XENMEM_access_wx: w = x = 1;  break;
> +                case XENMEM_access_rw: r = w = 1; break;
> +                case XENMEM_access_rwx: r = w = x = 1; break;
> +                default: break; /* We don't care about any other case. */

The above chunk needs proper formatting, and I would argue that you
need to add an assert to the default case at least?
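
Something along these lines, for instance (assuming the default case
really is unreachable):

            switch ( v->arch.rexec_context[i].cur_access )
            {
            case XENMEM_access_r:   r = 1; break;
            case XENMEM_access_w:   w = 1; break;
            case XENMEM_access_x:   x = 1; break;
            case XENMEM_access_rx:  r = x = 1; break;
            case XENMEM_access_wx:  w = x = 1; break;
            case XENMEM_access_rw:  r = w = 1; break;
            case XENMEM_access_rwx: r = w = x = 1; break;

            default:
                ASSERT_UNREACHABLE();
                break;
            }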

> +            }
> +        }
> +    }
> +
> +    /*
> +     * Get the current EPT access rights. They will be restored when we're done.
> +     * Note that the restoration is done in reverse-order, in order to ensure
> +     * that the original access rights are restore correctly. Otherwise, we may
> +     * restore whatever access rights were modified by another re-execution
> +     * request, and that would be bad.
> +     */
> +    if ( p2m_get_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
> +                            &old_access, altp2m_idx) != 0 )
> +        return -1;
> +
> +    v->arch.rexec_context[level].gpa = gpa;
> +    v->arch.rexec_context[level].old_access = old_access;
> +    v->arch.rexec_context[level].old_single_step = v->arch.hvm.single_step;
> +
> +    /*
> +     * Step 5: Make the GPA with the required access, so we can re-execute
> +     * the instruction.
> +     */
> +    switch ( required_access )
> +    {
> +        case XENMEM_access_r: r = 1; break;
> +        case XENMEM_access_w: w = 1; break;
> +        case XENMEM_access_x: x = 1; break;
> +        case XENMEM_access_rx: r = x = 1; break;
> +        case XENMEM_access_wx: w = x = 1;  break;
> +        case XENMEM_access_rw: r = w = 1; break;
> +        case XENMEM_access_rwx: r = w = x = 1; break;
> +        default: break; /* We don't care about any other case. */
> +    }
> +
> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> +    if ( r == 0 && w == 0 && x == 0 )
> +        new_access = XENMEM_access_n;
> +    else if ( r == 0 && w == 0 && x == 1 )
> +        new_access = XENMEM_access_x;
> +    else if ( r == 0 && w == 1 && x == 0 )
> +        new_access = XENMEM_access_w;
> +    else if ( r == 0 && w == 1 && x == 1 )
> +        new_access = XENMEM_access_wx;
> +    else if ( r == 1 && w == 0 && x == 0 )
> +        new_access = XENMEM_access_r;
> +    else if ( r == 1 && w == 0 && x == 1 )
> +        new_access = XENMEM_access_rx;
> +    else if ( r == 1 && w == 1 && x == 0 )
> +        new_access = XENMEM_access_rw;
> +    else if ( r == 1 && w == 1 && x == 1 )
> +        new_access = XENMEM_access_rwx;
> +    else
> +        new_access = required_access; /* Should never get here. */

There seems to be a lot of translation from xenmem_access_t to bool
fields and then to xenmem_access_t again. Can't you just avoid the
booleans?

> +
> +    /* And save the current access rights. */
> +    v->arch.rexec_context[level].cur_access = new_access;
> +
> +    /* Apply the changes inside the EPT. */
> +    if ( p2m_set_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
> +                            1, 0, MEMOP_CMD_MASK, new_access,
> +                            altp2m_idx) != 0 )
> +        return -1;

Again you should return proper errno values.

> +
> +    /*
> +     * Step 6: Reconfigure the VMCS, so it suits our needs. We want a
> +     * VM-exit to be generated after the instruction has been
> +     * successfully re-executed.
> +     */
> +    if ( level == 0 )
> +        v->arch.hvm.single_step = 1;
> +
> +    /* Step 8: We should be done! */
> +
> +    return ret;
> +}
> +
> +static int vmx_stop_reexecute_instruction(struct vcpu *v)
> +{
> +    int ret = 0, i;
> +    struct vcpu *a;
> +    unsigned int altp2m_idx =
> +        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
> +
> +    if ( v->arch.rexec_level == 0 )
> +        return 0;
> +
> +    /* Step 1: Restore original EPT access rights for each GPA. */
> +    for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> +    {
> +        if ( v->arch.rexec_context[i].gpa != mfn_x(INVALID_MFN) &&
> +             p2m_set_mem_access(v->domain,
> +                                _gfn(v->arch.rexec_context[i].gpa >> PAGE_SHIFT),
> +                                1, 0, MEMOP_CMD_MASK,
> +                                v->arch.rexec_context[i].old_access,
> +                                altp2m_idx) != 0 )
> +        {
> +            ret = -1;
> +            return ret;
> +        }
> +
> +        v->arch.rexec_context[i].gpa = 0;
> +        v->arch.hvm.single_step = v->arch.rexec_context[i].old_single_step;
> +    }
> +
> +    spin_lock(&v->domain->arch.rexec_lock);
> +
> +    /* Step 2: Reset the nesting level to zero. */
> +    v->arch.rexec_level = 0;
> +
> +    /* Step 3: Resume all other VCPUs. */
> +    for_each_vcpu ( v->domain, a )
> +    {
> +        if ( a == v )
> +            continue;
> +
> +        /* Unpause the VCPU. */
> +        vcpu_unpause(a);
> +    }
> +
> +    /*
> +     * Step 4: Remove the MONITOR trap flag.
> +     * - this is already done when handling the exit.
> +     */
> +
> +    /* Step 5: We're done! */
> +
> +    spin_unlock(&v->domain->arch.rexec_lock);
> +
> +    return ret;
> +}
> +
>  static struct hvm_function_table __initdata vmx_function_table = {
>      .name                 = "VMX",
>      .cpu_up_prepare       = vmx_cpu_up_prepare,
> @@ -2324,6 +2573,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
>      .invlpg               = vmx_invlpg,
>      .cpu_up               = vmx_cpu_up,
>      .cpu_down             = vmx_cpu_down,
> +    .start_reexecute_instruction = vmx_start_reexecute_instruction,
>      .wbinvd_intercept     = vmx_wbinvd_intercept,
>      .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
>      .msr_read_intercept   = vmx_msr_read_intercept,
> @@ -3590,6 +3840,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      unsigned int vector = 0, mode;
>      struct vcpu *v = current;
>  
> +    v->arch.in_host = 1;
> +
>      __vmread(GUEST_RIP,    &regs->rip);
>      __vmread(GUEST_RSP,    &regs->rsp);
>      __vmread(GUEST_RFLAGS, &regs->rflags);
> @@ -4112,6 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      case EXIT_REASON_MONITOR_TRAP_FLAG:
>          v->arch.hvm.vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
>          vmx_update_cpu_exec_control(v);
> +        vmx_stop_reexecute_instruction(v);
>          if ( v->arch.hvm.single_step )
>          {
>              hvm_monitor_debug(regs->rip,
> @@ -4330,6 +4583,8 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
>      if ( unlikely(curr->arch.hvm.vmx.lbr_flags & LBR_FIXUP_MASK) )
>          lbr_fixup();
>  
> +    curr->arch.in_host = 0;
> +
>      HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
>  
>      __vmwrite(GUEST_RIP,    regs->rip);
> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
> index 2f1295e56a..5ae3a61b5c 100644
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -212,10 +212,11 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>      }
>      if ( vm_event_check_ring(d->vm_event_monitor) &&
>           d->arch.monitor.inguest_pagefault_disabled &&
> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> +         npfec.kind != npfec_kind_with_gla &&
> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>      {
> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> -
> +        v->arch.vm_event->emulate_flags = 0;
> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>          return true;
>      }

Don't you need to fallback to using hvm_emulate_one_vm_event if
start_reexecute_instruction is not available?
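
I.e. something like (untested sketch):

        v->arch.vm_event->emulate_flags = 0;

        if ( hvm_funcs.start_reexecute_instruction )
            hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
        else
            hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op,
                                     X86_EVENT_NO_EC);

        return true;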

>  
> @@ -226,6 +227,7 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>          *req_ptr = req;
>  
>          req->reason = VM_EVENT_REASON_MEM_ACCESS;
> +

Unrelated change?

>          req->u.mem_access.gfn = gfn_x(gfn);
>          req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
>  
> @@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>      p2m_access_t a;
>      unsigned long gfn_l;
>      long rc = 0;
> +    struct vcpu *v;
> +    int i;
>  
>      /* altp2m view 0 is treated as the hostp2m */
>  #ifdef CONFIG_HVM
> @@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>          if ( rc )
>              break;
>  
> +        for_each_vcpu(d, v)
> +        {
> +            if ( !v->arch.rexec_level )
> +                continue;
> +
> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )

Is there any reason this has to be done backwards?

If you do it from 0 to v->arch.rexec_level you could use an unsigned
int as the index.

> +                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )

PFN_DOWN instead of the right shift, and maybe use gfn_eq instead of
converting gfn.

> +                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);

This is a guest physical address (given the field name), but you are
using the invalid machine frame number in order to set it. You likely
want to use INVALID_PADDR or gfn_x(INVALID_GFN) << PAGE_SHIFT.
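
E.g.:

                    v->arch.rexec_context[i].gpa =
                        gfn_x(INVALID_GFN) << PAGE_SHIFT;

(with the check in vmx_stop_reexecute_instruction() adjusted to compare
against the same value).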

> +        }
> +
>          /* Check for continuation if it's not the last iteration. */
>          if ( nr > ++start && !(start & mask) && hypercall_preempt_check() )
>          {
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index 277f99f633..dbb68f108a 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -438,6 +438,8 @@ struct arch_domain
>  
>      /* Emulated devices enabled bitmap. */
>      uint32_t emulation_flags;
> +
> +    spinlock_t rexec_lock;
>  } __cacheline_aligned;
>  
>  #ifdef CONFIG_HVM
> @@ -629,6 +631,22 @@ struct arch_vcpu
>      /* A secondary copy of the vcpu time info. */
>      XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>  
> +#define REEXECUTION_MAX_DEPTH 8
> +    struct rexec_context_t {
> +        unsigned long gpa;
> +        xenmem_access_t old_access;
> +        xenmem_access_t cur_access;
> +        bool_t old_single_step;

bool please

> +    } rexec_context[REEXECUTION_MAX_DEPTH];

This is a fairly big amount of data that's only used if vm events are
enabled, could this be allocated on a per-guest basis?

> +
> +    int rexec_level;
> +
> +    /*
> +     *  Will be true when the vcpu is in VMX root,
> +     * false when it is not.
> +     */
> +    bool_t in_host;

bool.

> +
>      struct arch_vm_event *vm_event;
>  
>      struct vcpu_msrs *msrs;
> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
> index 3d3250dff0..1f5d43a98d 100644
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -167,6 +167,8 @@ struct hvm_function_table {
>  
>      int  (*cpu_up)(void);
>      void (*cpu_down)(void);
> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
> +                                        xenmem_access_t required_access);

I would name this reexecute_instruction, I don't think the start_
prefix adds any value to the handler.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
@ 2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 14:26     ` Jan Beulich
  2018-11-19 15:08     ` Roger Pau Monné
  2018-11-19 13:33   ` Jan Beulich
  2018-11-21 18:55   ` Razvan Cojocaru
  2 siblings, 2 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-19 13:30 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>> +    if ( r == 0 && w == 0 && x == 0 )
>> +        new_access = XENMEM_access_n;
>> +    else if ( r == 0 && w == 0 && x == 1 )
>> +        new_access = XENMEM_access_x;
>> +    else if ( r == 0 && w == 1 && x == 0 )
>> +        new_access = XENMEM_access_w;
>> +    else if ( r == 0 && w == 1 && x == 1 )
>> +        new_access = XENMEM_access_wx;
>> +    else if ( r == 1 && w == 0 && x == 0 )
>> +        new_access = XENMEM_access_r;
>> +    else if ( r == 1 && w == 0 && x == 1 )
>> +        new_access = XENMEM_access_rx;
>> +    else if ( r == 1 && w == 1 && x == 0 )
>> +        new_access = XENMEM_access_rw;
>> +    else if ( r == 1 && w == 1 && x == 1 )
>> +        new_access = XENMEM_access_rwx;
>> +    else
>> +        new_access = required_access; /* Should never get here. */
> 
> There seems to be a lot of translation from xenmem_access_t to bool
> fields and then to xenmem_access_t again. Can't you just avoid the
> booleans?

The translation is done because the rights are cumulative and I think 
this is the clearest way to do it.


>>       if ( vm_event_check_ring(d->vm_event_monitor) &&
>>            d->arch.monitor.inguest_pagefault_disabled &&
>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>> +         npfec.kind != npfec_kind_with_gla &&
>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>       {
>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>> -
>> +        v->arch.vm_event->emulate_flags = 0;
>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>           return true;
>>       }
> 
> Don't you need to fallback to using hvm_emulate_one_vm_event if
> start_reexecute_instruction is not available?

Falling back to hvm_emulate_one_vm_event can result in losing events.

>> +        for_each_vcpu(d, v)
>> +        {
>> +            if ( !v->arch.rexec_level )
>> +                continue;
>> +
>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> 
> Is there any reason this has to be done backwards?
> 
> If you do it from 0 to v->arch.rexec_level you could use an unsigned
> int as the index.

This is done backwards because of the corresponding code in 
vmx_stop_reexecute_instruction(), but here it can be turned the other way 
if you insist on i being unsigned.

>> +#define REEXECUTION_MAX_DEPTH 8
>> +    struct rexec_context_t {
>> +        unsigned long gpa;
>> +        xenmem_access_t old_access;
>> +        xenmem_access_t cur_access;
>> +        bool_t old_single_step;
> 
> bool please
> 
>> +    } rexec_context[REEXECUTION_MAX_DEPTH];
> 
> This is fairly big amount of data that's only used if vm events are
> enabled, could this be allocated on a per-guest basis?

Yes, this can be moved to d->arch.monitor in the next version.

> 
>> +
>> +    int rexec_level;
>> +
>> +    /*
>> +     *  Will be true when the vcpu is in VMX root,
>> +     * false when it is not.
>> +     */
>> +    bool_t in_host;
> 
> bool.
> 
>> +
>>       struct arch_vm_event *vm_event;
>>   
>>       struct vcpu_msrs *msrs;
>> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
>> index 3d3250dff0..1f5d43a98d 100644
>> --- a/xen/include/asm-x86/hvm/hvm.h
>> +++ b/xen/include/asm-x86/hvm/hvm.h
>> @@ -167,6 +167,8 @@ struct hvm_function_table {
>>   
>>       int  (*cpu_up)(void);
>>       void (*cpu_down)(void);
>> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
>> +                                        xenmem_access_t required_access);
> 
> I would name this reexecute_instruction, I don't think the start_
> prefix adds any value to the handler.

Sure, I will drop the start_ prefix in the next version.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
@ 2018-11-19 13:33   ` Jan Beulich
  2018-11-21 18:55   ` Razvan Cojocaru
  2 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-19 13:33 UTC (permalink / raw)
  To: aisaila, Roger Pau Monne
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar

>>> On 16.11.18 at 18:04, <roger.pau@citrix.com> wrote:
> On Fri, Nov 16, 2018 at 10:06:36AM +0000, Alexandru Stefan ISAILA wrote:
>> @@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>>      p2m_access_t a;
>>      unsigned long gfn_l;
>>      long rc = 0;
>> +    struct vcpu *v;
>> +    int i;
>>  
>>      /* altp2m view 0 is treated as the hostp2m */
>>  #ifdef CONFIG_HVM
>> @@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>>          if ( rc )
>>              break;
>>  
>> +        for_each_vcpu(d, v)
>> +        {
>> +            if ( !v->arch.rexec_level )
>> +                continue;
>> +
>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> 
> Is there any reason this has to be done backwards?
> 
> If you do it from 0 to v->arch.rexec_level you could use an unsigned
> int as the index.

And even if there's need for this going backwards the variable should
still be unsigned (using "for ( i = v->arch.rexec_level; i--; )" then,
presumably allowing the if() above to be dropped altogether).
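
I.e. roughly (untested, with i being unsigned int):

        for_each_vcpu(d, v)
            for ( i = v->arch.rexec_level; i--; )
                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);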

>> +                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
> 
> PFN_DOWN instead of the right shift, and maybe use gfn_eq instead of
> converting gfn.

ITYM gaddr_to_gfn() instead of PFN_DOWN.
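
I.e. something like (untested):

                if ( gfn_eq(gaddr_to_gfn(v->arch.rexec_context[i].gpa), gfn) )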

>> --- a/xen/include/asm-x86/hvm/hvm.h
>> +++ b/xen/include/asm-x86/hvm/hvm.h
>> @@ -167,6 +167,8 @@ struct hvm_function_table {
>>  
>>      int  (*cpu_up)(void);
>>      void (*cpu_down)(void);
>> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
>> +                                        xenmem_access_t required_access);
> 
> I would name this reexecute_instruction, I don't think the start_
> prefix adds any value to the handler.

Or even just rexec_insn, to cut down on name length. I also
dislike the insertion point: This should live amidst the less "core"
hooks further down - there's already a block of three
introspection related hooks where this one would likely be a
good fit.

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
@ 2018-11-19 14:26     ` Jan Beulich
  2018-11-19 15:08     ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-19 14:26 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 19.11.18 at 14:30, <aisaila@bitdefender.com> wrote:
>> > +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>> +    if ( r == 0 && w == 0 && x == 0 )
>>> +        new_access = XENMEM_access_n;
>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>> +        new_access = XENMEM_access_x;
>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>> +        new_access = XENMEM_access_w;
>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>> +        new_access = XENMEM_access_wx;
>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>> +        new_access = XENMEM_access_r;
>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>> +        new_access = XENMEM_access_rx;
>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>> +        new_access = XENMEM_access_rw;
>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>> +        new_access = XENMEM_access_rwx;
>>> +    else
>>> +        new_access = required_access; /* Should never get here. */
>> 
>> There seems to be a lot of translation from xenmem_access_t to bool
>> fields and then to xenmem_access_t again. Can't you just avoid the
>> booleans?
> 
> The translation is done because the rights are cumulative and I think 
> this is the clear way to do this.

But then at the very least don't use == 0 and == 1, but
simple boolean tests.
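
I.e.:

    if ( !r && !w && !x )
        new_access = XENMEM_access_n;
    else if ( !r && !w && x )
        new_access = XENMEM_access_x;
    /* ... and so on for the remaining combinations. */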

>>>       if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>            d->arch.monitor.inguest_pagefault_disabled &&
>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>> +         npfec.kind != npfec_kind_with_gla &&
>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>       {
>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>> -
>>> +        v->arch.vm_event->emulate_flags = 0;
>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>           return true;
>>>       }
>> 
>> Don't you need to fallback to using hvm_emulate_one_vm_event if
>> start_reexecute_instruction is not available?
> 
> Fallback with hvm_emulate_one_vm_event can result in loosing events.

But is not doing anything at all going to result in even worse a
situation?

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 14:26     ` Jan Beulich
@ 2018-11-19 15:08     ` Roger Pau Monné
  2018-11-19 15:56       ` Alexandru Stefan ISAILA
  1 sibling, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-19 15:08 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
> >> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> >> +    if ( r == 0 && w == 0 && x == 0 )
> >> +        new_access = XENMEM_access_n;
> >> +    else if ( r == 0 && w == 0 && x == 1 )
> >> +        new_access = XENMEM_access_x;
> >> +    else if ( r == 0 && w == 1 && x == 0 )
> >> +        new_access = XENMEM_access_w;
> >> +    else if ( r == 0 && w == 1 && x == 1 )
> >> +        new_access = XENMEM_access_wx;
> >> +    else if ( r == 1 && w == 0 && x == 0 )
> >> +        new_access = XENMEM_access_r;
> >> +    else if ( r == 1 && w == 0 && x == 1 )
> >> +        new_access = XENMEM_access_rx;
> >> +    else if ( r == 1 && w == 1 && x == 0 )
> >> +        new_access = XENMEM_access_rw;
> >> +    else if ( r == 1 && w == 1 && x == 1 )
> >> +        new_access = XENMEM_access_rwx;
> >> +    else
> >> +        new_access = required_access; /* Should never get here. */
> > 
> > There seems to be a lot of translation from xenmem_access_t to bool
> > fields and then to xenmem_access_t again. Can't you just avoid the
> > booleans?
> 
> The translation is done because the rights are cumulative and I think 
> this is the clear way to do this.

So the switch converts required_access using the following relation:

_r   -> r = 1 w = 0 x = 0
_w   -> r = 0 w = 1 x = 0
_x   -> r = 0 w = 0 x = 1
_rx  -> r = 1 w = 0 x = 1
_wx  -> r = 0 w = 1 x = 1
_rw  -> r = 1 w = 1 x = 0
_rwx -> r = 1 w = 1 x = 1

Then the if below performs the following transformation:

r = 0 w = 0 x = 0 -> _n
r = 1 w = 0 x = 0 -> _r
r = 0 w = 1 x = 0 -> _w
r = 0 w = 0 x = 1 -> _x
r = 1 w = 1 x = 0 -> _rw
r = 0 w = 1 x = 1 -> _wx
r = 1 w = 0 x = 1 -> _rx
r = 1 w = 1 x = 1 -> _rwx

I'm not sure I understand this chunk of code, because you end up
getting exactly the same type that you have as the input, and a type
not listed here is just silently passed through, so I don't see the
point in doing this transformation.

> 
> >>       if ( vm_event_check_ring(d->vm_event_monitor) &&
> >>            d->arch.monitor.inguest_pagefault_disabled &&
> >> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> >> +         npfec.kind != npfec_kind_with_gla &&
> >> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
> >>       {
> >> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> >> -
> >> +        v->arch.vm_event->emulate_flags = 0;
> >> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
> >>           return true;
> >>       }
> > 
> > Don't you need to fallback to using hvm_emulate_one_vm_event if
> > start_reexecute_instruction is not available?
> 
> Fallback with hvm_emulate_one_vm_event can result in loosing events.

But by changing this here unconditionally you are removing this
functionality on AMD hardware, which it used to have before by making
use of hvm_emulate_one_vm_event.

I think this needs to at least be written in the commit message.

> >> +        for_each_vcpu(d, v)
> >> +        {
> >> +            if ( !v->arch.rexec_level )
> >> +                continue;
> >> +
> >> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> > 
> > Is there any reason this has to be done backwards?
> > 
> > If you do it from 0 to v->arch.rexec_level you could use an unsigned
> > int as the index.
> 
> This is done backwards because of the corresponding code in 
> vmx_stop_reexecute_instruction() but here it can be turned the other way 
> if you insist on i to be unsigned.

Yes, Jan has also suggested a way to make i unsigned while keeping the
loop backwards, but I don't see the point of performing the loop
backwards if there's no need.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 15:08     ` Roger Pau Monné
@ 2018-11-19 15:56       ` Alexandru Stefan ISAILA
  2018-11-21  9:56         ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-19 15:56 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 19.11.2018 17:08, Roger Pau Monné wrote:
> On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
>>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>>> +    if ( r == 0 && w == 0 && x == 0 )
>>>> +        new_access = XENMEM_access_n;
>>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>>> +        new_access = XENMEM_access_x;
>>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>>> +        new_access = XENMEM_access_w;
>>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>>> +        new_access = XENMEM_access_wx;
>>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>>> +        new_access = XENMEM_access_r;
>>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>>> +        new_access = XENMEM_access_rx;
>>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>>> +        new_access = XENMEM_access_rw;
>>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>>> +        new_access = XENMEM_access_rwx;
>>>> +    else
>>>> +        new_access = required_access; /* Should never get here. */
>>>
>>> There seems to be a lot of translation from xenmem_access_t to bool
>>> fields and then to xenmem_access_t again. Can't you just avoid the
>>> booleans?
>>
>> The translation is done because the rights are cumulative and I think
>> this is the clear way to do this.
> 
> So the switch converts required_access using the following relation:
> 
> _r   -> r = 1 w = 0 x = 0
> _w   -> r = 0 w = 1 x = 0
> _x   -> r = 0 w = 0 x = 1
> _rx  -> r = 1 w = 1 x = 0
> _wx  -> r = 0 w = 1 x = 1
> _rw  -> r = 1 w = 1 x = 0
> _rwx -> r = 1 w = 1 x = 1
> 
> Then the if below performs the following transformation:
> 
> r = 0 w = 0 x = 0 -> _n
> r = 1 w = 0 x = 0 -> _r
> r = 0 w = 1 x = 0 -> _w
> r = 0 w = 0 x = 1 -> _x
> r = 1 w = 1 x = 0 -> _rw
> r = 0 w = 1 x = 1 -> _wx
> r = 1 w = 1 x = 0 -> _rw
> r = 1 w = 1 x = 1 -> _rwx
> 
> I'm not sure I understand this chunk of code, because you end up
> getting exactly the same type that you have as the input, and a type
> not listed here is just silently passed through, so I don't see the
> point in doing this transformation.

The first switch is for cur_access and sets r, w, x accordingly;
the second switch is for required_access, where further r, w, x bits are
accumulated; and then in the final if()/else chain new_access is assigned
according to the resulting r, w, x values.

> 
>>
>>>>        if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>>             d->arch.monitor.inguest_pagefault_disabled &&
>>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>> +         npfec.kind != npfec_kind_with_gla &&
>>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>>        {
>>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>>> -
>>>> +        v->arch.vm_event->emulate_flags = 0;
>>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>>            return true;
>>>>        }
>>>
>>> Don't you need to fallback to using hvm_emulate_one_vm_event if
>>> start_reexecute_instruction is not available?
>>
>> Fallback with hvm_emulate_one_vm_event can result in loosing events.
> 
> But by changing this here unconditionally you are removing this
> functionality on AMD hardware, which it used to have before by making
> use of hvm_emulate_one_vm_event.
> 
> I think this needs to at least be written in the commit message.

For AMD I could add an if ( cpu_has_svm() ) check and call 
emulate_one_vm_event. Introspection-wise, losing valuable info is not a 
good thing, because it could result in a security breach.

> 
>>>> +        for_each_vcpu(d, v)
>>>> +        {
>>>> +            if ( !v->arch.rexec_level )
>>>> +                continue;
>>>> +
>>>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
>>>
>>> Is there any reason this has to be done backwards?
>>>
>>> If you do it from 0 to v->arch.rexec_level you could use an unsigned
>>> int as the index.
>>
>> This is done backwards because of the corresponding code in
>> vmx_stop_reexecute_instruction() but here it can be turned the other way
>> if you insist on i to be unsigned.
> 
> Yes, Jan has also suggested a way to make i unsigned while keeping the
> loop backwards, but I don't see the point of performing the loop
> backwards if there's no need.
> 

There is no problem here, I can change it in the next version.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 15:56       ` Alexandru Stefan ISAILA
@ 2018-11-21  9:56         ` Roger Pau Monné
  2018-11-21 10:28           ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-21  9:56 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
> 
> 
> On 19.11.2018 17:08, Roger Pau Monné wrote:
> > On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
> >>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> >>>> +    if ( r == 0 && w == 0 && x == 0 )
> >>>> +        new_access = XENMEM_access_n;
> >>>> +    else if ( r == 0 && w == 0 && x == 1 )
> >>>> +        new_access = XENMEM_access_x;
> >>>> +    else if ( r == 0 && w == 1 && x == 0 )
> >>>> +        new_access = XENMEM_access_w;
> >>>> +    else if ( r == 0 && w == 1 && x == 1 )
> >>>> +        new_access = XENMEM_access_wx;
> >>>> +    else if ( r == 1 && w == 0 && x == 0 )
> >>>> +        new_access = XENMEM_access_r;
> >>>> +    else if ( r == 1 && w == 0 && x == 1 )
> >>>> +        new_access = XENMEM_access_rx;
> >>>> +    else if ( r == 1 && w == 1 && x == 0 )
> >>>> +        new_access = XENMEM_access_rw;
> >>>> +    else if ( r == 1 && w == 1 && x == 1 )
> >>>> +        new_access = XENMEM_access_rwx;
> >>>> +    else
> >>>> +        new_access = required_access; /* Should never get here. */
> >>>
> >>> There seems to be a lot of translation from xenmem_access_t to bool
> >>> fields and then to xenmem_access_t again. Can't you just avoid the
> >>> booleans?
> >>
> >> The translation is done because the rights are cumulative and I think
> >> this is the clear way to do this.
> > 
> > So the switch converts required_access using the following relation:
> > 
> > _r   -> r = 1 w = 0 x = 0
> > _w   -> r = 0 w = 1 x = 0
> > _x   -> r = 0 w = 0 x = 1
> > _rx  -> r = 1 w = 0 x = 1
> > _wx  -> r = 0 w = 1 x = 1
> > _rw  -> r = 1 w = 1 x = 0
> > _rwx -> r = 1 w = 1 x = 1
> > 
> > Then the if below performs the following transformation:
> > 
> > r = 0 w = 0 x = 0 -> _n
> > r = 1 w = 0 x = 0 -> _r
> > r = 0 w = 1 x = 0 -> _w
> > r = 0 w = 0 x = 1 -> _x
> > r = 1 w = 1 x = 0 -> _rw
> > r = 0 w = 1 x = 1 -> _wx
> > r = 1 w = 0 x = 1 -> _rx
> > r = 1 w = 1 x = 1 -> _rwx
> > 
> > I'm not sure I understand this chunk of code, because you end up
> > getting exactly the same type that you have as the input, and a type
> > not listed here is just silently passed through, so I don't see the
> > point in doing this transformation.
> 
> The first switch is for cur_access and sets r, w, x accordingly; the
> second switch is for required_access, where the r, w, x bits are
> accumulated; then, in the final if() chain, new_access is assigned
> according to the resulting r, w, x values.

I would move the code that converts xenmem_access_t into a separate
helper (as it's used in two different places), and use a bitmap
instead of 3 boolean variables, so you can do:

void convert_access(xenmem_access_t *access, unsigned int *attr)

And don't need to repeat the switch in two different places.
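As a rough illustration only -- the ACCESS_* bit names are invented for
the example, and I've split it into two small helpers rather than the
single convert_access() prototype above:

/* Illustrative flag names only. */
#define ACCESS_R (1u << 0)
#define ACCESS_W (1u << 1)
#define ACCESS_X (1u << 2)

static void xenmem_access_to_bits(xenmem_access_t access, unsigned int *attr)
{
    switch ( access )
    {
    case XENMEM_access_r:   *attr |= ACCESS_R; break;
    case XENMEM_access_w:   *attr |= ACCESS_W; break;
    case XENMEM_access_x:   *attr |= ACCESS_X; break;
    case XENMEM_access_rw:  *attr |= ACCESS_R | ACCESS_W; break;
    case XENMEM_access_rx:  *attr |= ACCESS_R | ACCESS_X; break;
    case XENMEM_access_wx:  *attr |= ACCESS_W | ACCESS_X; break;
    case XENMEM_access_rwx: *attr |= ACCESS_R | ACCESS_W | ACCESS_X; break;
    default:                break;
    }
}

static xenmem_access_t bits_to_xenmem_access(unsigned int attr)
{
    static const xenmem_access_t map[] = {
        [0]                              = XENMEM_access_n,
        [ACCESS_R]                       = XENMEM_access_r,
        [ACCESS_W]                       = XENMEM_access_w,
        [ACCESS_X]                       = XENMEM_access_x,
        [ACCESS_R | ACCESS_W]            = XENMEM_access_rw,
        [ACCESS_R | ACCESS_X]            = XENMEM_access_rx,
        [ACCESS_W | ACCESS_X]            = XENMEM_access_wx,
        [ACCESS_R | ACCESS_W | ACCESS_X] = XENMEM_access_rwx,
    };

    return map[attr & (ACCESS_R | ACCESS_W | ACCESS_X)];
}

Both call sites would then just OR bits into the same attr and convert
back to a xenmem_access_t once at the end.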

> > 
> >>
> >>>>        if ( vm_event_check_ring(d->vm_event_monitor) &&
> >>>>             d->arch.monitor.inguest_pagefault_disabled &&
> >>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> >>>> +         npfec.kind != npfec_kind_with_gla &&
> >>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
> >>>>        {
> >>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> >>>> -
> >>>> +        v->arch.vm_event->emulate_flags = 0;
> >>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
> >>>>            return true;
> >>>>        }
> >>>
> >>> Don't you need to fall back to using hvm_emulate_one_vm_event if
> >>> start_reexecute_instruction is not available?
> >>
> >> Falling back to hvm_emulate_one_vm_event can result in losing events.
> > 
> > But by changing this here unconditionally you are removing this
> > functionality on AMD hardware, which it used to have before by making
> > use of hvm_emulate_one_vm_event.
> > 
> > I think this needs to at least be written in the commit message.
> 
> For AMD I could add if (cpu_has_svm()) and call emulate_one_vm_event. 

I would just use hvm_emulate_one_vm_event if
hvm_funcs.start_reexecute_instruction is unset, or else an explanation
needs to be added to the commit message about why
hvm_emulate_one_vm_event is not suitable.

Also, after looking at the code I'm not sure I see why this needs to
be VMX specific, AFAICT it doesn't directly call any VMX functions?

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21  9:56         ` Roger Pau Monné
@ 2018-11-21 10:28           ` Alexandru Stefan ISAILA
  2018-11-21 11:41             ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-21 10:28 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 21.11.2018 11:56, Roger Pau Monné wrote:
> On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
>>
>>
>> On 19.11.2018 17:08, Roger Pau Monné wrote:
>>> On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
>>>>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>>>>> +    if ( r == 0 && w == 0 && x == 0 )
>>>>>> +        new_access = XENMEM_access_n;
>>>>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>>>>> +        new_access = XENMEM_access_x;
>>>>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>>>>> +        new_access = XENMEM_access_w;
>>>>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>>>>> +        new_access = XENMEM_access_wx;
>>>>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>>>>> +        new_access = XENMEM_access_r;
>>>>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>>>>> +        new_access = XENMEM_access_rx;
>>>>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>>>>> +        new_access = XENMEM_access_rw;
>>>>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>>>>> +        new_access = XENMEM_access_rwx;
>>>>>> +    else
>>>>>> +        new_access = required_access; /* Should never get here. */
>>>>>
>>>>> There seems to be a lot of translation from xenmem_access_t to bool
>>>>> fields and then to xenmem_access_t again. Can't you just avoid the
>>>>> booleans?
>>>>
>>>> The translation is done because the rights are cumulative and I think
>>>> this is the clear way to do this.
>>>
>>> So the switch converts required_access using the following relation:
>>>
>>> _r   -> r = 1 w = 0 x = 0
>>> _w   -> r = 0 w = 1 x = 0
>>> _x   -> r = 0 w = 0 x = 1
>>> _rx  -> r = 1 w = 0 x = 1
>>> _wx  -> r = 0 w = 1 x = 1
>>> _rw  -> r = 1 w = 1 x = 0
>>> _rwx -> r = 1 w = 1 x = 1
>>>
>>> Then the if below performs the following transformation:
>>>
>>> r = 0 w = 0 x = 0 -> _n
>>> r = 1 w = 0 x = 0 -> _r
>>> r = 0 w = 1 x = 0 -> _w
>>> r = 0 w = 0 x = 1 -> _x
>>> r = 1 w = 1 x = 0 -> _rw
>>> r = 0 w = 1 x = 1 -> _wx
>>> r = 1 w = 0 x = 1 -> _rx
>>> r = 1 w = 1 x = 1 -> _rwx
>>>
>>> I'm not sure I understand this chunk of code, because you end up
>>> getting exactly the same type that you have as the input, and a type
>>> not listed here is just silently passed through, so I don't see the
>>> point in doing this transformation.
>>
>> The first switch is for cur_access and sets r, w, x accordingly; the
>> second switch is for required_access, where the r, w, x bits are
>> accumulated; then, in the final if() chain, new_access is assigned
>> according to the resulting r, w, x values.
> 
> I would move the code that converts xenmem_access_t into a separate
> helper (as it's used in two different places), and use a bitmap
> instead of 3 boolean variables, so you can do:
> 
> void convert_access(xenmem_access_t *access, unsigned int *attr)
> 
> And don't need to repeat the switch in two different places.

This is a good idea, and with it I can also remove the new_access 
assignment.

> 
>>>
>>>>
>>>>>>         if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>>>>              d->arch.monitor.inguest_pagefault_disabled &&
>>>>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>>>> +         npfec.kind != npfec_kind_with_gla &&
>>>>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>>>>         {
>>>>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>>>>> -
>>>>>> +        v->arch.vm_event->emulate_flags = 0;
>>>>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>>>>             return true;
>>>>>>         }
>>>>>
>>>>> Don't you need to fall back to using hvm_emulate_one_vm_event if
>>>>> start_reexecute_instruction is not available?
>>>>
>>>> Falling back to hvm_emulate_one_vm_event can result in losing events.
>>>
>>> But by changing this here unconditionally you are removing this
>>> functionality on AMD hardware, which it used to have before by making
>>> use of hvm_emulate_one_vm_event.
>>>
>>> I think this needs to at least be written in the commit message.
>>
>> For AMD I could add if (cpu_has_svm()) and call emulate_one_vm_event.
> 
> I would just use hvm_emulate_one_vm_event if
> hvm_funcs.start_reexecute_instruction is unset, or else an explanation
> needs to be added to the commit message about why
> hvm_emulate_one_vm_event is not suitable.

Yes, that is what I was about to add in v2. I will add a note in the 
commit message as well.
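Roughly like this (just an untested sketch of the v2 hunk):

        if ( vm_event_check_ring(d->vm_event_monitor) &&
             d->arch.monitor.inguest_pagefault_disabled &&
             npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
        {
            if ( hvm_funcs.start_reexecute_instruction )
            {
                v->arch.vm_event->emulate_flags = 0;
                hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
            }
            else
                hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op,
                                         X86_EVENT_NO_EC);

            return true;
        }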


> Also, after looking at the code I'm not sure I see why this needs to
> be VMX specific, AFAICT it doesn't directly call any VMX functions?
> 

It is VMX specific because SVM does not have single step. We talked 
about this in the past and it turned out that it was too much trouble 
to make a custom single step.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 10:28           ` Alexandru Stefan ISAILA
@ 2018-11-21 11:41             ` Roger Pau Monné
  2018-11-21 12:00               ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-21 11:41 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Wed, Nov 21, 2018 at 10:28:18AM +0000, Alexandru Stefan ISAILA wrote:
> 
> 
> On 21.11.2018 11:56, Roger Pau Monné wrote:
> > On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
> >> On 19.11.2018 17:08, Roger Pau Monné wrote:
> > Also, after looking at the code I'm not sure I see why this needs to
> > be VMX specific, AFAICT it doesn't directly call any VMX functions?
> > 
> 
> It is VMX specific because SVM does not have single step. We talked 
> about this in the past and it turned out that it was too much trouble 
> to make a custom single step.

I still think this shouldn't be VMX specific, and you should just
return -EOPNOTSUPP if single stepping is not supported, just like
hvm_debug_op does. In fact I'm missing a helper to set single
stepping, which would be the right place to return -EOPNOTSUPP.

Then your rexec wouldn't need to know anything about the hardware and
would just attempt to set single stepping, failing if it cannot be
enabled.
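Something along these lines (only a sketch -- the helper name is made
up, and the flag it sets should be whatever hvm_debug_op() uses):

static int hvm_enable_singlestep(struct vcpu *v)
{
    if ( !hvm_is_singlestep_supported() )
        return -EOPNOTSUPP;

    /* Same flag hvm_debug_op() flips for XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON. */
    v->arch.hvm.single_step = true;

    return 0;
}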

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 11:41             ` Roger Pau Monné
@ 2018-11-21 12:00               ` Alexandru Stefan ISAILA
  0 siblings, 0 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-21 12:00 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 21.11.2018 13:41, Roger Pau Monné wrote:
> On Wed, Nov 21, 2018 at 10:28:18AM +0000, Alexandru Stefan ISAILA wrote:
>>
>>
>> On 21.11.2018 11:56, Roger Pau Monné wrote:
>>> On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
>>>> On 19.11.2018 17:08, Roger Pau Monné wrote:
>>> Also, after looking at the code I'm not sure I see why this needs to
>>> be VMX specific, AFAICT it doesn't directly call any VMX functions?
>>>
>>
>> It is VMX specific because SVM does not have single step. We talked
>> about this in the past and it turned out that it was too much trouble
>> to make a custom single step.
> 
> I still think this shouldn't be VMX specific, and you should just
> return -EOPNOTSUPP if single stepping is not supported, just like
> hvm_debug_op does. In fact I'm missing a helper to set single
> stepping, which would be the right place to return -EOPNOTSUPP.
> 
> Then your rexec wouldn't need to know anything about the hardware and
> would just attempt to set single stepping, failing if it cannot be
> enabled.
> 

There is a helper function for single step, hvm_is_singlestep_supported(), 
and we have the d->arch.monitor.inguest_pagefault_disabled monitor flag. 
I can return not supported from the xc_monitor_inguest_pagefault call and 
save time in the process.
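I.e. in whichever handler ends up enabling inguest_pagefault_disabled, 
something as simple as (sketch):

    if ( !hvm_is_singlestep_supported() )
        return -EOPNOTSUPP;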

~Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 13:33   ` Jan Beulich
@ 2018-11-21 18:55   ` Razvan Cojocaru
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:05     ` Roger Pau Monné
  2 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-21 18:55 UTC (permalink / raw)
  To: Roger Pau Monné, Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>> +            if ( a == v )
>> +                continue;
>> +
>> +            /* Pause, synced. */
>> +            while ( !a->arch.in_host )
> Why not use a->is_running as a way to know whether the vCPU is
> running?
> 
> I think the logic of using vcpu_pause and expecting the running vcpu
> to take a vmexit and thus set in_host is wrong because a vcpu that
> wasn't running when vcpu_pause_nosync is called won't get scheduled
> anymore, thus not taking a vmexit and this function will lockup.
> 
> I don't think you need the in_host boolean at all.
> 
>> +                cpu_relax();
> Is this really better than using vcpu_pause?
> 
> I assume this is done to avoid waiting on each vcpu, and instead doing
> it here likely means less wait time?

The problem with plain vcpu_pause() is that we weren't able to use it,
for the same reason (which remains unclear as of yet) that we couldn't
use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
one that uses the same logic, but loops on a->is_running instead of
!a->arch.in_host:

(XEN) [ 3663.19(XEN) [ 3667.995061] Watchdog timer detects that CPU0 is
stuck!
(XEN) [ 3668.000694] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.007108] CPU:    0
(XEN) [ 3668.009882] RIP:    e008:[<ffff82d0801327d2>]
vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
(XEN) [ 3668.023575] rax: 0000000000000001   rbx: ffff83007ccfc000
rcx: ffff83007ccfc128
(XEN) [ 3668.031548] rdx: 0000000000000000   rsi: 0000000000000246
rdi: ffff830c52984148
(XEN) [ 3668.039522] rbp: ffff83007cf2fcd8   rsp: ffff83007cf2fcc8   r8:
 0000000000000003
(XEN) [ 3668.047495] r9:  0000000000000000   r10: ffff82d080348460
r11: 0000000000000000
(XEN) [ 3668.055465] r12: ffff82d080132792   r13: ffff830b172b4000
r14: ffff82c000225000
(XEN) [ 3668.063439] r15: 00000000000f0000   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3668.071415] cr3: 0000000b4ba94000   cr2: 00007f6161714f70
(XEN) [ 3668.077308] fsb: 00007f9164f088c0   gsb: ffff880276c00000
gss: 0000000000000000
(XEN) [ 3668.085280] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
e010   cs: e008
(XEN) [ 3668.092731] Xen code around <ffff82d0801327d2>
(vcpu_sleep_sync+0x40/0x71):
(XEN) [ 3668.100186]  01 00 00 00 74 24 f3 90 <8b> 11 48 8b 43 10 8b 80
dc 01 00 00 09 d0 48 98
(XEN) [ 3668.108593] Xen stack trace from rsp=ffff83007cf2fcc8:
(XEN) [ 3668.114223]    0000000000000240 ffff83007ccfc000
ffff83007cf2fd08 ffff82d08010735b
(XEN) [ 3668.122282]    ffff82d0801358ad ffff830b172b4000
0000000000000240 0000000000000048
(XEN) [ 3668.130346]    ffff83007cf2fd18 ffff82d08010879a
ffff83007cf2fd88 ffff82d080245e69
(XEN) [ 3668.138402]    ffff83007d615000 ffff830b172b4658
ffff83007cf2fd48 00000000000f0000
(XEN) [ 3668.146464]    00007f9164fb8004 0000000000000048
ffff830c52974000 0000000000000006
(XEN) [ 3668.154523]    ffffffffffffffff ffffffffffffffea
00007f9164fb1004 0000000000000000
(XEN) [ 3668.162584]    ffff83007cf2fe48 ffff82d0801dd8f5
ffff82d080374d58 ffff82d08024b308
(XEN) [ 3668.170643]    ffff83007cf2fdc8 ffff83007cf2ffff
ffff83007cf2fdc8 ffff830b172b4000
(XEN) [ 3668.178704]    0000024000000001 00000000000f0000
00007f9164fb8004 fffffffffffffffc
(XEN) [ 3668.186763]    0000000000000293 00007f91631f85d3
ffff82d080250834 ffff82d080250828
(XEN) [ 3668.194820]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff83007cf2fef8
(XEN) [ 3668.202882]    0000000000000022 ffff82d0801dc037
deadbeefdeadf00d ffffffff8100144a
(XEN) [ 3668.210942]    ffff83007cf2fee8 ffff82d080172aca
02ff82d080250834 0000000000000006
(XEN) [ 3668.219000]    00007f9164fb1004 deadbeefdeadf00d
deadbeefdeadf00d deadbeefdeadf00d
(XEN) [ 3668.227062]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff82d080250828
(XEN) [ 3668.235121]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff83007d615000
(XEN) [ 3668.243180]    0000000000000000 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.251240]    00007cff830d00e7 ffff82d080250899
00007ffef6baf1d0 0000000000305000
(XEN) [ 3668.259298]    ffff88022740b900 fffffffffffffff2
ffff88022b31fe98 ffff88026f3374d8
(XEN) [ 3668.267361]    0000000000000282 0000000000000000
ffff88007c995080 0000000000000000
(XEN) [ 3668.275417] Xen call trace:
(XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.284952]    [<ffff82d08010735b>]
domain.c#do_domain_pause+0x33/0x4f
(XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
(XEN) [ 3668.297952]    [<ffff82d080245e69>]
hap_track_dirty_vram+0x2c1/0x4a7
(XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
(XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
(XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
(XEN) [ 3668.323689]
(XEN) [ 3668.325685]
(XEN) [ 3668.327678] ****************************************
(XEN) [ 3668.333138] Panic on CPU 0:
(XEN) [ 3668.336428] FATAL TRAP: vector = 2 (nmi)
(XEN) [ 3668.340850] [error_code=0000]
(XEN) [ 3668.344404] ****************************************
(XEN) [ 3668.349863]
(XEN) [ 3668.351854] Reboot in five seconds...
(XEN) [ 3668.356017] Dumping other CPUs
(XEN) [ 3668.359567] *** Dumping CPU1 host state: ***
(XEN) [ 3668.364337] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.370750] CPU:    1
(XEN) [ 3668.373522] RIP:    e008:[<ffff82d08016b5a6>]
domain.c#default_idle+0xa2/0xb5
(XEN) [ 3668.381149] RFLAGS: 0000000000000202   CONTEXT: hypervisor
(XEN) [ 3668.387128] rax: 0000000000000000   rbx: ffff830c529b7fff
rcx: 0000000000000048
(XEN) [ 3668.395101] rdx: 0000000000000000   rsi: ffff830c529b7fff
rdi: ffff830c529b7ef8
(XEN) [ 3668.403076] rbp: ffff830c529b7ed0   rsp: ffff830c529b7ed0   r8:
 ffff830c529fe4a8
(XEN) [ 3668.411048] r9:  ffff830c529bac20   r10: ffff830c529fe490
r11: ffff830c529ba148
(XEN) [ 3668.419019] r12: ffff830c529ba140   r13: ffff83007cf75000
r14: 000003540fd7cd6b
(XEN) [ 3668.426994] r15: ffffffffffffffff   cr0: 000000008005003b
cr4: 00000000003526e0
(XEN) [ 3668.434964] cr3: 000000007cf1d000   cr2: 0000000000000000
(XEN) [ 3668.440861] fsb: 0000000000000000   gsb: 0000000000000000
gss: 0000000000000000
(XEN) [ 3668.448832] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3668.456285] Xen code around <ffff82d08016b5a6>
(domain.c#default_idle+0xa2/0xb5):
(XEN) [ 3668.464260]  00 00 00 0f 30 90 fb f4 <0f> b6 46 f5 80 a7 fd 00
00 00 fe 0f 30 90 eb 01
(XEN) [ 3668.472663] Xen stack trace from rsp=ffff830c529b7ed0:
(XEN) [ 3668.478297]    ffff830c529b7ef0 ffff82d08016b628
ffff82d080134ffe ffff83007cf75000
(XEN) [ 3668.486358]    ffff830c529b7df0 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.494417]    0000000000000000 00000000001c3a38
0000000000000000 0000000000000000
(XEN) [ 3668.502478]    0000000000000000 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.510538]    00000000000002ff 00000000001c00e9
0000000000000000 0000000000000000
(XEN) [ 3668.518595]    0000beef0000beef 0000000000103f15
000000bf0000beef 0000000000000046
(XEN) [ 3668.526656]    00000000001c3a38 000000000000beef
ffffea000d5bbeef ffffea000d5bbeef
(XEN) [ 3668.534715]    000000000000beef 000000000000beef
017fffc000000001 ffff83007cf75000
(XEN) [ 3668.542775]    0000003bd2646380 00000000003526e0
0000000000000000 0000000c5299e000
(XEN) [ 3668.550837]    0000070100000000 0000000000000000
(XEN) [ 3668.555948] Xen call trace:
(XEN) [ 3668.559242]    [<ffff82d08016b5a6>] domain.c#default_idle+0xa2/0xb5
(XEN) [ 3668.566000]    [<ffff82d08016b628>] domain.c#idle_loop+0x57/0x6e
(XEN) [ 3668.572502]
(XEN) [ 3668.574494] *** Dumping CPU2 host state: ***
(XEN) [ 3668.579261] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.585675] CPU:    2
(XEN) [ 3668.588449] RIP:    e008:[<ffff82d080127880>]
queue_read_lock_slowpath+0x27/0x4d
(XEN) [ 3668.596332] RFLAGS: 0000000000000286   CONTEXT: hypervisor (d1v1)
(XEN) [ 3668.602919] rax: 00000000000000ff   rbx: ffff830b1b2b6980
rcx: 0000000000000000
(XEN) [ 3668.610893] rdx: ffff830c52997fff   rsi: 0000000000000009
rdi: ffff830b1b2b698a
(XEN) [ 3668.618865] rbp: ffff830c52997a68   rsp: ffff830c52997a58   r8:
 0000000000000000
(XEN) [ 3668.626837] r9:  0000000000000003   r10: 0000000000000000
r11: 0000000000000000
(XEN) [ 3668.634812] r12: ffff830b1b2b6984   r13: ffff830c52997aa4
r14: ffff830c52997c34
(XEN) [ 3668.642786] r15: 00000000000001aa   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3668.650759] cr3: 0000000b105ef000   cr2: 00000190068c3000
(XEN) [ 3668.656650] fsb: 0000000000000000   gsb: 0000000000000000
gss: 0000004f58bd3000
(XEN) [ 3668.664624] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3668.672077] Xen code around <ffff82d080127880>
(queue_read_lock_slowpath+0x27/0x4d):
(XEN) [ 3668.680309]  84 c0 74 08 f3 90 8b 03 <84> c0 75 f8 b8 00 01 00
00 f0 0f c1 03 3c ff 75
(XEN) [ 3668.688717] Xen stack trace from rsp=ffff830c52997a58:
(XEN) [ 3668.694351]    ffff830b1b2b6980 ffff830c52997b54
ffff830c52997ad8 ffff82d08020c1df
(XEN) [ 3668.702411]    ffff830c52997b08 ffff82d080217db4
ffff830b172b4000 0000000352997c44
(XEN) [ 3668.710468]    000000000db12f43 0000000000000000
ffff830c00000000 00000000000001aa
(XEN) [ 3668.718529]    ffff830b1b2b6980 fffff801a1e18d03
ffff830c52997c34 ffff830078ba7000
(XEN) [ 3668.726591]    ffff830c52997b88 ffff82d080247208
ffff830b1b2b6980 ffff830c52997c44
(XEN) [ 3668.734648]    0000000000000000 fffff801a1e18d03
ffff830c52997b68 ffff82d08020bf20
(XEN) [ 3668.742707]    0000000000000000 0000000208a008e3
ffff830c52997b58 0000000400000000
(XEN) [ 3668.750768]    0000000000008000 0000000000000000
ffff830c52997be0 0000000000000000
(XEN) [ 3668.758826]    0000000000000000 ffff830078ba7000
ffff830c52997c34 fffff801a1e18d03
(XEN) [ 3668.766888]    ffff830b1b2b6980 ffff82d080311520
ffff830c52997b98 ffff82d080247475
(XEN) [ 3668.774945]    ffff830c52997be8 ffff82d080212751
0000000000008000 ffffef07c38b76b0
(XEN) [ 3668.783006]    0000000000000010 fffff801a1e18d03
fffff801a1e18d03 0000000000000d03
(XEN) [ 3668.791067]    000fffff801a1e18 ffff830c52997ef8
ffff830c52997c78 ffff82d0801d66a0
(XEN) [ 3668.799128]    ffffef07c38b7708 ffff830c52997c44
ffff830c52997c34 0000000000000004
(XEN) [ 3668.807188]    ffff830c52997d38 0000001000000004
ffff830078ba7000 0000001100000010
(XEN) [ 3668.815244]    ffffea000d59beef ffffea000d59beef
000000000000beef ffff830c52997d10
(XEN) [ 3668.823304]    ffff830078ba7000 0000000000000001
0000000000000000 ffff830c52997ef8
(XEN) [ 3668.831363]    ffff830c52997c88 ffff82d0801d844d
ffff830c52997ce8 ffff82d0801d13da
(XEN) [ 3668.839423]    ffff830c52997d38 ffff82d0803107e0
0000000000000000 fffff801a1e18d03
(XEN) [ 3668.847484]    ffff830c52997cd8 ffff830078ba7000
ffff830c52997d10 000000000000002c
(XEN) [ 3668.855544] Xen call trace:
(XEN) [ 3668.858838]    [<ffff82d080127880>]
queue_read_lock_slowpath+0x27/0x4d
(XEN) [ 3668.865857]    [<ffff82d08020c1df>]
get_page_from_gfn_p2m+0x14e/0x3b0
(XEN) [ 3668.872792]    [<ffff82d080247208>]
hap_p2m_ga_to_gfn_4_levels+0x48/0x299
(XEN) [ 3668.880071]    [<ffff82d080247475>]
hap_gva_to_gfn_4_levels+0x1c/0x1e
(XEN) [ 3668.887004]    [<ffff82d080212751>] paging_gva_to_gfn+0x10e/0x11d
(XEN) [ 3668.893590]    [<ffff82d0801d66a0>] hvm.c#__hvm_copy+0x98/0x37f
(XEN) [ 3668.900003]    [<ffff82d0801d844d>]
hvm_fetch_from_guest_virt_nofault+0x14/0x16
(XEN) [ 3668.907801]    [<ffff82d0801d13da>]
emulate.c#_hvm_emulate_one+0x118/0x2bc
(XEN) [ 3668.915168]    [<ffff82d0801d1674>] hvm_emulate_one+0x10/0x12
(XEN) [ 3668.921409]    [<ffff82d0801e08c2>] handle_mmio+0x52/0xc9
(XEN) [ 3668.927303]    [<ffff82d0802034a2>]
vmx_vmexit_handler+0x1e0e/0x1e45
(XEN) [ 3668.934149]    [<ffff82d08020820c>]
vmx_asm_vmexit_handler+0xec/0x250
(XEN) [ 3668.941079]
(XEN) [ 3668.943072] *** Dumping CPU2 guest state (d1v1): ***
(XEN) [ 3668.948533] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.954948] CPU:    2
(XEN) [ 3668.957719] RIP:    0010:[<fffff801a1e18d03>]
(XEN) [ 3668.962572] RFLAGS: 0000000000010046   CONTEXT: hvm guest (d1v1)
(XEN) [ 3668.969075] rax: fffff78880009000   rbx: 000000000004002f
rcx: fffff801a1e19300
(XEN) [ 3668.977045] rdx: ffffef07c38b76b8   rsi: ffffef07c38b7708
rdi: 0000000000000000
(XEN) [ 3668.985018] rbp: ffffef07c38b76b0   rsp: ffffef07c38b75f0   r8:
 ffffef07c38b7708
(XEN) [ 3668.992991] r9:  000000000000002f   r10: 0000000000000001
r11: 0000000000000001
(XEN) [ 3669.000966] r12: 0000000000000001   r13: 0000000000000000
r14: 0000000000000001
(XEN) [ 3669.008938] r15: 000000000000002f   cr0: 0000000080050031
cr4: 0000000000170678
(XEN) [ 3669.016913] cr3: 00000000001aa002   cr2: 00000190068c3000
(XEN) [ 3669.022806] fsb: 0000000000000000   gsb: ffffc9814c820000
gss: 0000000473bfe000
(XEN) [ 3669.030776] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
0000   cs: 0010
(XEN) [ 3669.038229]
(XEN) [ 3669.040223] *** Dumping CPU3 host state: ***
(XEN) [ 3669.044988] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3669.051403] CPU:    3
(XEN) [ 3669.054177] RIP:    e008:[<ffff82d08021006a>]
vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
(XEN) [ 3669.069431] rax: ffff830078ba7000   rbx: ffff83007ccfc000
rcx: 0000000000000002
(XEN) [ 3669.077404] rdx: ffff830c5297ffff   rsi: 0000000000000246
rdi: ffff830c52998148
(XEN) [ 3669.085377] rbp: ffff830c5297fd18   rsp: ffff830c5297fcb8   r8:
 0000000000000002
(XEN) [ 3669.093349] r9:  0000000000000006   r10: 000000000003d976
r11: 0000000000000006
(XEN) [ 3669.101320] r12: 0000000000000000   r13: ffff82d08028a3e4
r14: 0000000000000000
(XEN) [ 3669.109296] r15: 0000000113f007f8   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3669.117269] cr3: 0000000b10380000   cr2: 0000000000000000
(XEN) [ 3669.123163] fsb: 0000000000000000   gsb: 0000000000000000
gss: fffff801a129e000
(XEN) [ 3669.131132] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3669.138586] Xen code around <ffff82d08021006a>
(vmx_start_reexecute_instruction+0x107/0x68a):
(XEN) [ 3669.147598]  90 80 b8 0b 01 00 00 00 <75> f5 48 8b 40 18 48 85
c0 75 de e9 19 05 00 00
(XEN) [ 3669.156005] Xen stack trace from rsp=ffff830c5297fcb8:
(XEN) [ 3669.161640]    ffff830b17342800 0000000300000009
0000000000000100 ffff830b17342000
(XEN) [ 3669.169697]    ffff830c00000000 ffff830b1b2b6980
ffff830b172b4000 ffff830b1b2b6980
(XEN) [ 3669.177761]    000000001b2b6801 0000000000000002
ffff83007ccfc000 000000000000003b
(XEN) [ 3669.185818]    ffff830c5297fda8 ffff82d080210b3e
0000000000113f00 0000000000000000
(XEN) [ 3669.193877]    00007ff91cd34d60 0000000113f007f8
0000000000000000 ffff830c5297fdf0
(XEN) [ 3669.201937]    0000000000113f00 0000000000000000
ffff83007ccfc000 0000000000000005
(XEN) [ 3669.209997]    ffff83007ccfc000 ffff830b172b4000
ffff83007ccfc000 ffff83007ccfc000
(XEN) [ 3669.218056]    0000000000113f00 0000000000000000
ffff830c5297fe38 ffff82d0801dee9e
(XEN) [ 3669.226116]    0000000000913f00 0000000000000000
00007ff91cd34d60 ffff830b1b2b6980
(XEN) [ 3669.234177]    0000003b5297fe38 0000000113f007f8
0000000000000296 0000000000000000
(XEN) [ 3669.242236]    ffff830b1b2b6980 0000000000000005
ffff82d0802081d1 ffff830c5297fef8
(XEN) [ 3669.250295]    ffff83007ccfc000 00000000000006ab
000000000000001b 0000000113f007f8
(XEN) [ 3669.258354]    ffff830c5297fee8 ffff82d080202c00
ffff82d0802081d1 0000000000000080
(XEN) [ 3669.266417]    0000000000000000 0000000000000002
ffff830b172b4000 0000000000113f00
(XEN) [ 3669.274474]    00007ff91cd34d60 000000000000003b
ffff82d0802081d1 ffff82d0802081c5
(XEN) [ 3669.282537]    ffff82d0802081d1 ffff82d0802081c5
ffff82d0802081d1 ffff82d0802081c5
(XEN) [ 3669.290596]    ffff82d0802081d1 ffff83007ccfc000
0000000000000000 0000000000000000
(XEN) [ 3669.298655]    0000000000000000 0000000000000000
00007cf3ad6800e7 ffff82d08020820c
(XEN) [ 3669.306712]    00007ff91cd34d60 0000019285c42a50
00000192858a8eb0 0000000000000000
(XEN) [ 3669.314772]    0000019285894438 41c64e6da3bd2845
0000104000000000 00000fff239a69ac
(XEN) [ 3669.322832] Xen call trace:
(XEN) [ 3669.326128]    [<ffff82d08021006a>]
vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.333925]    [<ffff82d080210b3e>]
p2m_mem_access_check+0x551/0x64d
(XEN) [ 3669.340774]    [<ffff82d0801dee9e>]
hvm_hap_nested_page_fault+0x2f2/0x631
(XEN) [ 3669.348051]    [<ffff82d080202c00>]
vmx_vmexit_handler+0x156c/0x1e45
(XEN) [ 3669.354899]    [<ffff82d08020820c>]
vmx_asm_vmexit_handler+0xec/0x250
(XEN) [ 3669.361832]
(XEN) [ 3669.363827] *** Dumping CPU3 guest state (d1v0): ***
(XEN) [ 3669.369285] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3669.375700] CPU:    3
(XEN) [ 3669.378471] RIP:    0033:[<00007ff91cd34d60>]
(XEN) [ 3669.383323] RFLAGS: 0000000000010247   CONTEXT: hvm guest (d1v0)
(XEN) [ 3669.389824] rax: 00007ff91cd34d60   rbx: 41c64e6da3bd2845
rcx: 41c64e6da3bd2845
(XEN) [ 3669.397799] rdx: 0000000000000077   rsi: 0000000000000001
rdi: 0000019285877150
(XEN) [ 3669.405768] rbp: 0000019285894438   rsp: 0000008d6aa7f608   r8:
 0000000000000000
(XEN) [ 3669.413743] r9:  00000192858a8eb0   r10: 00000fff239a69ac
r11: 0000104000000000
(XEN) [ 3669.421716] r12: 0000000000000000   r13: 00000192858a8eb0
r14: 0000019285c42a50
(XEN) [ 3669.429690] r15: 00007ff91cd34d60   cr0: 0000000080050031
cr4: 0000000000170678
(XEN) [ 3669.437662] cr3: 0000000113f00002   cr2: 0000000000000000
(XEN) [ 3669.443555] fsb: 0000000000000000   gsb: 0000008d6a7cf000
gss: 0000002562d20000
(XEN) [ 3669.451529] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
002b   cs: 0033
(XEN) [ 3669.458980]
(XEN) [ 3669.463584] APIC error on CPU0: 40(00)

Some scheduler magic appears to happen here: it is unclear why
is_running doesn't end up being 0 as expected in our case. We'll
keep digging.


Thanks,
Razvan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 18:55   ` Razvan Cojocaru
@ 2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:00       ` Jan Beulich
  2018-11-22 10:07       ` Roger Pau Monné
  2018-11-22 10:05     ` Roger Pau Monné
  1 sibling, 2 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-22  9:50 UTC (permalink / raw)
  To: Razvan Cojocaru, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar


On 21.11.2018 20:55, Razvan Cojocaru wrote:
>> +            if ( a == v )
>> +                continue;
>> +
>> +            /* Pause, synced. */
>> +            while ( !a->arch.in_host )
> Why not use a->is_running as a way to know whether the vCPU is
> running?
> 
> I think the logic of using vcpu_pause and expecting the running vcpu
> to take a vmexit and thus set in_host is wrong because a vcpu that
> wasn't running when vcpu_pause_nosync is called won't get scheduled
> anymore, thus not taking a vmexit and this function will lockup.

We can resolve this by using while ( !vcpu_runnable(a) && 
!a->arch.in_host ), if this is suitable.
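I.e. the wait loop would become (sketch):

            /* Pause, synced. */
            while ( !vcpu_runnable(a) && !a->arch.in_host )
                cpu_relax();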

~Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
@ 2018-11-22 10:00       ` Jan Beulich
  2018-11-22 10:07       ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-22 10:00 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 22.11.18 at 10:50, <aisaila@bitdefender.com> wrote:
> On 21.11.2018 20:55, Razvan Cojocaru wrote:
>>> +            if ( a == v )
>>> +                continue;
>>> +
>>> +            /* Pause, synced. */
>>> +            while ( !a->arch.in_host )
>> Why not use a->is_running as a way to know whether the vCPU is
>> running?
>> 
>> I think the logic of using vcpu_pause and expecting the running vcpu
>> to take a vmexit and thus set in_host is wrong because a vcpu that
>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>> anymore, thus not taking a vmexit and this function will lockup.
> 
> We can resolve this by using while ( !vcpu_runnable(a) && 
> !a->arch.in_host ), if this is suitable.

Only if you can fully explain why the current infrastructure doesn't
work and can't be made to work without the custom in_host flag.
From prior work I know things are subtle in some cases where one
would want to use ->is_running, but so far all issues could be
resolved without custom additions.

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 18:55   ` Razvan Cojocaru
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
@ 2018-11-22 10:05     ` Roger Pau Monné
  2018-11-22 10:14       ` Razvan Cojocaru
  1 sibling, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:05 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >> +            if ( a == v )
> >> +                continue;
> >> +
> >> +            /* Pause, synced. */
> >> +            while ( !a->arch.in_host )
> > Why not use a->is_running as a way to know whether the vCPU is
> > running?
> > 
> > I think the logic of using vcpu_pause and expecting the running vcpu
> > to take a vmexit and thus set in_host is wrong because a vcpu that
> > wasn't running when vcpu_pause_nosync is called won't get scheduled
> > anymore, thus not taking a vmexit and this function will lockup.
> > 
> > I don't think you need the in_host boolean at all.
> > 
> >> +                cpu_relax();
> > Is this really better than using vcpu_pause?
> > 
> > I assume this is done to avoid waiting on each vcpu, and instead doing
> > it here likely means less wait time?
> 
> The problem with plain vcpu_pause() is that we weren't able to use it,
> for the same reason (which remains unclear as of yet) that we couldn't
> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> one that uses the same logic, but loops on a->is_running instead of
> !a->arch.in_host:
>
> 
> Some scheduler magic appears to happen here: it is unclear why
> is_running doesn't end up being 0 as expected in our case. We'll
> keep digging.

There seems to be some kind of deadlock between
vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
Are you holding a lock while trying to put the other vcpus to sleep?

Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:00       ` Jan Beulich
@ 2018-11-22 10:07       ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:07 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 09:50:28AM +0000, Alexandru Stefan ISAILA wrote:
> 
> On 21.11.2018 20:55, Razvan Cojocaru wrote:
> >> +            if ( a == v )
> >> +                continue;
> >> +
> >> +            /* Pause, synced. */
> >> +            while ( !a->arch.in_host )
> > Why not use a->is_running as a way to know whether the vCPU is
> > running?
> > 
> > I think the logic of using vcpu_pause and expecting the running vcpu
> > to take a vmexit and thus set in_host is wrong because a vcpu that
> > wasn't running when vcpu_pause_nosync is called won't get scheduled
> > anymore, thus not taking a vmexit and this function will lockup.
> 
> We can resolve this by using while ( !vcpu_runnable(a) && 
> !a->arch.in_host ), if this is suitable.

I'm afraid that, without a reason why the generic vcpu_pause is not
suitable here, adding more code to it is just papering over the real
issue.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:05     ` Roger Pau Monné
@ 2018-11-22 10:14       ` Razvan Cojocaru
  2018-11-22 10:58         ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 10:14 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>> +            if ( a == v )
>>>> +                continue;
>>>> +
>>>> +            /* Pause, synced. */
>>>> +            while ( !a->arch.in_host )
>>> Why not use a->is_running as a way to know whether the vCPU is
>>> running?
>>>
>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>> anymore, thus not taking a vmexit and this function will lockup.
>>>
>>> I don't think you need the in_host boolean at all.
>>>
>>>> +                cpu_relax();
>>> Is this really better than using vcpu_pause?
>>>
>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>> it here likely means less wait time?
>>
>> The problem with plain vcpu_pause() is that we weren't able to use it,
>> for the same reason (which remains unclear as of yet) that we couldn't
>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>> one that uses the same logic, but loops on a->is_running instead of
>> !a->arch.in_host:
>>
>> (XEN) [ 3663.19(XEN) [ 3667.995061] Watchdog timer detects that CPU0 is
>> stuck!
>> (XEN) [ 3668.000694] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.007108] CPU:    0
>> (XEN) [ 3668.009882] RIP:    e008:[<ffff82d0801327d2>]
>> vcpu_sleep_sync+0x40/0x71
>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
>> (XEN) [ 3668.023575] rax: 0000000000000001   rbx: ffff83007ccfc000
>> rcx: ffff83007ccfc128
>> (XEN) [ 3668.031548] rdx: 0000000000000000   rsi: 0000000000000246
>> rdi: ffff830c52984148
>> (XEN) [ 3668.039522] rbp: ffff83007cf2fcd8   rsp: ffff83007cf2fcc8   r8:
>>  0000000000000003
>> (XEN) [ 3668.047495] r9:  0000000000000000   r10: ffff82d080348460
>> r11: 0000000000000000
>> (XEN) [ 3668.055465] r12: ffff82d080132792   r13: ffff830b172b4000
>> r14: ffff82c000225000
>> (XEN) [ 3668.063439] r15: 00000000000f0000   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3668.071415] cr3: 0000000b4ba94000   cr2: 00007f6161714f70
>> (XEN) [ 3668.077308] fsb: 00007f9164f088c0   gsb: ffff880276c00000
>> gss: 0000000000000000
>> (XEN) [ 3668.085280] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> e010   cs: e008
>> (XEN) [ 3668.092731] Xen code around <ffff82d0801327d2>
>> (vcpu_sleep_sync+0x40/0x71):
>> (XEN) [ 3668.100186]  01 00 00 00 74 24 f3 90 <8b> 11 48 8b 43 10 8b 80
>> dc 01 00 00 09 d0 48 98
>> (XEN) [ 3668.108593] Xen stack trace from rsp=ffff83007cf2fcc8:
>> (XEN) [ 3668.114223]    0000000000000240 ffff83007ccfc000
>> ffff83007cf2fd08 ffff82d08010735b
>> (XEN) [ 3668.122282]    ffff82d0801358ad ffff830b172b4000
>> 0000000000000240 0000000000000048
>> (XEN) [ 3668.130346]    ffff83007cf2fd18 ffff82d08010879a
>> ffff83007cf2fd88 ffff82d080245e69
>> (XEN) [ 3668.138402]    ffff83007d615000 ffff830b172b4658
>> ffff83007cf2fd48 00000000000f0000
>> (XEN) [ 3668.146464]    00007f9164fb8004 0000000000000048
>> ffff830c52974000 0000000000000006
>> (XEN) [ 3668.154523]    ffffffffffffffff ffffffffffffffea
>> 00007f9164fb1004 0000000000000000
>> (XEN) [ 3668.162584]    ffff83007cf2fe48 ffff82d0801dd8f5
>> ffff82d080374d58 ffff82d08024b308
>> (XEN) [ 3668.170643]    ffff83007cf2fdc8 ffff83007cf2ffff
>> ffff83007cf2fdc8 ffff830b172b4000
>> (XEN) [ 3668.178704]    0000024000000001 00000000000f0000
>> 00007f9164fb8004 fffffffffffffffc
>> (XEN) [ 3668.186763]    0000000000000293 00007f91631f85d3
>> ffff82d080250834 ffff82d080250828
>> (XEN) [ 3668.194820]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff83007cf2fef8
>> (XEN) [ 3668.202882]    0000000000000022 ffff82d0801dc037
>> deadbeefdeadf00d ffffffff8100144a
>> (XEN) [ 3668.210942]    ffff83007cf2fee8 ffff82d080172aca
>> 02ff82d080250834 0000000000000006
>> (XEN) [ 3668.219000]    00007f9164fb1004 deadbeefdeadf00d
>> deadbeefdeadf00d deadbeefdeadf00d
>> (XEN) [ 3668.227062]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff82d080250828
>> (XEN) [ 3668.235121]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff83007d615000
>> (XEN) [ 3668.243180]    0000000000000000 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.251240]    00007cff830d00e7 ffff82d080250899
>> 00007ffef6baf1d0 0000000000305000
>> (XEN) [ 3668.259298]    ffff88022740b900 fffffffffffffff2
>> ffff88022b31fe98 ffff88026f3374d8
>> (XEN) [ 3668.267361]    0000000000000282 0000000000000000
>> ffff88007c995080 0000000000000000
>> (XEN) [ 3668.275417] Xen call trace:
>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
>> (XEN) [ 3668.284952]    [<ffff82d08010735b>]
>> domain.c#do_domain_pause+0x33/0x4f
>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
>> (XEN) [ 3668.297952]    [<ffff82d080245e69>]
>> hap_track_dirty_vram+0x2c1/0x4a7
>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
>> (XEN) [ 3668.323689]
>> (XEN) [ 3668.325685]
>> (XEN) [ 3668.327678] ****************************************
>> (XEN) [ 3668.333138] Panic on CPU 0:
>> (XEN) [ 3668.336428] FATAL TRAP: vector = 2 (nmi)
>> (XEN) [ 3668.340850] [error_code=0000]
>> (XEN) [ 3668.344404] ****************************************
>> (XEN) [ 3668.349863]
>> (XEN) [ 3668.351854] Reboot in five seconds...
>> (XEN) [ 3668.356017] Dumping other CPUs
>> (XEN) [ 3668.359567] *** Dumping CPU1 host state: ***
>> (XEN) [ 3668.364337] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.370750] CPU:    1
>> (XEN) [ 3668.373522] RIP:    e008:[<ffff82d08016b5a6>]
>> domain.c#default_idle+0xa2/0xb5
>> (XEN) [ 3668.381149] RFLAGS: 0000000000000202   CONTEXT: hypervisor
>> (XEN) [ 3668.387128] rax: 0000000000000000   rbx: ffff830c529b7fff
>> rcx: 0000000000000048
>> (XEN) [ 3668.395101] rdx: 0000000000000000   rsi: ffff830c529b7fff
>> rdi: ffff830c529b7ef8
>> (XEN) [ 3668.403076] rbp: ffff830c529b7ed0   rsp: ffff830c529b7ed0   r8:
>>  ffff830c529fe4a8
>> (XEN) [ 3668.411048] r9:  ffff830c529bac20   r10: ffff830c529fe490
>> r11: ffff830c529ba148
>> (XEN) [ 3668.419019] r12: ffff830c529ba140   r13: ffff83007cf75000
>> r14: 000003540fd7cd6b
>> (XEN) [ 3668.426994] r15: ffffffffffffffff   cr0: 000000008005003b
>> cr4: 00000000003526e0
>> (XEN) [ 3668.434964] cr3: 000000007cf1d000   cr2: 0000000000000000
>> (XEN) [ 3668.440861] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: 0000000000000000
>> (XEN) [ 3668.448832] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3668.456285] Xen code around <ffff82d08016b5a6>
>> (domain.c#default_idle+0xa2/0xb5):
>> (XEN) [ 3668.464260]  00 00 00 0f 30 90 fb f4 <0f> b6 46 f5 80 a7 fd 00
>> 00 00 fe 0f 30 90 eb 01
>> (XEN) [ 3668.472663] Xen stack trace from rsp=ffff830c529b7ed0:
>> (XEN) [ 3668.478297]    ffff830c529b7ef0 ffff82d08016b628
>> ffff82d080134ffe ffff83007cf75000
>> (XEN) [ 3668.486358]    ffff830c529b7df0 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.494417]    0000000000000000 00000000001c3a38
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.502478]    0000000000000000 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.510538]    00000000000002ff 00000000001c00e9
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.518595]    0000beef0000beef 0000000000103f15
>> 000000bf0000beef 0000000000000046
>> (XEN) [ 3668.526656]    00000000001c3a38 000000000000beef
>> ffffea000d5bbeef ffffea000d5bbeef
>> (XEN) [ 3668.534715]    000000000000beef 000000000000beef
>> 017fffc000000001 ffff83007cf75000
>> (XEN) [ 3668.542775]    0000003bd2646380 00000000003526e0
>> 0000000000000000 0000000c5299e000
>> (XEN) [ 3668.550837]    0000070100000000 0000000000000000
>> (XEN) [ 3668.555948] Xen call trace:
>> (XEN) [ 3668.559242]    [<ffff82d08016b5a6>] domain.c#default_idle+0xa2/0xb5
>> (XEN) [ 3668.566000]    [<ffff82d08016b628>] domain.c#idle_loop+0x57/0x6e
>> (XEN) [ 3668.572502]
>> (XEN) [ 3668.574494] *** Dumping CPU2 host state: ***
>> (XEN) [ 3668.579261] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.585675] CPU:    2
>> (XEN) [ 3668.588449] RIP:    e008:[<ffff82d080127880>]
>> queue_read_lock_slowpath+0x27/0x4d
>> (XEN) [ 3668.596332] RFLAGS: 0000000000000286   CONTEXT: hypervisor (d1v1)
>> (XEN) [ 3668.602919] rax: 00000000000000ff   rbx: ffff830b1b2b6980
>> rcx: 0000000000000000
>> (XEN) [ 3668.610893] rdx: ffff830c52997fff   rsi: 0000000000000009
>> rdi: ffff830b1b2b698a
>> (XEN) [ 3668.618865] rbp: ffff830c52997a68   rsp: ffff830c52997a58   r8:
>>  0000000000000000
>> (XEN) [ 3668.626837] r9:  0000000000000003   r10: 0000000000000000
>> r11: 0000000000000000
>> (XEN) [ 3668.634812] r12: ffff830b1b2b6984   r13: ffff830c52997aa4
>> r14: ffff830c52997c34
>> (XEN) [ 3668.642786] r15: 00000000000001aa   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3668.650759] cr3: 0000000b105ef000   cr2: 00000190068c3000
>> (XEN) [ 3668.656650] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: 0000004f58bd3000
>> (XEN) [ 3668.664624] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3668.672077] Xen code around <ffff82d080127880>
>> (queue_read_lock_slowpath+0x27/0x4d):
>> (XEN) [ 3668.680309]  84 c0 74 08 f3 90 8b 03 <84> c0 75 f8 b8 00 01 00
>> 00 f0 0f c1 03 3c ff 75
>> (XEN) [ 3668.688717] Xen stack trace from rsp=ffff830c52997a58:
>> (XEN) [ 3668.694351]    ffff830b1b2b6980 ffff830c52997b54
>> ffff830c52997ad8 ffff82d08020c1df
>> (XEN) [ 3668.702411]    ffff830c52997b08 ffff82d080217db4
>> ffff830b172b4000 0000000352997c44
>> (XEN) [ 3668.710468]    000000000db12f43 0000000000000000
>> ffff830c00000000 00000000000001aa
>> (XEN) [ 3668.718529]    ffff830b1b2b6980 fffff801a1e18d03
>> ffff830c52997c34 ffff830078ba7000
>> (XEN) [ 3668.726591]    ffff830c52997b88 ffff82d080247208
>> ffff830b1b2b6980 ffff830c52997c44
>> (XEN) [ 3668.734648]    0000000000000000 fffff801a1e18d03
>> ffff830c52997b68 ffff82d08020bf20
>> (XEN) [ 3668.742707]    0000000000000000 0000000208a008e3
>> ffff830c52997b58 0000000400000000
>> (XEN) [ 3668.750768]    0000000000008000 0000000000000000
>> ffff830c52997be0 0000000000000000
>> (XEN) [ 3668.758826]    0000000000000000 ffff830078ba7000
>> ffff830c52997c34 fffff801a1e18d03
>> (XEN) [ 3668.766888]    ffff830b1b2b6980 ffff82d080311520
>> ffff830c52997b98 ffff82d080247475
>> (XEN) [ 3668.774945]    ffff830c52997be8 ffff82d080212751
>> 0000000000008000 ffffef07c38b76b0
>> (XEN) [ 3668.783006]    0000000000000010 fffff801a1e18d03
>> fffff801a1e18d03 0000000000000d03
>> (XEN) [ 3668.791067]    000fffff801a1e18 ffff830c52997ef8
>> ffff830c52997c78 ffff82d0801d66a0
>> (XEN) [ 3668.799128]    ffffef07c38b7708 ffff830c52997c44
>> ffff830c52997c34 0000000000000004
>> (XEN) [ 3668.807188]    ffff830c52997d38 0000001000000004
>> ffff830078ba7000 0000001100000010
>> (XEN) [ 3668.815244]    ffffea000d59beef ffffea000d59beef
>> 000000000000beef ffff830c52997d10
>> (XEN) [ 3668.823304]    ffff830078ba7000 0000000000000001
>> 0000000000000000 ffff830c52997ef8
>> (XEN) [ 3668.831363]    ffff830c52997c88 ffff82d0801d844d
>> ffff830c52997ce8 ffff82d0801d13da
>> (XEN) [ 3668.839423]    ffff830c52997d38 ffff82d0803107e0
>> 0000000000000000 fffff801a1e18d03
>> (XEN) [ 3668.847484]    ffff830c52997cd8 ffff830078ba7000
>> ffff830c52997d10 000000000000002c
>> (XEN) [ 3668.855544] Xen call trace:
>> (XEN) [ 3668.858838]    [<ffff82d080127880>]
>> queue_read_lock_slowpath+0x27/0x4d
>> (XEN) [ 3668.865857]    [<ffff82d08020c1df>]
>> get_page_from_gfn_p2m+0x14e/0x3b0
>> (XEN) [ 3668.872792]    [<ffff82d080247208>]
>> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
>> (XEN) [ 3668.880071]    [<ffff82d080247475>]
>> hap_gva_to_gfn_4_levels+0x1c/0x1e
>> (XEN) [ 3668.887004]    [<ffff82d080212751>] paging_gva_to_gfn+0x10e/0x11d
>> (XEN) [ 3668.893590]    [<ffff82d0801d66a0>] hvm.c#__hvm_copy+0x98/0x37f
>> (XEN) [ 3668.900003]    [<ffff82d0801d844d>]
>> hvm_fetch_from_guest_virt_nofault+0x14/0x16
>> (XEN) [ 3668.907801]    [<ffff82d0801d13da>]
>> emulate.c#_hvm_emulate_one+0x118/0x2bc
>> (XEN) [ 3668.915168]    [<ffff82d0801d1674>] hvm_emulate_one+0x10/0x12
>> (XEN) [ 3668.921409]    [<ffff82d0801e08c2>] handle_mmio+0x52/0xc9
>> (XEN) [ 3668.927303]    [<ffff82d0802034a2>]
>> vmx_vmexit_handler+0x1e0e/0x1e45
>> (XEN) [ 3668.934149]    [<ffff82d08020820c>]
>> vmx_asm_vmexit_handler+0xec/0x250
>> (XEN) [ 3668.941079]
>> (XEN) [ 3668.943072] *** Dumping CPU2 guest state (d1v1): ***
>> (XEN) [ 3668.948533] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.954948] CPU:    2
>> (XEN) [ 3668.957719] RIP:    0010:[<fffff801a1e18d03>]
>> (XEN) [ 3668.962572] RFLAGS: 0000000000010046   CONTEXT: hvm guest (d1v1)
>> (XEN) [ 3668.969075] rax: fffff78880009000   rbx: 000000000004002f
>> rcx: fffff801a1e19300
>> (XEN) [ 3668.977045] rdx: ffffef07c38b76b8   rsi: ffffef07c38b7708
>> rdi: 0000000000000000
>> (XEN) [ 3668.985018] rbp: ffffef07c38b76b0   rsp: ffffef07c38b75f0   r8:
>>  ffffef07c38b7708
>> (XEN) [ 3668.992991] r9:  000000000000002f   r10: 0000000000000001
>> r11: 0000000000000001
>> (XEN) [ 3669.000966] r12: 0000000000000001   r13: 0000000000000000
>> r14: 0000000000000001
>> (XEN) [ 3669.008938] r15: 000000000000002f   cr0: 0000000080050031
>> cr4: 0000000000170678
>> (XEN) [ 3669.016913] cr3: 00000000001aa002   cr2: 00000190068c3000
>> (XEN) [ 3669.022806] fsb: 0000000000000000   gsb: ffffc9814c820000
>> gss: 0000000473bfe000
>> (XEN) [ 3669.030776] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
>> 0000   cs: 0010
>> (XEN) [ 3669.038229]
>> (XEN) [ 3669.040223] *** Dumping CPU3 host state: ***
>> (XEN) [ 3669.044988] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3669.051403] CPU:    3
>> (XEN) [ 3669.054177] RIP:    e008:[<ffff82d08021006a>]
>> vmx_start_reexecute_instruction+0x107/0x68a
>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
>> (XEN) [ 3669.069431] rax: ffff830078ba7000   rbx: ffff83007ccfc000
>> rcx: 0000000000000002
>> (XEN) [ 3669.077404] rdx: ffff830c5297ffff   rsi: 0000000000000246
>> rdi: ffff830c52998148
>> (XEN) [ 3669.085377] rbp: ffff830c5297fd18   rsp: ffff830c5297fcb8   r8:
>>  0000000000000002
>> (XEN) [ 3669.093349] r9:  0000000000000006   r10: 000000000003d976
>> r11: 0000000000000006
>> (XEN) [ 3669.101320] r12: 0000000000000000   r13: ffff82d08028a3e4
>> r14: 0000000000000000
>> (XEN) [ 3669.109296] r15: 0000000113f007f8   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3669.117269] cr3: 0000000b10380000   cr2: 0000000000000000
>> (XEN) [ 3669.123163] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: fffff801a129e000
>> (XEN) [ 3669.131132] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3669.138586] Xen code around <ffff82d08021006a>
>> (vmx_start_reexecute_instruction+0x107/0x68a):
>> (XEN) [ 3669.147598]  90 80 b8 0b 01 00 00 00 <75> f5 48 8b 40 18 48 85
>> c0 75 de e9 19 05 00 00
>> (XEN) [ 3669.156005] Xen stack trace from rsp=ffff830c5297fcb8:
>> (XEN) [ 3669.161640]    ffff830b17342800 0000000300000009
>> 0000000000000100 ffff830b17342000
>> (XEN) [ 3669.169697]    ffff830c00000000 ffff830b1b2b6980
>> ffff830b172b4000 ffff830b1b2b6980
>> (XEN) [ 3669.177761]    000000001b2b6801 0000000000000002
>> ffff83007ccfc000 000000000000003b
>> (XEN) [ 3669.185818]    ffff830c5297fda8 ffff82d080210b3e
>> 0000000000113f00 0000000000000000
>> (XEN) [ 3669.193877]    00007ff91cd34d60 0000000113f007f8
>> 0000000000000000 ffff830c5297fdf0
>> (XEN) [ 3669.201937]    0000000000113f00 0000000000000000
>> ffff83007ccfc000 0000000000000005
>> (XEN) [ 3669.209997]    ffff83007ccfc000 ffff830b172b4000
>> ffff83007ccfc000 ffff83007ccfc000
>> (XEN) [ 3669.218056]    0000000000113f00 0000000000000000
>> ffff830c5297fe38 ffff82d0801dee9e
>> (XEN) [ 3669.226116]    0000000000913f00 0000000000000000
>> 00007ff91cd34d60 ffff830b1b2b6980
>> (XEN) [ 3669.234177]    0000003b5297fe38 0000000113f007f8
>> 0000000000000296 0000000000000000
>> (XEN) [ 3669.242236]    ffff830b1b2b6980 0000000000000005
>> ffff82d0802081d1 ffff830c5297fef8
>> (XEN) [ 3669.250295]    ffff83007ccfc000 00000000000006ab
>> 000000000000001b 0000000113f007f8
>> (XEN) [ 3669.258354]    ffff830c5297fee8 ffff82d080202c00
>> ffff82d0802081d1 0000000000000080
>> (XEN) [ 3669.266417]    0000000000000000 0000000000000002
>> ffff830b172b4000 0000000000113f00
>> (XEN) [ 3669.274474]    00007ff91cd34d60 000000000000003b
>> ffff82d0802081d1 ffff82d0802081c5
>> (XEN) [ 3669.282537]    ffff82d0802081d1 ffff82d0802081c5
>> ffff82d0802081d1 ffff82d0802081c5
>> (XEN) [ 3669.290596]    ffff82d0802081d1 ffff83007ccfc000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3669.298655]    0000000000000000 0000000000000000
>> 00007cf3ad6800e7 ffff82d08020820c
>> (XEN) [ 3669.306712]    00007ff91cd34d60 0000019285c42a50
>> 00000192858a8eb0 0000000000000000
>> (XEN) [ 3669.314772]    0000019285894438 41c64e6da3bd2845
>> 0000104000000000 00000fff239a69ac
>> (XEN) [ 3669.322832] Xen call trace:
>> (XEN) [ 3669.326128]    [<ffff82d08021006a>]
>> vmx_start_reexecute_instruction+0x107/0x68a
>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>]
>> p2m_mem_access_check+0x551/0x64d
>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>]
>> hvm_hap_nested_page_fault+0x2f2/0x631
>> (XEN) [ 3669.348051]    [<ffff82d080202c00>]
>> vmx_vmexit_handler+0x156c/0x1e45
>> (XEN) [ 3669.354899]    [<ffff82d08020820c>]
>> vmx_asm_vmexit_handler+0xec/0x250
>> (XEN) [ 3669.361832]
>> (XEN) [ 3669.363827] *** Dumping CPU3 guest state (d1v0): ***
>> (XEN) [ 3669.369285] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3669.375700] CPU:    3
>> (XEN) [ 3669.378471] RIP:    0033:[<00007ff91cd34d60>]
>> (XEN) [ 3669.383323] RFLAGS: 0000000000010247   CONTEXT: hvm guest (d1v0)
>> (XEN) [ 3669.389824] rax: 00007ff91cd34d60   rbx: 41c64e6da3bd2845
>> rcx: 41c64e6da3bd2845
>> (XEN) [ 3669.397799] rdx: 0000000000000077   rsi: 0000000000000001
>> rdi: 0000019285877150
>> (XEN) [ 3669.405768] rbp: 0000019285894438   rsp: 0000008d6aa7f608   r8:
>>  0000000000000000
>> (XEN) [ 3669.413743] r9:  00000192858a8eb0   r10: 00000fff239a69ac
>> r11: 0000104000000000
>> (XEN) [ 3669.421716] r12: 0000000000000000   r13: 00000192858a8eb0
>> r14: 0000019285c42a50
>> (XEN) [ 3669.429690] r15: 00007ff91cd34d60   cr0: 0000000080050031
>> cr4: 0000000000170678
>> (XEN) [ 3669.437662] cr3: 0000000113f00002   cr2: 0000000000000000
>> (XEN) [ 3669.443555] fsb: 0000000000000000   gsb: 0000008d6a7cf000
>> gss: 0000002562d20000
>> (XEN) [ 3669.451529] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
>> 002b   cs: 0033
>> (XEN) [ 3669.458980]
>> (XEN) [ 3669.463584] APIC error on CPU0: 40(00)
>>
>> Some scheduler magic appears to happen here where it is unclear why
>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>> keep digging.
> 
> There seems to be some kind of deadlock between
> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> Are you holding a lock while trying to put the other vcpus to sleep?

d->arch.rexec_lock, but I don't see how that would matter in this case.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:14       ` Razvan Cojocaru
@ 2018-11-22 10:58         ` Roger Pau Monné
  2018-11-22 12:48           ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:58 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> > On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>> +            if ( a == v )
> >>>> +                continue;
> >>>> +
> >>>> +            /* Pause, synced. */
> >>>> +            while ( !a->arch.in_host )
> >>> Why not use a->is_running as a way to know whether the vCPU is
> >>> running?
> >>>
> >>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>> anymore, thus not taking a vmexit and this function will lockup.
> >>>
> >>> I don't think you need the in_host boolean at all.
> >>>
> >>>> +                cpu_relax();
> >>> Is this really better than using vcpu_pause?
> >>>
> >>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>> it here likely means less wait time?
> >>
> >> The problem with plain vcpu_pause() is that we weren't able to use it,
> >> for the same reason (which remains unclear as of yet) that we couldn't
> >> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >> one that uses the same logic, but loops on a->is_running instead of
> >> !a->arch.in_host:

[...]

> >> Some scheduler magic appears to happen here where it is unclear why
> >> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >> keep digging.
> > 
> > There seems to be some kind of deadlock between
> > vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> > Are you holding a lock while trying to put the other vcpus to sleep?
> 
> d->arch.rexec_lock, but I don't see how that would matter in this case.

The trace from pCPU#0:

(XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
[...]
(XEN) [ 3668.275417] Xen call trace:
(XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
(XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
(XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
(XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
(XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
(XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d

This shows there's a hypercall executed from Dom0 that's trying to
pause the domain, thus pausing all of its vCPUs.

Then pCPU#3:

(XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
[...]
(XEN) [ 3669.322832] Xen call trace:
(XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
(XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
(XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
(XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250

d1v0 seems to be blocked in vmx_start_reexecute_instruction, and is
thus never getting paused, which triggers the watchdog on pCPU#0?

You should check which vCPU the trace from pCPU#0 is waiting on; if
that's the vCPU running on pCPU#3 (d1v0), you will have to check what's
taking so long in vmx_start_reexecute_instruction.
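
If it helps, a throwaway printk in vcpu_sleep_sync (purely a debugging
aid, not something to keep) would tell you that directly, e.g.:

    printk("%pv waiting for %pv (is_running=%d)\n",
           current, v, v->is_running);

placed just before the wait loop.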

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:58         ` Roger Pau Monné
@ 2018-11-22 12:48           ` Razvan Cojocaru
  2018-11-22 14:49             ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 12:48 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>>>> +            if ( a == v )
>>>>>> +                continue;
>>>>>> +
>>>>>> +            /* Pause, synced. */
>>>>>> +            while ( !a->arch.in_host )
>>>>> Why not use a->is_running as a way to know whether the vCPU is
>>>>> running?
>>>>>
>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>>>> anymore, thus not taking a vmexit and this function will lockup.
>>>>>
>>>>> I don't think you need the in_host boolean at all.
>>>>>
>>>>>> +                cpu_relax();
>>>>> Is this really better than using vcpu_pause?
>>>>>
>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>>>> it here likely means less wait time?
>>>>
>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
>>>> for the same reason (which remains unclear as of yet) that we couldn't
>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>>>> one that uses the same logic, but loops on a->is_running instead of
>>>> !a->arch.in_host:
> 
> [...]
> 
>>>> Some scheduler magic appears to happen here where it is unclear why
>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>>>> keep digging.
>>>
>>> There seems to be some kind of deadlock between
>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
>>> Are you holding a lock while trying to put the other vcpus to sleep?
>>
>> d->arch.rexec_lock, but I don't see how that would matter in this case.
> 
> The trace from pCPU#0:
> 
> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> [...]
> (XEN) [ 3668.275417] Xen call trace:
> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> 
> Shows there's an hypercall executed from Dom0 that's trying to pause
> the domain, thus pausing all the vCPUs.
> 
> Then pCPU#3:
> 
> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> [...]
> (XEN) [ 3669.322832] Xen call trace:
> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> 
> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> getting paused and triggering the watchdog on pCPU#0?
> 
> You should check on which vCPU is the trace from pCPU#0 waiting, if
> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> taking such a long time in vmx_start_reexecute_instruction.

Right, so this is what appears to be happening, if the output of my test
is to be trusted: https://pastebin.com/YEDqNuwh

1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
appears to be VCPU 1):

(XEN) [  195.427141] 0 pause_count 0
(XEN) [  195.427142] 2 pause_count 0
(XEN) [  195.427143] 3 pause_count 0
(XEN) [  195.427144] 4 pause_count 0
(XEN) [  195.427146] 5 pause_count 0
(XEN) [  195.427147] 6 pause_count 0
(XEN) [  195.427148] 7 pause_count 0

2. The hypercall happens, which calls domain_pause(), which I've
modified thus:

@@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
     atomic_inc(&d->pause_count);

     for_each_vcpu( d, v )
+    {
+        printk("domain_pause %d\n", v->vcpu_id);
         sleep_fn(v);
+    }

     arch_domain_pause(d);
 }

and which says:

(XEN) [  195.492064] domain_pause 0

3. At this point, according to addr2line,
vmx_start_reexecute_instruction() does "while ( a->is_running )
cpu_relax();" for all VCPUs but itself.

Now, d1v0, which, if I'm reading this correctly, is the VCPU that
domain_pause() is stuck waiting for, does:

(XEN) [  200.829874] Xen call trace:
(XEN) [  200.833166]    [<ffff82d0801278c6>]
queue_read_lock_slowpath+0x25/0x4d
(XEN) [  200.840186]    [<ffff82d08020c1f6>]
get_page_from_gfn_p2m+0x14e/0x3b0
(XEN) [  200.847121]    [<ffff82d080247213>]
hap_p2m_ga_to_gfn_4_levels+0x48/0x299
(XEN) [  200.854400]    [<ffff82d080247480>]
hap_gva_to_gfn_4_levels+0x1c/0x1e
(XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
(XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
(XEN) [  200.874329]    [<ffff82d0801d848d>]
hvm_fetch_from_guest_virt_nofault+0x14/0x16
(XEN) [  200.882130]    [<ffff82d0801d141a>]
emulate.c#_hvm_emulate_one+0x118/0x2bc
(XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
(XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
(XEN) [  200.901626]    [<ffff82d0801e09ba>]
handle_mmio_with_translation+0x41/0x43
(XEN) [  200.908994]    [<ffff82d0801ded1f>]
hvm_hap_nested_page_fault+0x133/0x631
(XEN) [  200.916271]    [<ffff82d080202c40>]
vmx_vmexit_handler+0x156c/0x1e45
(XEN) [  200.923117]    [<ffff82d08020824c>]
vmx_asm_vmexit_handler+0xec/0x250

I hope I'm not reading this wrong.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 12:48           ` Razvan Cojocaru
@ 2018-11-22 14:49             ` Roger Pau Monné
  2018-11-22 15:25               ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 14:49 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> >>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>>>> +            if ( a == v )
> >>>>>> +                continue;
> >>>>>> +
> >>>>>> +            /* Pause, synced. */
> >>>>>> +            while ( !a->arch.in_host )
> >>>>> Why not use a->is_running as a way to know whether the vCPU is
> >>>>> running?
> >>>>>
> >>>>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>>>> anymore, thus not taking a vmexit and this function will lockup.
> >>>>>
> >>>>> I don't think you need the in_host boolean at all.
> >>>>>
> >>>>>> +                cpu_relax();
> >>>>> Is this really better than using vcpu_pause?
> >>>>>
> >>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>>>> it here likely means less wait time?
> >>>>
> >>>> The problem with plain vcpu_pause() is that we weren't able to use it,
> >>>> for the same reason (which remains unclear as of yet) that we couldn't
> >>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >>>> one that uses the same logic, but loops on a->is_running instead of
> >>>> !a->arch.in_host:
> > 
> > [...]
> > 
> >>>> Some scheduler magic appears to happen here where it is unclear why
> >>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >>>> keep digging.
> >>>
> >>> There seems to be some kind of deadlock between
> >>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> >>> Are you holding a lock while trying to put the other vcpus to sleep?
> >>
> >> d->arch.rexec_lock, but I don't see how that would matter in this case.
> > 
> > The trace from pCPU#0:
> > 
> > (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> > [...]
> > (XEN) [ 3668.275417] Xen call trace:
> > (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> > (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> > (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> > (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> > (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> > (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> > (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> > 
> > Shows there's an hypercall executed from Dom0 that's trying to pause
> > the domain, thus pausing all the vCPUs.
> > 
> > Then pCPU#3:
> > 
> > (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> > [...]
> > (XEN) [ 3669.322832] Xen call trace:
> > (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> > (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> > (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> > (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> > (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> > 
> > Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> > getting paused and triggering the watchdog on pCPU#0?
> > 
> > You should check on which vCPU is the trace from pCPU#0 waiting, if
> > that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> > taking such a long time in vmx_start_reexecute_instruction.
> 
> Right, so this is what appears to be happening, if the output of my test
> is to be trusted: https://pastebin.com/YEDqNuwh
> 
> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
> appears to be VCPU 1):
> 
> (XEN) [  195.427141] 0 pause_count 0
> (XEN) [  195.427142] 2 pause_count 0
> (XEN) [  195.427143] 3 pause_count 0
> (XEN) [  195.427144] 4 pause_count 0
> (XEN) [  195.427146] 5 pause_count 0
> (XEN) [  195.427147] 6 pause_count 0
> (XEN) [  195.427148] 7 pause_count 0

The diff below doesn't show where you add this message, nor what it's
actually printing. I guess the first number is the vCPU ID, and the
second is the value of pause_count at some point?

> 
> 2. The hypercall happens, which calls domain_pause(), which I've
> modified thus:
> 
> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
>      atomic_inc(&d->pause_count);
> 
>      for_each_vcpu( d, v )
> +    {
> +        printk("domain_pause %d\n", v->vcpu_id);

Could you print both the domain and the vcpu ids?

>          sleep_fn(v);
> +    }
> 
>      arch_domain_pause(d);
>  }
> 
> and which says:
> 
> (XEN) [  195.492064] domain_pause 0

This is the hypercall code waiting for domain 1 vCPU 0 to pause?

> 
> 3. At this point, according to addr2line,
> vmx_start_reexecute_instruction() does "while ( a->is_running )
> cpu_relax();" for all VCPUs but itself.

Why don't you just start by using:

for_each_vcpu( d, v )
    if ( v != current )
        vcpu_pause(v);

Instead of open-coding it in vmx_start_reexecute_instruction.
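
(Together, I assume, with the matching unpause once the re-execution
has finished:

for_each_vcpu( d, v )
    if ( v != current )
        vcpu_unpause(v);

so the pause counts stay balanced.)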

> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
> domain_pause() is stuck waiting for, does:
> 
> (XEN) [  200.829874] Xen call trace:
> (XEN) [  200.833166]    [<ffff82d0801278c6>]
> queue_read_lock_slowpath+0x25/0x4d
> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
> get_page_from_gfn_p2m+0x14e/0x3b0
> (XEN) [  200.847121]    [<ffff82d080247213>]
> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
> (XEN) [  200.854400]    [<ffff82d080247480>]
> hap_gva_to_gfn_4_levels+0x1c/0x1e
> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
> (XEN) [  200.874329]    [<ffff82d0801d848d>]
> hvm_fetch_from_guest_virt_nofault+0x14/0x16
> (XEN) [  200.882130]    [<ffff82d0801d141a>]
> emulate.c#_hvm_emulate_one+0x118/0x2bc
> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
> handle_mmio_with_translation+0x41/0x43
> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
> hvm_hap_nested_page_fault+0x133/0x631
> (XEN) [  200.916271]    [<ffff82d080202c40>]
> vmx_vmexit_handler+0x156c/0x1e45
> (XEN) [  200.923117]    [<ffff82d08020824c>]
> vmx_asm_vmexit_handler+0xec/0x250

What lock is it waiting on? Is this the paging lock? If so, you will
have to figure out who is holding this lock.

Is this on top of plain staging, or do you have other changes applied
to Xen?

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 14:49             ` Roger Pau Monné
@ 2018-11-22 15:25               ` Razvan Cojocaru
  2018-11-22 15:37                 ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 15:25 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 4:49 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
>>> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
>>>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
>>>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>>>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>>>>>> +            if ( a == v )
>>>>>>>> +                continue;
>>>>>>>> +
>>>>>>>> +            /* Pause, synced. */
>>>>>>>> +            while ( !a->arch.in_host )
>>>>>>> Why not use a->is_running as a way to know whether the vCPU is
>>>>>>> running?
>>>>>>>
>>>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>>>>>> anymore, thus not taking a vmexit and this function will lockup.
>>>>>>>
>>>>>>> I don't think you need the in_host boolean at all.
>>>>>>>
>>>>>>>> +                cpu_relax();
>>>>>>> Is this really better than using vcpu_pause?
>>>>>>>
>>>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>>>>>> it here likely means less wait time?
>>>>>>
>>>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
>>>>>> for the same reason (which remains unclear as of yet) that we couldn't
>>>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>>>>>> one that uses the same logic, but loops on a->is_running instead of
>>>>>> !a->arch.in_host:
>>>
>>> [...]
>>>
>>>>>> Some scheduler magic appears to happen here where it is unclear why
>>>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>>>>>> keep digging.
>>>>>
>>>>> There seems to be some kind of deadlock between
>>>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
>>>>> Are you holding a lock while trying to put the other vcpus to sleep?
>>>>
>>>> d->arch.rexec_lock, but I don't see how that would matter in this case.
>>>
>>> The trace from pCPU#0:
>>>
>>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
>>> [...]
>>> (XEN) [ 3668.275417] Xen call trace:
>>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
>>> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
>>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
>>> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
>>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
>>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
>>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
>>>
>>> Shows there's an hypercall executed from Dom0 that's trying to pause
>>> the domain, thus pausing all the vCPUs.
>>>
>>> Then pCPU#3:
>>>
>>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
>>> [...]
>>> (XEN) [ 3669.322832] Xen call trace:
>>> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
>>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
>>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
>>> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
>>> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
>>>
>>> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
>>> getting paused and triggering the watchdog on pCPU#0?
>>>
>>> You should check on which vCPU is the trace from pCPU#0 waiting, if
>>> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
>>> taking such a long time in vmx_start_reexecute_instruction.
>>
>> Right, so this is what appears to be happening, if the output of my test
>> is to be trusted: https://pastebin.com/YEDqNuwh
>>
>> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
>> appears to be VCPU 1):
>>
>> (XEN) [  195.427141] 0 pause_count 0
>> (XEN) [  195.427142] 2 pause_count 0
>> (XEN) [  195.427143] 3 pause_count 0
>> (XEN) [  195.427144] 4 pause_count 0
>> (XEN) [  195.427146] 5 pause_count 0
>> (XEN) [  195.427147] 6 pause_count 0
>> (XEN) [  195.427148] 7 pause_count 0
> 
> The diff below doesn't show where you add this message, neither
> what's actually printing. I guess the first number is the vCPU ID, and
> the second the value of pause_count at some point?

Yes, exactly. So the above tells us that VCPUs 0 and 2-7 have been
paused (nosync) by vmx_start_reexecute_instruction(), which is now doing
a while ( a->is_running ) cpu_relax().

>> 2. The hypercall happens, which calls domain_pause(), which I've
>> modified thus:
>>
>> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
>>      atomic_inc(&d->pause_count);
>>
>>      for_each_vcpu( d, v )
>> +    {
>> +        printk("domain_pause %d\n", v->vcpu_id);
> 
> Could you print both the domain and the vcpu ids?

Of course, but I think I've found the issue (please see below).

>>          sleep_fn(v);
>> +    }
>>
>>      arch_domain_pause(d);
>>  }
>>
>> and which says:
>>
>> (XEN) [  195.492064] domain_pause 0
> 
> This is the hypercall code waiting for domain 1 vCPU 0 to pause?

Yes.

>> 3. At this point, according to addr2line,
>> vmx_start_reexecute_instruction() does "while ( a->is_running )
>> cpu_relax();" for all VCPUs but itself.
> 
> Why don't you just start by using:
> 
> for_each_vcpu( d, v )
>     if ( v != current )
>         vcpu_pause(v);
> 
> Instead of open-coding it in vmx_start_reexecute_instruction.

That's the intention if we can get it to work.

>> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
>> domain_pause() is stuck waiting for, does:
>>
>> (XEN) [  200.829874] Xen call trace:
>> (XEN) [  200.833166]    [<ffff82d0801278c6>]
>> queue_read_lock_slowpath+0x25/0x4d
>> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
>> get_page_from_gfn_p2m+0x14e/0x3b0
>> (XEN) [  200.847121]    [<ffff82d080247213>]
>> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
>> (XEN) [  200.854400]    [<ffff82d080247480>]
>> hap_gva_to_gfn_4_levels+0x1c/0x1e
>> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
>> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
>> (XEN) [  200.874329]    [<ffff82d0801d848d>]
>> hvm_fetch_from_guest_virt_nofault+0x14/0x16
>> (XEN) [  200.882130]    [<ffff82d0801d141a>]
>> emulate.c#_hvm_emulate_one+0x118/0x2bc
>> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
>> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
>> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
>> handle_mmio_with_translation+0x41/0x43
>> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
>> hvm_hap_nested_page_fault+0x133/0x631
>> (XEN) [  200.916271]    [<ffff82d080202c40>]
>> vmx_vmexit_handler+0x156c/0x1e45
>> (XEN) [  200.923117]    [<ffff82d08020824c>]
>> vmx_asm_vmexit_handler+0xec/0x250
> 
> What lock is it waiting on? Is this the paging lock? If so you will
> have to figure out who is holding this lock.

It turns out that it's the p2m lock. I've looked at the code more
closely, and hvm_hap_nested_page_fault() takes a p2m lock:

1923     /*
1924      * Take a lock on the host p2m speculatively, to avoid potential
1925      * locking order problems later and to handle unshare etc.
1926      */
1927     hostp2m = p2m_get_hostp2m(currd);

then ends up calling p2m_mem_access_check() with said lock taken.

Then p2m_mem_access_check() also takes gfn_lock(p2m, gfn, 0) in a few
places (although it releases those again by the time
vmx_start_reexecute_instruction() gets called).

And then, of course, vmx_start_reexecute_instruction() runs and gets
stuck in that loop (it doesn't matter whether we manually look at
is_running or call vcpu_pause(), the effect is the same), while the
other VCPU ends up running get_page_from_gfn_p2m(), which also tries
to lock the p2m and deadlocks.
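
So, as far as I can tell, the interleaving is roughly:

  d1v1: hvm_hap_nested_page_fault() takes the host p2m lock
  d1v1: p2m_mem_access_check() -> vmx_start_reexecute_instruction()
  d1v1: spins, waiting for d1v0 to stop running
  d1v0: hvm_hap_nested_page_fault() -> handle_mmio() -> __hvm_copy()
  d1v0: get_page_from_gfn_p2m() spins, waiting for the p2m lock

i.e. neither side can make progress.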

I've placed printk()s before and after p2m_read_lock(p2m); in the if (
likely(!p2m_locked_by_me(p2m)) ) conditional in get_page_from_gfn_p2m(),
and it seems to confirm this theory - the last thing that gets printed
before the crash is the line before p2m_read_lock(p2m).

This seems to imply that is_running gets cleared much later than
in_host gets set, so with the in_host code
vmx_start_reexecute_instruction() (and thus p2m_mem_access_check() and
hvm_hap_nested_page_fault()) is able to exit in a timely manner,
allowing get_page_from_gfn_p2m() to take the p2m lock afterwards, and
the show goes on.

> Is this on top of plain staging, or do you have other changes applied
> to Xen?

My tests are done on an older, XenServer-based Xen 4.7.5. I also have
a patch applied, kindly provided by Andrew, that dumps all the CPUs on
crash (otherwise I'd only see one of them, which is not very helpful
in situations like these).


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 15:25               ` Razvan Cojocaru
@ 2018-11-22 15:37                 ` Roger Pau Monné
  2018-11-22 16:52                   ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 15:37 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 05:25:02PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 4:49 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> >>> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> >>>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> >>>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >>>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>>>>>> +            if ( a == v )
> >>>>>>>> +                continue;
> >>>>>>>> +
> >>>>>>>> +            /* Pause, synced. */
> >>>>>>>> +            while ( !a->arch.in_host )
> >>>>>>> Why not use a->is_running as a way to know whether the vCPU is
> >>>>>>> running?
> >>>>>>>
> >>>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>>>>>> anymore, thus not taking a vmexit and this function will lockup.
> >>>>>>>
> >>>>>>> I don't think you need the in_host boolean at all.
> >>>>>>>
> >>>>>>>> +                cpu_relax();
> >>>>>>> Is this really better than using vcpu_pause?
> >>>>>>>
> >>>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>>>>>> it here likely means less wait time?
> >>>>>>
> >>>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
> >>>>>> for the same reason (which remains unclear as of yet) that we couldn't
> >>>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >>>>>> one that uses the same logic, but loops on a->is_running instead of
> >>>>>> !a->arch.in_host:
> >>>
> >>> [...]
> >>>
> >>>>>> Some scheduler magic appears to happen here where it is unclear why
> >>>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >>>>>> keep digging.
> >>>>>
> >>>>> There seems to be some kind of deadlock between
> >>>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> >>>>> Are you holding a lock while trying to put the other vcpus to sleep?
> >>>>
> >>>> d->arch.rexec_lock, but I don't see how that would matter in this case.
> >>>
> >>> The trace from pCPU#0:
> >>>
> >>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> >>> [...]
> >>> (XEN) [ 3668.275417] Xen call trace:
> >>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> >>> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> >>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> >>> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> >>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> >>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> >>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> >>>
> >>> Shows there's an hypercall executed from Dom0 that's trying to pause
> >>> the domain, thus pausing all the vCPUs.
> >>>
> >>> Then pCPU#3:
> >>>
> >>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> >>> [...]
> >>> (XEN) [ 3669.322832] Xen call trace:
> >>> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> >>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> >>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> >>> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> >>> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> >>>
> >>> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> >>> getting paused and triggering the watchdog on pCPU#0?
> >>>
> >>> You should check on which vCPU is the trace from pCPU#0 waiting, if
> >>> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> >>> taking such a long time in vmx_start_reexecute_instruction.
> >>
> >> Right, so this is what appears to be happening, if the output of my test
> >> is to be trusted: https://pastebin.com/YEDqNuwh
> >>
> >> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
> >> appears to be VCPU 1):
> >>
> >> (XEN) [  195.427141] 0 pause_count 0
> >> (XEN) [  195.427142] 2 pause_count 0
> >> (XEN) [  195.427143] 3 pause_count 0
> >> (XEN) [  195.427144] 4 pause_count 0
> >> (XEN) [  195.427146] 5 pause_count 0
> >> (XEN) [  195.427147] 6 pause_count 0
> >> (XEN) [  195.427148] 7 pause_count 0
> > 
> > The diff below doesn't show where you add this message, neither
> > what's actually printing. I guess the first number is the vCPU ID, and
> > the second the value of pause_count at some point?
> 
> Yes, exactly. So the above tells us that VCPUs 0 and 2-7 have been
> paused (nosync) by vmx_start_reexecute_instruction(), which is now doing
> a while ( a->is_running ) cpu_relax().
> 
> >> 2. The hypercall happens, which calls domain_pause(), which I've
> >> modified thus:
> >>
> >> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
> >>      atomic_inc(&d->pause_count);
> >>
> >>      for_each_vcpu( d, v )
> >> +    {
> >> +        printk("domain_pause %d\n", v->vcpu_id);
> > 
> > Could you print both the domain and the vcpu ids?
> 
> Of course, but I think I've found the issue (please see below).
> 
> >>          sleep_fn(v);
> >> +    }
> >>
> >>      arch_domain_pause(d);
> >>  }
> >>
> >> and which says:
> >>
> >> (XEN) [  195.492064] domain_pause 0
> > 
> > This is the hypercall code waiting for domain 1 vCPU 0 to pause?
> 
> Yes.
> 
> >> 3. At this point, according to addr2line,
> >> vmx_start_reexecute_instruction() does "while ( a->is_running )
> >> cpu_relax();" for all VCPUs but itself.
> > 
> > Why don't you just start by using:
> > 
> > for_each_vcpu( d, v )
> >     if ( v != current )
> >         vcpu_pause(v);
> > 
> > Instead of open-coding it in vmx_start_reexecute_instruction.
> 
> That's the intention if we can get it to work.
> 
> >> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
> >> domain_pause() is stuck waiting for, does:
> >>
> >> (XEN) [  200.829874] Xen call trace:
> >> (XEN) [  200.833166]    [<ffff82d0801278c6>]
> >> queue_read_lock_slowpath+0x25/0x4d
> >> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
> >> get_page_from_gfn_p2m+0x14e/0x3b0
> >> (XEN) [  200.847121]    [<ffff82d080247213>]
> >> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
> >> (XEN) [  200.854400]    [<ffff82d080247480>]
> >> hap_gva_to_gfn_4_levels+0x1c/0x1e
> >> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
> >> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
> >> (XEN) [  200.874329]    [<ffff82d0801d848d>]
> >> hvm_fetch_from_guest_virt_nofault+0x14/0x16
> >> (XEN) [  200.882130]    [<ffff82d0801d141a>]
> >> emulate.c#_hvm_emulate_one+0x118/0x2bc
> >> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
> >> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
> >> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
> >> handle_mmio_with_translation+0x41/0x43
> >> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
> >> hvm_hap_nested_page_fault+0x133/0x631
> >> (XEN) [  200.916271]    [<ffff82d080202c40>]
> >> vmx_vmexit_handler+0x156c/0x1e45
> >> (XEN) [  200.923117]    [<ffff82d08020824c>]
> >> vmx_asm_vmexit_handler+0xec/0x250
> > 
> > What lock is it waiting on? Is this the paging lock? If so you will
> > have to figure out who is holding this lock.
> 
> It turns out that it's the p2m lock. I've looked at the code more
> closely, and hvm_hap_nested_page_fault() takes a p2m lock:
> 
> 1923     /*
> 1924      * Take a lock on the host p2m speculatively, to avoid potential
> 1925      * locking order problems later and to handle unshare etc.
> 1926      */
> 1927     hostp2m = p2m_get_hostp2m(currd);
> 
> then ends up calling p2m_mem_access_check() with said lock taken.
> 
> Then p2m_mem_access_check() also does a bit of gfn_lock(p2m, gfn, 0)
> (although it also unlocks those by the time
> vmx_start_reexecute_instruction() gets called).
> 
> And then, of course, vmx_start_reexecute() runs, gets stuck in that loop
> (it doesn't matter if we manually look at is_running or call
> vcpu_pause(), the effect is the same), while the other VCPU ends up
> running get_page_from_gfn_p2m(), which tries to also lock the p2m and
> deadlocks.

I don't think you are supposed to try to pause other vcpus while
holding a lock, as you can see it's quite likely that you will end up
deadlocking because the vCPU you are trying to pause is stuck waiting
on the lock that you are holding.

You should figure out whether you can get into vmx_start_reexecute
without holding any locks, or alternatively drop the lock, pause the
vCPUs and pick the lock again.

See for example how hap_track_dirty_vram releases the lock before
attempting to pause the domain for this same reason.
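
Very roughly, the ordering I mean is something like the following (just
an untested sketch; I'm reusing the gfn_lock/gfn_unlock helpers you
mentioned, your actual call site may hold a different lock):

    /* Drop the lock currently held before touching any other vCPU. */
    gfn_unlock(p2m, gfn, 0);

    /* Pause the other vCPUs without holding any lock. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_pause(v);

    /* Only now re-take the lock and do the p2m/EPT work. */
    gfn_lock(p2m, gfn, 0);
    /* ... adjust access rights, arm MTF, etc. ... */
    gfn_unlock(p2m, gfn, 0);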

Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 15:37                 ` Roger Pau Monné
@ 2018-11-22 16:52                   ` Razvan Cojocaru
  2018-11-22 17:08                     ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 16:52 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> I don't think you are supposed to try to pause other vcpus while
> holding a lock, as you can see it's quite likely that you will end up
> deadlocking because the vCPU you are trying to pause is stuck waiting
> on the lock that you are holding.
> 
> You should figure out whether you can get into vmx_start_reexecute
> without holding any locks, or alternatively drop the lock, pause the
> vCPUs and pick the lock again.
> 
> See for example how hap_track_dirty_vram releases the lock before
> attempting to pause the domain for this same reason.

Right, this will take more thinking.

I've unlocked the p2m for testing and the initial hang is gone, however
the same problem now applies to rexec_lock: nothing prevents two or more
VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
at which point one of them might take the lock and try to pause the
other, while the other is waiting to take the lock, with predictable
results.

On the other hand, releasing rexec_lock as well will allow two VCPUs to
end up trying to pause each other (especially unpleasant in a 2 VCPU
guest). At any given moment, there should be only one VCPU alive and
trying to reexecute an instruction - and at least one VCPU alive on the
guest.

We'll get more coffee, and of course suggestions are appreciated (as has
been all your help).


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 16:52                   ` Razvan Cojocaru
@ 2018-11-22 17:08                     ` Roger Pau Monné
  2018-11-22 18:24                       ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 17:08 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> > I don't think you are supposed to try to pause other vcpus while
> > holding a lock, as you can see it's quite likely that you will end up
> > deadlocking because the vCPU you are trying to pause is stuck waiting
> > on the lock that you are holding.
> > 
> > You should figure out whether you can get into vmx_start_reexecute
> > without holding any locks, or alternatively drop the lock, pause the
> > vCPUs and pick the lock again.
> > 
> > See for example how hap_track_dirty_vram releases the lock before
> > attempting to pause the domain for this same reason.
> 
> Right, this will take more thinking.
> 
> I've unlocked the p2m for testing and the initial hang is gone, however
> the same problem now applies to rexec_lock: nothing prevents two or more
> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
> at which point one of them might take the lock and try to pause the
> other, while the other is waiting to take the lock, with predictable
> results.
> 
> On the other hand, releasing rexec_lock as well will allow two VCPUs to
> end up trying to pause each other (especially unpleasant in a 2 VCPU
> guest). At any given moment, there should be only one VCPU alive and
> trying to reexecute an instruction - and at least one VCPU alive on the
> guest.
> 
> We'll get more coffee, and of course suggestions are appreciated (as has
> been all your help).

Hm, I don't think it's generally safe to try to pause domain vCPUs
from the same domain context, as you say it's likely to deadlock since
two vCPUs from the same domain might try to pause one another.

My knowledge of all this introspection logic is very vague, do you
really need to stop the other vCPUs while performing this reexecution?

What are you trying to prevent by pausing other vCPUs?

Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 17:08                     ` Roger Pau Monné
@ 2018-11-22 18:24                       ` Razvan Cojocaru
  2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
  0 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 18:24 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 7:08 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
>>> I don't think you are supposed to try to pause other vcpus while
>>> holding a lock, as you can see it's quite likely that you will end up
>>> deadlocking because the vCPU you are trying to pause is stuck waiting
>>> on the lock that you are holding.
>>>
>>> You should figure out whether you can get into vmx_start_reexecute
>>> without holding any locks, or alternatively drop the lock, pause the
>>> vCPUs and pick the lock again.
>>>
>>> See for example how hap_track_dirty_vram releases the lock before
>>> attempting to pause the domain for this same reason.
>>
>> Right, this will take more thinking.
>>
>> I've unlocked the p2m for testing and the initial hang is gone, however
>> the same problem now applies to rexec_lock: nothing prevents two or more
>> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
>> at which point one of them might take the lock and try to pause the
>> other, while the other is waiting to take the lock, with predictable
>> results.
>>
>> On the other hand, releasing rexec_lock as well will allow two VCPUs to
>> end up trying to pause each other (especially unpleasant in a 2 VCPU
>> guest). At any given moment, there should be only one VCPU alive and
>> trying to reexecute an instruction - and at least one VCPU alive on the
>> guest.
>>
>> We'll get more coffee, and of course suggestions are appreciated (as has
>> been all your help).
> 
> Hm, I don't think it's generally safe to try to pause domain vCPUs
> from the same domain context, as you say it's likely to deadlock since
> two vCPUs from the same domain might try to pause one another.
> 
> My knowledge of all this introspection logic is very vague, do you
> really need to stop the other vCPUs while performing this reexecution?
> 
> What are you trying to prevent by pausing other vCPUs?

Yes, that's unfortunately very necessary.

The scenario is this: for introspection purposes, a bunch of pages are
marked read-only in the EPT (or no-execute, but for the purposes of this
example let's stick to read-only).

Now, we'll get vm_events whenever an instruction will try to write into
one of those. Vm_events are expensive, so we _really_ want to get as few
of those as possible while still keeping the guest protected. So we want
to filter out irrelevant ones.

The main category of irrelevant ones are faults caused by walking the
guest's page table. We only want events caused by an actual write into a
protected page by an actual instruction running at RIP in the guest.

So, we don't want to get those vm_events where npfec.kind !=
npfec_kind_with_gla in p2m_mem_access_check(), hence this patch:

https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=c5387c4d75602dbb2f0d3d961a5c4b8faf3873db

_However_, please picture an instruction that both writes into a page P1
we're interested in, _and_ causes a write into a read-only page-walk
related page P2. Emulating the current instruction, as the upstream
patch does, does eliminate the vm_event caused by writing into P2, but
with the unfortunate side-effect of losing a potentially critical event
for the write into P1.

What this patch attempts to do is to mark P1 rwx (so allow the write),
then put the faulting VCPU into singlestep mode, then restore the
restrictions after it has finished single stepping. By now it's obvious
why all the other VCPUs need to be paused: one of them might do a
malicious write into P1 that silently succeeds (since the EPT is shared
among all VCPUs - putting altp2m aside for a moment). We don't want that.
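
In heavily simplified pseudo-C, the intended sequence is (sketch only -
set_page_access(), gfn_of_P1 and old_access below are placeholders, not
the actual helpers the patch uses):

    /* 1. Pause every other vCPU so nobody can write P1 in the meantime. */
    for_each_vcpu ( d, a )
        if ( a != current )
            vcpu_pause_nosync(a);

    /* 2. Temporarily grant rwx on P1 (placeholder helper). */
    set_page_access(d, gfn_of_P1, XENMEM_access_rwx);

    /* 3. Put the faulting vCPU in single-step mode (monitor trap flag)
     *    and let it retire the instruction. */

    /* 4. On the MTF vmexit, restore the old restrictions on P1 ... */
    set_page_access(d, gfn_of_P1, old_access);

    /* 5. ... and unpause the other vCPUs. */
    for_each_vcpu ( d, a )
        if ( a != current )
            vcpu_unpause(a);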

Alternatively, we'd be happy with simply being able to set the relevant
A/D bits in the pages touched by the page walk, but after lengthy
negotiations that can be found in the xen-devel archives we were unable
to find a safe, architecturally correct way of doing that.

I hope this sheds some light on it.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 18:24                       ` Razvan Cojocaru
@ 2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
  2018-11-27 10:31                           ` Razvan Cojocaru
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
  1 sibling, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-23  8:54 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On Thu, Nov 22, 2018 at 08:24:52PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 7:08 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> >>> I don't think you are supposed to try to pause other vcpus while
> >>> holding a lock, as you can see it's quite likely that you will end up
> >>> deadlocking because the vCPU you are trying to pause is stuck waiting
> >>> on the lock that you are holding.
> >>>
> >>> You should figure out whether you can get into vmx_start_reexecute
> >>> without holding any locks, or alternatively drop the lock, pause the
> >>> vCPUs and pick the lock again.
> >>>
> >>> See for example how hap_track_dirty_vram releases the lock before
> >>> attempting to pause the domain for this same reason.
> >>
> >> Right, this will take more thinking.
> >>
> >> I've unlocked the p2m for testing and the initial hang is gone, however
> >> the same problem now applies to rexec_lock: nothing prevents two or more
> >> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
> >> at which point one of them might take the lock and try to pause the
> >> other, while the other is waiting to take the lock, with predictable
> >> results.
> >>
> >> On the other hand, releasing rexec_lock as well will allow two VCPUs to
> >> end up trying to pause each other (especially unpleasant in a 2 VCPU
> >> guest). At any given moment, there should be only one VCPU alive and
> >> trying to reexecute an instruction - and at least one VCPU alive on the
> >> guest.
> >>
> >> We'll get more coffee, and of course suggestions are appreciated (as has
> >> been all your help).
> > 
> > Hm, I don't think it's generally safe to try to pause domain vCPUs
> > from the same domain context, as you say it's likely to deadlock since
> > two vCPUs from the same domain might try to pause one another.
> > 
> > My knowledge of all this introspection logic is very vague, do you
> > really need to stop the other vCPUs while performing this reexecution?
> > 
> > What are you trying to prevent by pausing other vCPUs?
> 
> Yes, that's unfortunately very necessary.
> 
> The scenario is this: for introspection purposes, a bunch of pages are
> marked read-only in the EPT (or no-execute, but for the purposes of this
> example let's stick to read-only).
> 
> Now, we'll get vm_events whenever an instruction will try to write into
> one of those. Vm_events are expensive, so we _really_ want to get as few
> of those as possible while still keeping the guest protected. So we want
> to filter out irrelevant ones.
> 
> The main category of irrelevant ones are faults caused by walking the
> guest's page table. We only want events caused by an actual write into a
> protected page by an actual instruction running at RIP in the guest.
> 
> So, we don't want to get those vm_events where npfec.kind !=
> npfec_kind_with_gla in p2m_mem_access_check(), hence this patch:
> 
> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=c5387c4d75602dbb2f0d3d961a5c4b8faf3873db
> 
> _However_, please picture an instruction that both writes into a page P1
> we're interested in, _and_ causes a write into a read-only page-walk
> related page P2. Emulating the current instruction, as the upstream
> patch does, does eliminate the vm_event caused by writing into P2, but
> with the unfortunate side-effect of losing a potentially critical event
> for the write into P1.

How could the event for P1 be lost? If the instruction writes to both
P1 and P2, you already got some kind of event since writing to P1
would trigger a fault. Then you can just discard the P2 part, forward
the P1 access and just emulate the instruction?

(I guess I'm missing something on the above)

> What this patch attempts to do is to mark P1 rwx (so allow the write),
> then put the faulting VCPU into singlestep mode, then restore the
> restrictions after it has finished single stepping. By now it's obvious
> why all the other VCPUs need to be paused: one of them might do a
> malicious write into P1 that silently succeeds (since the EPT is shared
> among all VCPUs - putting altp2m aside for a moment). We don't want that.

Can't you just change the p2m of a single vCPU? Either using altp2m or
some other mechanism.

Also keep in mind that this pause approach might work for guests with
a relatively small number of vCPUs, but I'm unsure it is going to
work for guests with a high number of vCPUs: pausing all vCPUs for each
trapped instruction is likely going to stall the guest.

Thanks, Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
@ 2018-11-23  9:07                           ` Jan Beulich
  2018-11-27 10:49                             ` Razvan Cojocaru
       [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
  1 sibling, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2018-11-23  9:07 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
> _However_, please picture an instruction that both writes into a page P1
> we're interested in, _and_ causes a write into a read-only page-walk
> related page P2. Emulating the current instruction, as the upstream
> patch does, does eliminate the vm_event caused by writing into P2, but
> with the unfortunate side-effect of losing a potentially critical event
> for the write into P1.
> 
> What this patch attempts to do is to mark P1 rwx (so allow the write),
> then put the faulting VCPU into singlestep mode, then restore the
> restrictions after it has finished single stepping. By now it's obvious
> why all the other VCPUs need to be paused: one of them might do a
> malicious write into P1 that silently succeeds (since the EPT is shared
> among all VCPUs - putting altp2m aside for a moment). We don't want that.

I think this all goes into the fundamentally wrong direction. If lost
events during emulation are your issue, then let's make sure
emulation paths trigger the same events hardware would.

With a sufficiently complete insn emulator, single-stepping should
not be needed at all imo. Granted we're not quite there yet with
the emulator, but we've made quite a bit of progress. As before,
if there are particular instructions you know of that the emulator
doesn't handle yet, please keep pointing these out. Last I know
were some AVX move instructions, which have long been
implemented.

> Alternatively, we'd be happy with simply being able to set the relevant
> A/D bits in the pages touched by the page walk, but after lengthy
> negotiations that can be found in the xen-devel archives we were unable
> to find a safe, architecturally correct way of doing that.

Hmm, I don't recall that we had settled that this would be entirely
impossible, but then again - as per above - this as well was only
curing symptoms rather than the cause.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
@ 2018-11-23  9:10                             ` Jan Beulich
       [not found]                             ` <9B05ED9E020000C434861ACF@prv1-mh.provo.novell.com>
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-23  9:10 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, aisaila, xen-devel

>>> On 23.11.18 at 09:54, <roger.pau@citrix.com> wrote:
> On Thu, Nov 22, 2018 at 08:24:52PM +0200, Razvan Cojocaru wrote:
>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> Can't you just change the p2m of a single vCPU? Either using altp2m or
> some other mechanism.

I guess as a very basic limitation there are not enough distinct
altp2m-s available to use one per vCPU.

> Also keep in mind that this pause approach might work for guests with
> a relatively small number of vCPUs, but I'm unsure it is going to
> work for guests with a high number of vCPUs: pausing all vCPUs for each
> trapped instruction is likely going to stall the guest.

Indeed. Yet for smaller guests a per-vCPU-altp2m approach would
seem to be feasible at least from an abstract pov.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
@ 2018-11-27 10:31                           ` Razvan Cojocaru
  2018-11-27 11:32                             ` Roger Pau Monné
  1 sibling, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 10:31 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

>> _However_, please picture an instruction that both writes into a page P1
>> we're interested in, _and_ causes a write into a read-only page-walk
>> related page P2. Emulating the current instruction, as the upstream
>> patch does, does eliminate the vm_event caused by writing into P2, but
>> with the unfortunate side-effect of losing a potentially critical event
>> for the write into P1.
> 
> How could the event for P1 be lost? If the instruction writes to both
> P1 and P2, you already got some kind of event since writing to P1
> would trigger a fault. Then you can just discard the P2 part, forward
> the P1 access and just emulate the instruction?

Sorry for the late reply, I'm not in the office and have spotty access
to a real computer.

The instruction will write to P1, and running it will trigger a page
walk that writes into P2 (where both P1 and P2 are write-protected).

The Xen emulator currently _completely_ ignores EPT restrictions, which
is both the reason why we're able to use it for introspection purposes
(so we can run instructions that write to protected pages that we've
deemed to be safe, without lifting said restrictions), and the problem
in this case.

So emulating the instruction we're talking about will silently write
both P1 and P2, even though we'd like the write to P2 (the page walk
part) to succeed, but still have the vm_event for P1.

>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> Can't you just change the p2m of a single vCPU? Either using altp2m or
> some other mechanism.

As Jan has pointed out, we'd need too many altp2ms (there's currently a
hardcoded limit of 10 in Xen). But even more importantly, perhaps, is
that altp2m is not usable at all at the moment (at least until the
series I've been working on with George's kind help goes in) - because
the guests' displays freeze when switching to a new altp2m early on
boot, or after a screen resize.

Also, not all Intel hardware supports altp2m, and while Xen does emulate
altp2m support for hardware that does not, it's not ideal to use that
performance-wise.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-23  9:07                           ` Jan Beulich
@ 2018-11-27 10:49                             ` Razvan Cojocaru
  2018-11-27 11:28                               ` Jan Beulich
  2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
  0 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 10:49 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

On 11/23/18 11:07 AM, Jan Beulich wrote:
>>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
>> _However_, please picture an instruction that both writes into a page P1
>> we're interested in, _and_ causes a write into a read-only page-walk
>> related page P2. Emulating the current instruction, as the upstream
>> patch does, does eliminate the vm_event caused by writing into P2, but
>> with the unfortunate side-effect of losing a potentially critical event
>> for the write into P1.
>>
>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> I think this all goes into the fundamentally wrong direction. If lost
> events during emulation are your issue, then let's make sure
> emulation paths trigger the same events hardware would.

It's complicated: we very much like that the emulator is ignoring page
restrictions - this allows us to proceed with instructions writing into
protected pages without lifting said restrictions (when those
instructions are deemed to be safe by the introspection engine). That is
the most efficient mechanism we have, since we can just reply "emulate"
to a vm_event and that's it.

The alternative is to use altp2m, have an unrestricted view (view 0 fits
the bill nicely since restrictions on it propagate to all other active
altp2ms), get an EPT fault vm_event, reply with "switch to view 0 and
put VCPU in single-step mode", then wait for the single step event, and
then reply "switch back to restricted altp2m view and get the VCPU out
of single-step mode". Clearly the altp2m option is at least twice as
slow, so we prefer to emulate the instruction with a single vm_event
reply and move on.
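
From the introspection agent's point of view the two kinds of replies
look roughly like this (a sketch against the public vm_event interface;
instruction_is_safe is a placeholder for the engine's decision, and the
exact flag usage may differ a bit between Xen versions):

    /* req is the vm_event request just read from the ring. */
    vm_event_response_t rsp = {
        .version = VM_EVENT_INTERFACE_VERSION,
        .vcpu_id = req.vcpu_id,
        .reason  = req.reason,
    };

    if ( instruction_is_safe )
        /* Fast path: have Xen emulate it, ignoring the EPT restrictions. */
        rsp.flags = VM_EVENT_FLAG_VCPU_PAUSED | VM_EVENT_FLAG_EMULATE;
    else
    {
        /* Slow path: run it natively from the unrestricted view 0 while
         * single-stepping; a second reply later switches back to the
         * restricted view and leaves single-step mode. */
        rsp.flags = VM_EVENT_FLAG_VCPU_PAUSED | VM_EVENT_FLAG_ALTERNATE_P2M |
                    VM_EVENT_FLAG_TOGGLE_SINGLESTEP;
        rsp.altp2m_idx = 0;
    }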

A special case is when the emulator doesn't support an instruction, in
which case we can get an UNIMPLEMENTED vm_event, and switch to altp2m
just for that. Between these, everything should be covered and the
guests should run without problems.

However, we need to get altp2m up to speed, fully working and reliable
to be able to do all that.

About the emulator and events: if we could have a toggle for the
emulator to tell it "emulate the current instruction and send out a
vm_event only if it touches a protected page that's NOT part of the page
walk", that would also work - though I can't at this point tell how
feasible those modifications are.

> With a sufficiently complete insn emulator, single-stepping should
> not be needed at all imo. Granted we're not quite there yet with
> the emulator, but we've made quite a bit of progress. As before,
> if there are particular instructions you know of that the emulator
> doesn't handle yet, please keep pointing these out. Last I know
> were some AVX move instructions, which have long been
> implemented.

True, I haven't seen emulator issues in that respect with staging - the
emulator appears lately to be sufficiently complete. Thank you very much
for your help and support - we'll definitely point out unsupported
instructions if we spot some again.

The bigger practical problem is having something that works with older
Xen versions - for example current XenServer releases are still on Xen
4.7, and backporting the whole emulator machinery there is not
reasonable, with the large number of changes that have occurred in the
meantime.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 10:49                             ` Razvan Cojocaru
@ 2018-11-27 11:28                               ` Jan Beulich
  2018-11-27 11:44                                 ` Razvan Cojocaru
  2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
  1 sibling, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2018-11-27 11:28 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 27.11.18 at 11:49, <rcojocaru@bitdefender.com> wrote:
> On 11/23/18 11:07 AM, Jan Beulich wrote:
>>>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
>>> _However_, please picture an instruction that both writes into a page P1
>>> we're interested in, _and_ causes a write into a read-only page-walk
>>> related page P2. Emulating the current instruction, as the upstream
>>> patch does, does eliminate the vm_event caused by writing into P2, but
>>> with the unfortunate side-effect of losing a potentially critical event
>>> for the write into P1.
>>>
>>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>>> then put the faulting VCPU into singlestep mode, then restore the
>>> restrictions after it has finished single stepping. By now it's obvious
>>> why all the other VCPUs need to be paused: one of them might do a
>>> malicious write into P1 that silently succeeds (since the EPT is shared
>>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
>> 
>> I think this all goes into the fundamentally wrong direction. If lost
>> events during emulation are your issue, then let's make sure
>> emulation paths trigger the same events hardware would.
> 
> It's complicated: we very much like that the emulator is ignoring page
> restrictions - this allows us to proceed with instructions writing into
> protected pages without lifting said restrictions (when those
> instructions are deemed to be safe by the introspection engine). That is
> the most efficient mechanism we have, since we can just reply "emulate"
> to a vm_event and that's it.
> 
> The alternative is to use altp2m, have an unrestricted view (view 0 fits
> the bill nicely since restrictions on it propagate to all other active
> altp2ms), get an EPT fault vm_event, reply with "switch to view 0 and
> put VCPU in single-step mode", then wait for the single step event, and
> then reply "switch back to restricted altp2m view and get the VCPU out
> of single-step mode". Clearly the altp2m option is at least twice as
> slow, so we prefer to emulate the instruction with a single vm_event
> reply and move on.
> 
> A special case is when the emulator doesn't support an instruction, in
> which case we can get an UNIMPLEMENTED vm_event, and switch to altp2m
> just for that. Between these, everything should be covered and the
> guests should run without problems.
> 
> However, we need to get altp2m up to speed, fully working and reliable
> to be able to do all that.
> 
> About the emulator and events: if we could have a toggle for the
> emulator to tell it "emulate the current instruction and send out a
> vm_event only if it touches a protected page that's NOT part of the page
> walk", that would also work - though I can't at this point tell how
> feasible those modifications are.

For the emulation paths it is certainly possible to have controls for
(almost) everything, if needed. So going that route continues to
look more desirable to me than going the route you've chosen.

>> With a sufficiently complete insn emulator, single-stepping should
>> not be needed at all imo. Granted we're not quite there yet with
>> the emulator, but we've made quite a bit of progress. As before,
>> if there are particular instructions you know of that the emulator
>> doesn't handle yet, please keep pointing these out. Last I know
>> were some AVX move instructions, which have long been
>> implemented.
> 
> True, I haven't seen emulator issues in that respect with staging - the
> emulator appears lately to be sufficiently complete. Thank you very much
> for your help and support - we'll definitely point out unsupported
> instructions if we spot some again.
> 
> The bigger practical problem is having something that works with older
> Xen versions - for example current XenServer releases are still on Xen
> 4.7, and backporting the whole emulator machinery there is not
> reasonable, with the large number of changes that have occurred in the
> meantime.

Well - wouldn't this apply to extensive altp2m changes as well?

Jan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 10:31                           ` Razvan Cojocaru
@ 2018-11-27 11:32                             ` Roger Pau Monné
  2018-11-27 11:45                               ` Razvan Cojocaru
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  0 siblings, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-27 11:32 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On Tue, Nov 27, 2018 at 12:31:35PM +0200, Razvan Cojocaru wrote:
> >> _However_, please picture an instruction that both writes into a page P1
> >> we're interested in, _and_ causes a write into a read-only page-walk
> >> related page P2. Emulating the current instruction, as the upstream
> >> patch does, does eliminate the vm_event caused by writing into P2, but
> >> with the unfortunate side-effect of losing a potentially critical event
> >> for the write into P1.
> > 
> > How could the event for P1 be lost? If the instruction writes to both
> > P1 and P2, you already got some kind of event since writing to P1
> > would trigger a fault. Then you can just discard the P2 part, forward
> > the P1 access and just emulate the instruction?
> 
> Sorry for the late reply, I'm not in the office and have spotty access
> to a real computer.
> 
> The instruction will write to P1, and running it will trigger a page
> walk that writes into P2 (where both P1 and P2 are write-protected).
> 
> The Xen emulator currently _completely_ ignores EPT restrictions, which
> is both the reason why we're able to use it for introspection purposes
> (so we can run instructions that write to protected pages that we've
> deemed to be safe, without lifting said restrictions), and the problem
> in this case.
> 
> So emulating the instruction we're talking about will silently write
> both P1 and P2, even though we'd like the write to P2 (the page walk
> part) to succeed, but still have the vm_event for P1.

Would it be possible to add some kind of flag to the emulator to
signal whether p2m restrictions should be enforced/ignored?
hvmemul_acquire_page seems like a suitable place, but I'm not that
familiar with the emulator.

Then you could generate vm events from the emulator itself, which
AFAICT is the only way to handle this instruction execution issue.

> >> What this patch attempts to do is to mark P1 rwx (so allow the write),
> >> then put the faulting VCPU into singlestep mode, then restore the
> >> restrictions after it has finished single stepping. By now it's obvious
> >> why all the other VCPUs need to be paused: one of them might do a
> >> malicious write into P1 that silently succeeds (since the EPT is shared
> >> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> > 
> > Can't you just change the p2m of a single vCPU? Either using altp2m or
> > some other mechanism.
> 
> As Jan has pointed out, we'd need too many altp2ms (there's currently a
> hardcoded limit of 10 in Xen). But even more importantly, perhaps, is
> that altp2m is not usable at all at the moment (at least until the
> series I've been working on with George's kind help goes in) - because
> the guests' displays freeze when switching to a new altp2m early on
> boot, or after a screen resize.
> 
> Also, not all Intel hardware supports altp2m, and while Xen does emulate
> altp2m support for hardware that does not, it's not ideal to use that
> performance-wise.

IMO, the best way is move forward with this issue is to enhance the
emulator to be able to generate vm events.

Thanks, Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:28                               ` Jan Beulich
@ 2018-11-27 11:44                                 ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 11:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>> About the emulator and events: if we could have a toggle for the
>> emulator to tell it "emulate the current instruction and send out a
>> vm_event only if it touches a protected page that's NOT part of the page
>> walk", that would also work - though I can't at this point tell how
>> feasible those modifications are.
> 
> For the emulation paths it is certainly possible to have controls for
> (almost) everything, if needed. So going that route continues to
> look more desirable to me than going the route you've chosen.

It does sound very reasonable, we'll look into that.

>> The bigger practical problem is having something that works with older
>> Xen versions - for example current XenServer releases are still on Xen
>> 4.7, and backporting the whole emulator machinery there is not
>> reasonable, with the large number of changes that have occured in the
>> meantime.
>
> Well - wouldn't this apply to extensive altp2m changes as well?

It does; however, for one, the altp2m patches have for some reason so far
proven (for us at least) much easier to backport (and there are fewer of them).

However, you do make a good point, and, assuming this works, emulator
changes do seem able to solve the problem we're having in a way
that's acceptable to everyone and is architecturally correct.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:32                             ` Roger Pau Monné
@ 2018-11-27 11:45                               ` Razvan Cojocaru
  2018-11-27 11:59                                 ` Andrew Cooper
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  1 sibling, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 11:45 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On 11/27/18 1:32 PM, Roger Pau Monné wrote:
> Would it be possible to add some kind of flag to the emulator to
> signal whether p2m restrictions should be enforced/ignored?
> hvmemul_acquire_page seems like a suitable place, but I'm not that
> familiar with the emulator.
> 
> Then you could generate vm events from the emulator itself, which
> AFAICT is the only way to handle this instruction execution issue.

I hope so, we'll definitely look into that.

> IMO, the best way is move forward with this issue is to enhance the
> emulator to be able to generate vm events.

Right, it does look that way to me too. Hopefully we can get something
working that way.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:45                               ` Razvan Cojocaru
@ 2018-11-27 11:59                                 ` Andrew Cooper
  2018-11-27 12:12                                   ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Andrew Cooper @ 2018-11-27 11:59 UTC (permalink / raw)
  To: Razvan Cojocaru, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	Mihai Donțu, Andrei Vlad LUTAS, jun.nakajima,
	Alexandru Stefan ISAILA, xen-devel

On 27/11/2018 11:45, Razvan Cojocaru wrote:
> On 11/27/18 1:32 PM, Roger Pau Monné wrote:
>> Would it be possible to add some kind of flag to the emulator to
>> signal whether p2m restrictions should be enforced/ignored?
>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>> familiar with the emulator.
>>
>> Then you could generate vm events from the emulator itself, which
>> AFAICT is the only way to handle this instruction execution issue.
> I hope so, we'll definitely look into that.

FWIW, There is already a plan(tm).  It was discussed at least in part in
Budapest.

The emulator needs to start honouring P2M permissions and generating
vm_events.

Then, a vm_event response can reply saying "please emulate the
instruction with this temporary change to the permissions", so
write-ability to a read-only page can be granted at the discretion of
the introspection agent.

That said, there is a huge amount of work required to make this happen,
and I haven't had time to do a clear design yet.

~Andrew

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:59                                 ` Andrew Cooper
@ 2018-11-27 12:12                                   ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 12:12 UTC (permalink / raw)
  To: Andrew Cooper, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	Mihai Donțu, Andrei Vlad LUTAS, jun.nakajima,
	Alexandru Stefan ISAILA, xen-devel

On 11/27/18 1:59 PM, Andrew Cooper wrote:
> On 27/11/2018 11:45, Razvan Cojocaru wrote:
>> On 11/27/18 1:32 PM, Roger Pau Monné wrote:
>>> Would it be possible to add some kind of flag to the emulator to
>>> signal whether p2m restrictions should be enforced/ignored?
>>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>>> familiar with the emulator.
>>>
>>> Then you could generate vm events from the emulator itself, which
>>> AFAICT is the only way to handle this instruction execution issue.
>> I hope so, we'll definitely look into that.
> 
> FWIW, There is already a plan(tm).  It was discussed at least in part in
> Budapest.
> 
> The emulator needs to start honouring P2M permissions and generating
> vm_events.
> 
> Then, a vm_event response can reply saying "please emulate the
> instruction with this temporary change to the permissions", so
> write-ability to a read-only page can be granted at the discretion of
> the introspection agent.
> 
> That said, there is a huge amount of work required to make this happen,
> and I haven't had time to do a clear design yet.

Right, but for starters all we need is the ability to say
"hvm_emulate_one_vm_event(bool honour_page_walk_faults, bool
honor_gla_faults)".

Then we just replace all callsites of hvm_emulate_one_vm_event() with
hvm_emulate_one_vm_event(false, false), and the one in
p2m_mem_access_check() that we currently have with
hvm_emulate_one_vm_event(false, true).
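
To make that concrete, the interface I have in mind is roughly this
(just the proposal, not existing code):

    /*
     * Proposed: emulate the current instruction, but send a vm_event for
     * a protected page only when the corresponding toggle asks for it.
     */
    void hvm_emulate_one_vm_event(bool honour_page_walk_faults,
                                  bool honor_gla_faults);

    /* Everywhere else: keep today's behaviour, ignore all restrictions. */
    hvm_emulate_one_vm_event(false, false);

    /* In p2m_mem_access_check(): keep ignoring page-walk writes, but do
     * report a write done by the emulated instruction itself. */
    hvm_emulate_one_vm_event(false, true);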

Hopefully that makes sense. :)

Finer grained vm_event-based control is probably useful, but to the best
of my knowledge not currently (or in the near-medium future) necessary.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:32                             ` Roger Pau Monné
  2018-11-27 11:45                               ` Razvan Cojocaru
@ 2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  2018-12-19 17:40                                 ` Roger Pau Monné
  1 sibling, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-12-19 16:49 UTC (permalink / raw)
  To: Roger Pau Monné, Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel

On 27.11.2018 13:32, Roger Pau Monné wrote:
> Would it be possible to add some kind of flag to the emulator to
> signal whether p2m restrictions should be enforced/ignored?
> hvmemul_acquire_page seems like a suitable place, but I'm not that
> familiar with the emulator.
> 
> Then you could generate vm events from the emulator itself, which
> AFAICT is the only way to handle this instruction execution issue.

I've been testing which place would be best to have the emulator send
an event, and it turns out that hvmemul_acquire_page() is not OK. What
worked for me was having the emulator send write access violations from
hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
the event.

For the exec access violation I've tried to send events from
hvmemul_insn_fetch(), but there is a problem getting the same variables
for the event. Is there a way to work around this and get those params
for the event? Any thoughts are appreciated.
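
For reference, what I'm testing in hvmemul_map_linear_addr() looks
roughly like this (sketch only; it skips the monitor ring checks, gfn,
addr and curr stand for the local variables at that point in the
function, and the field/flag choice may still need adjusting):

    /* After the linear address has been translated to a gfn for the
     * write, build a mem_access request and send it synchronously. */
    vm_event_request_t req = {
        .reason = VM_EVENT_REASON_MEM_ACCESS,
        .u.mem_access.gfn = gfn_x(gfn),
        .u.mem_access.offset = addr & ~PAGE_MASK,
        .u.mem_access.gla = addr,
        .u.mem_access.flags = MEM_ACCESS_W | MEM_ACCESS_GLA_VALID,
    };

    monitor_traps(curr, true /* sync */, &req);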

Regards,
Alex
* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
@ 2018-12-19 17:40                                 ` Roger Pau Monné
  2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-12-19 17:40 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jun.nakajima, xen-devel

On Wed, Dec 19, 2018 at 04:49:43PM +0000, Alexandru Stefan ISAILA wrote:
> On 27.11.2018 13:32, Roger Pau Monné wrote:
> > Would it be possible to add some kind of flag to the emulator to
> > signal whether p2m restrictions should be enforced/ignored?
> > hvmemul_acquire_page seems like a suitable place, but I'm not that
> > familiar with the emulator.
> > 
> > Then you could generate vm events from the emulator itself, which
> > AFAICT is the only way to handle this instruction execution issue.
> 
> I've been testing which place would be best to have the emulator send
> an event, and it turns out that hvmemul_acquire_page() is not OK. What
> worked for me was having the emulator send write access violations from
> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
> the event.
> 
> For the exec access violation I've tried to send events from
> hvmemul_insn_fetch(), but there is a problem getting the same variables
> for the event. Is there a way to work around this and get those params
> for the event? Any thoughts are appreciated.

You have the IP in hvmemul_insn_fetch, can't you get the gfn from
there? Either directly or by translating the gla to a gfn if the guest
is running with paging enabled?

Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                                 ` <0D3C56BA0200004834861ACF@prv1-mh.provo.novell.com>
@ 2018-12-20  9:07                                   ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-12-20  9:07 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Roger Pau Monne

>>> On 19.12.18 at 17:49, <aisaila@bitdefender.com> wrote:
> On 27.11.2018 13:32, Roger Pau Monné wrote:
>> Would it be possible to add some kind of flag to the emulator to
>> signal whether p2m restrictions should be enforced/ignored?
>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>> familiar with the emulator.
>> 
>> Then you could generate vm events from the emulator itself, which
>> AFAICT is the only way to handle this instruction execution issue.
> 
> I've been testing which place would be best to have the emulator send
> an event, and it turns out that hvmemul_acquire_page() is not OK. What
> worked for me was having the emulator send write access violations from
> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
> the event.
> 
> For the exec access violation I've tried to send events from
> hvmemul_insn_fetch(), but there is a problem getting the same variables
> for the event. Is there a way to work around this and get those params
> for the event? Any thoughts are appreciated.

One question is whether __hvmemul_read() couldn't / shouldn't
also be switched to use hvmemul_map_linear_addr(), which would
allow your insertion to live in one central place.

Jan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-12-19 17:40                                 ` Roger Pau Monné
@ 2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
  0 siblings, 0 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-12-20 14:37 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jun.nakajima, xen-devel



On 19.12.2018 19:40, Roger Pau Monné wrote:
> On Wed, Dec 19, 2018 at 04:49:43PM +0000, Alexandru Stefan ISAILA wrote:
>> On 27.11.2018 13:32, Roger Pau Monné wrote:
>>> Would it be possible to add some kind of flag to the emulator to
>>> signal whether p2m restrictions should be enforced/ignored?
>>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>>> familiar with the emulator.
>>>
>>> Then you could generate vm events from the emulator itself, which
>>> AFAICT is the only way to handle this instruction execution issue.
>>
>> I've been testing what place would be the best to have the emulator send
>> a event and it turns out the hvmemul_acquire_page is not ok. What worked
>> form me was having the emulator send write access violations from
>> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
>> the event.
>>
>> For the exec access violation I've tried to send events from
>> hvmemul_insn_fetch() but there is a problem to get the same variables
>> for the event. Is there a way to go around and get those params for the
>> event? Any thoughts are appreciated.
> 
> You have the IP in hvmemul_insn_fetch, can't you get the gfn from
> there? Either directly or by translating the gla to a gfn if the guest
> is running with paging enabled?
> 
I've managed to solve the issue with hvmemul_linear_to_phys() for the gpa
and hvmemul_virtual_to_linear() for the gla. I will post the patch as an
RFC once I've tested that everything is OK.
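
In case it helps the discussion, the shape of it is roughly this (sketch
only - I'm writing the prototypes from memory, so please treat the exact
arguments as approximate):

    unsigned long gla, reps = 1;
    paddr_t gpa;
    int rc;

    /* Translate the fetch address (CS:IP) to a linear address, then to a
     * guest physical one, so the exec violation event can carry the same
     * fields as the write one. */
    rc = hvmemul_virtual_to_linear(x86_seg_cs, offset, bytes, &reps,
                                   hvm_access_insn_fetch, hvmemul_ctxt, &gla);
    if ( rc == X86EMUL_OKAY )
        rc = hvmemul_linear_to_phys(gla, &gpa, bytes, &reps,
                                    PFEC_page_present | PFEC_insn_fetch,
                                    hvmemul_ctxt);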

Thanks,
Alex
* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 13:58                                 ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-13 13:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>> With a sufficiently complete insn emulator, single-stepping should
>> not be needed at all imo. Granted we're not quite there yet with
>> the emulator, but we've made quite a bit of progress. As before,
>> if there are particular instructions you know of that the emulator
>> doesn't handle yet, please keep pointing these out. Last I know
>> were some AVX move instructions, which have long been
>> implemented.
> True, I haven't seen emulator issues in that respect with staging - the
> emulator appears lately to be sufficiently complete. Thank you very much
> for your help and support - we'll definitely point out unsupported
> instructions if we spot some again.

We've come across a new instruction that the emulator can't handle in 
Xen 4.13-unstable today:

vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20

Perhaps there are plans for this to go into the emulator as well?


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 14:06                                   ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2019-05-13 14:06 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>> With a sufficiently complete insn emulator, single-stepping should
>>> not be needed at all imo. Granted we're not quite there yet with
>>> the emulator, but we've made quite a bit of progress. As before,
>>> if there are particular instructions you know of that the emulator
>>> doesn't handle yet, please keep pointing these out. Last I know
>>> were some AVX move instructions, which have long been
>>> implemented.
>> True, I haven't seen emulator issues in that respect with staging - the
>> emulator appears lately to be sufficiently complete. Thank you very much
>> for your help and support - we'll definitely point out unsupported
>> instructions if we spot some again.
> 
> We've come across a new instruction that the emulator can't handle in
> Xen 4.13-unstable today:
> 
> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
> 
> Perhaps there are plans for this to go into the emulator as well?

You're kidding? This is already in 4.12.0, and if it weren't I'm sure
you're aware there are about 40 more AVX512 patches pending
review.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 14:15                                     ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-13 14:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 5/13/19 5:06 PM, Jan Beulich wrote:
>>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
>> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>>> With a sufficiently complete insn emulator, single-stepping should
>>>> not be needed at all imo. Granted we're not quite there yet with
>>>> the emulator, but we've made quite a bit of progress. As before,
>>>> if there are particular instructions you know of that the emulator
>>>> doesn't handle yet, please keep pointing these out. Last I know
>>>> were some AVX move instructions, which have long been
>>>> implemented.
>>> True, I haven't seen emulator issues in that respect with staging - the
>>> emulator appears lately to be sufficiently complete. Thank you very much
>>> for your help and support - we'll definitely point out unsupported
>>> instructions if we spot some again.
>>
>> We've come across a new instruction that the emulator can't handle in
>> Xen 4.13-unstable today:
>>
>> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
>>
>> Perhaps there are plans for this to go into the emulator as well?
> 
> You're kidding? This is already in 4.12.0, and if it weren't I'm sure
> you're aware there are about 40 more AVX512 patches pending
> review.

Right, I did indeed forget about the pending review part; for some
reason I was sure they had made it in. I've double-checked and we really
are using 4.13-unstable, but we've also made changes to the emulator
while working on the send-vm-events-from-the-emulator patch, so we'll
revert to a pristine staging tree and retry; there's a chance this
happens because of our changes.

We'll find out what's going on exactly.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 13:47                                       ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-14 13:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 5/13/19 5:15 PM, Razvan Cojocaru wrote:
> On 5/13/19 5:06 PM, Jan Beulich wrote:
>>>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
>>> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>>>> With a sufficiently complete insn emulator, single-stepping should
>>>>> not be needed at all imo. Granted we're not quite there yet with
>>>>> the emulator, but we've made quite a bit of progress. As before,
>>>>> if there are particular instructions you know of that the emulator
>>>>> doesn't handle yet, please keep pointing these out. Last I know
>>>>> were some AVX move instructions, which have long been
>>>>> implemented.
>>>> True, I haven't seen emulator issues in that respect with staging - the
>>>> emulator appears lately to be sufficiently complete. Thank you very 
>>>> much
>>>> for your help and support - we'll definitely point out unsupported
>>>> instructions if we spot some again.
>>>
>>> We've come across a new instruction that the emulator can't handle in
>>> Xen 4.13-unstable today:
>>>
>>> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
>>>
>>> Perhaps there are plans for this to go into the emulator as well?
>>
>> You're kidding? This is already in 4.12.0, and if it weren't I'm sure
>> you're aware there are about 40 more AVX512 patches pending
>> review.
> 
> Right, I did indeed forget about the pending review part, for some 
> reason I was sure they made it in. I've double-checked and we really are 
> using 4.13-unstable - but we've also made changes to the emulator, 
> working on the send-vm-events-from-the-emulator patch, so we'll revert 
> to a pristine staging and retry, there's a chance this happens because 
> of our changes.
> 
> We'll find out what's going on exactly.

I promised I'd return with more details. After some debugging, it 
certainly looks like the emulator returns UNIMPLEMENTED (5):

Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5 
05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d

Looking at the source code, the emulator does appear to support
vpmaddwd, but only for the EVEX encoding:

http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging#l6696

whereas our failing case uses the VEX encoding.

This may be in the works in the aforementioned series, but is 
legitimately unsupported in 4.13 staging.
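
For illustration, a small standalone sketch (not Xen code) that decodes
the leading bytes from the log above and shows why this is the VEX and
not the EVEX form:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: 0xc5 is the 2-byte VEX prefix (0xc4 = 3-byte VEX,
 * 0x62 = EVEX), and opcode 0xf5 in map 0F is (V)PMADDWD.
 */
int main(void)
{
    const uint8_t insn[] = { 0xc5, 0xf9, 0xf5, 0x05 }; /* from the log */

    if ( insn[0] == 0xc5 )
    {
        uint8_t b = insn[1];
        unsigned int vvvv = (~b >> 3) & 0xf; /* second source reg (bit-inverted) */
        unsigned int L = (b >> 2) & 1;       /* 0 = 128-bit, 1 = 256-bit */
        unsigned int pp = b & 3;             /* 01 = implied 0x66 prefix */

        printf("VEX.%s.%s 0F opcode %#x, vvvv=xmm%u\n",
               L ? "256" : "128", pp == 1 ? "66" : "?", insn[2], vvvv);
    }
    return 0;
}

So the trapped bytes really are the VEX form of the 0F F5 encoding.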


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 14:16                                         ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2019-05-14 14:16 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

>>> On 14.05.19 at 15:47, <rcojocaru@bitdefender.com> wrote:
> Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5 
> 05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d
> 
> Looking at the source code, the emulator does appear to support 
> vpmaddwd, however only for EVEX:
> 
> http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x 
> 86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging
> #l6696
> 
> whereas our fail case uses VEX.
> 
> This may be in the works in the aforementioned series, but is 
> legitimately unsupported in 4.13 staging.

Hmm, interesting. The encoding dates back to MMX times, which means
it's more than just VPMADDWD that's missing; it was an omission in the
original MMX/SSE2 emulation series. That's a genuine oversight, and in
light of this I'd like to apologize for my unfriendly initial reaction.
I'll see about getting this fixed. (It would have helped if you had
shared the encoding right away, since the mnemonic and operands alone
are often insufficient.)

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 14:20                                           ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-14 14:20 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel



On 5/14/19 5:16 PM, Jan Beulich wrote:
>>>> On 14.05.19 at 15:47, <rcojocaru@bitdefender.com> wrote:
>> Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5
>> 05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d
>>
>> Looking at the source code, the emulator does appear to support
>> vpmaddwd, however only for EVEX:
>>
>> http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x
>> 86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging
>> #l6696
>>
>> whereas our fail case uses VEX.
>>
>> This may be in the works in the aforementioned series, but is
>> legitimately unsupported in 4.13 staging.
> 
> Hmm, interesting. The origin of the encoding is at MMX times,
> which means it's more than just VPMADDWD that's missing, and
> it's been an omission back in the MMX/SSE2 series then. That's
> a genuine oversight, and in the light of this I'd like to apologize
> for my unfriendly initial reaction. I'll see about getting this fixed.
> (It would have helped if you had shared the encoding right away,
> since the mnemonic and operands are now often insufficient.)

No problem at all. Indeed, sharing the encoding would have cleared 
things up faster.


Thanks,
Razvan

end of thread, other threads:[~2019-05-14 14:21 UTC | newest]

Thread overview: 52+ messages
2018-11-16 10:06 [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults Alexandru Stefan ISAILA
2018-11-16 17:04 ` Roger Pau Monné
2018-11-19 13:30   ` Alexandru Stefan ISAILA
2018-11-19 14:26     ` Jan Beulich
2018-11-19 15:08     ` Roger Pau Monné
2018-11-19 15:56       ` Alexandru Stefan ISAILA
2018-11-21  9:56         ` Roger Pau Monné
2018-11-21 10:28           ` Alexandru Stefan ISAILA
2018-11-21 11:41             ` Roger Pau Monné
2018-11-21 12:00               ` Alexandru Stefan ISAILA
2018-11-19 13:33   ` Jan Beulich
2018-11-21 18:55   ` Razvan Cojocaru
2018-11-22  9:50     ` Alexandru Stefan ISAILA
2018-11-22 10:00       ` Jan Beulich
2018-11-22 10:07       ` Roger Pau Monné
2018-11-22 10:05     ` Roger Pau Monné
2018-11-22 10:14       ` Razvan Cojocaru
2018-11-22 10:58         ` Roger Pau Monné
2018-11-22 12:48           ` Razvan Cojocaru
2018-11-22 14:49             ` Roger Pau Monné
2018-11-22 15:25               ` Razvan Cojocaru
2018-11-22 15:37                 ` Roger Pau Monné
2018-11-22 16:52                   ` Razvan Cojocaru
2018-11-22 17:08                     ` Roger Pau Monné
2018-11-22 18:24                       ` Razvan Cojocaru
2018-11-23  8:54                         ` Roger Pau Monné
     [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
     [not found]                             ` <F553A58C020000AB0063616D@prv1-mh.provo.novell.com>
     [not found]                               ` <4D445A680200003E34861ACF@prv1-mh.provo.novell.com>
     [not found]                                 ` <DAD49D5A020000780063616D@prv1-mh.provo.novell.com>
     [not found]                                   ` <5400A6CB0200003634861ACF@prv1-mh.provo.novell.com>
     [not found]                                     ` <203C1A92020000400063616D@prv1-mh.provo.novell.com>
     [not found]                                       ` <0DF3BC62020000E934861ACF@prv1-mh.provo.novell.com>
     [not found]                                         ` <C6A2E442020000640063616D@prv1-mh.provo.novell.com>
     [not found]                                           ` <6EEA58AB020000EA34861ACF@prv1-mh.provo.novell.com>
2018-11-27 10:31                           ` Razvan Cojocaru
2018-11-27 11:32                             ` Roger Pau Monné
2018-11-27 11:45                               ` Razvan Cojocaru
2018-11-27 11:59                                 ` Andrew Cooper
2018-11-27 12:12                                   ` Razvan Cojocaru
2018-12-19 16:49                               ` Alexandru Stefan ISAILA
2018-12-19 17:40                                 ` Roger Pau Monné
2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
     [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
2018-11-23  9:07                           ` Jan Beulich
2018-11-27 10:49                             ` Razvan Cojocaru
2018-11-27 11:28                               ` Jan Beulich
2018-11-27 11:44                                 ` Razvan Cojocaru
2019-05-13 13:58                               ` Razvan Cojocaru
2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
2019-05-13 14:06                                 ` Jan Beulich
2019-05-13 14:06                                   ` [Xen-devel] " Jan Beulich
2019-05-13 14:15                                   ` Razvan Cojocaru
2019-05-13 14:15                                     ` [Xen-devel] " Razvan Cojocaru
2019-05-14 13:47                                     ` Razvan Cojocaru
2019-05-14 13:47                                       ` [Xen-devel] " Razvan Cojocaru
2019-05-14 14:16                                       ` Jan Beulich
2019-05-14 14:16                                         ` [Xen-devel] " Jan Beulich
2019-05-14 14:20                                         ` Razvan Cojocaru
2019-05-14 14:20                                           ` [Xen-devel] " Razvan Cojocaru
     [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
2018-11-23  9:10                             ` Jan Beulich
     [not found]                             ` <9B05ED9E020000C434861ACF@prv1-mh.provo.novell.com>
     [not found]                               ` <626A217B020000C50063616D@prv1-mh.provo.novell.com>
     [not found]                                 ` <0D3C56BA0200004834861ACF@prv1-mh.provo.novell.com>
2018-12-20  9:07                                   ` Jan Beulich
