* [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2018-11-16 10:06 Alexandru Stefan ISAILA
  2018-11-16 17:04 ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-16 10:06 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, Alexandru Stefan ISAILA,
	Anshul Makkar, roger.pau

A new mechanism has been added which is able to generically re-execute
instructions, by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and what instructions
must be re-executed this way; instructions that lie in non-executable
pages and instructions that cause the setting of Accessed and/or Dirty
flags inside page tables are two examples.

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
Signed-off-by: Andrei Lutas <vlutas@bitdefender.com>
Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
---
 xen/arch/x86/domain.c         |   3 +
 xen/arch/x86/hvm/vmx/vmx.c    | 255 ++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/mem_access.c  |  20 ++-
 xen/include/asm-x86/domain.h  |  18 +++
 xen/include/asm-x86/hvm/hvm.h |   2 +
 5 files changed, 295 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 295b10c48c..b0680a76f1 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -343,6 +343,7 @@ int arch_vcpu_create(struct vcpu *v)
     int rc;
 
     v->arch.flags = TF_kernel_mode;
+    v->arch.in_host = 1;
 
     rc = mapcache_vcpu_init(v);
     if ( rc )
@@ -482,6 +483,8 @@ int arch_domain_create(struct domain *d,
     spin_lock_init(&d->arch.e820_lock);
     spin_lock_init(&d->arch.vtsc_lock);
 
+    spin_lock_init(&d->arch.rexec_lock);
+
     /* Minimal initialisation for the idle domain. */
     if ( unlikely(is_idle_domain(d)) )
     {
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 365eeb2886..84f8648fc0 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2289,6 +2289,255 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
     return true;
 }
 
+static int vmx_start_reexecute_instruction(struct vcpu *v, unsigned long gpa,
+                                           xenmem_access_t required_access)
+{
+    /*
+     * NOTE: Some required_accesses may be invalid. For example, one
+     * cannot grant only write access on a given page; read/write
+     * access must be granted instead. These inconsistencies are NOT
+     * checked here. The caller must ensure that "required_access" is
+     * an allowed combination.
+     */
+
+    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;
+    xenmem_access_t old_access, new_access;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    level = v->arch.rexec_level;
+
+    /*
+     * Step 1: Make sure someone else didn't get to start an
+     * instruction re-execution.
+     */
+    for_each_vcpu ( v->domain, a )
+    {
+        /* We're interested in pausing all the VCPUs except self/v. */
+        if ( a == v )
+            continue;
+
+        /*
+         * Check if "a" started an instruction re-execution. If so,
+         * return success, as we'll re-execute our instruction later.
+         */
+        if ( a->arch.rexec_level != 0 )
+        {
+            /* We should be paused. */
+            ret = 0;
+            leave = 1;
+            goto release_and_exit;
+        }
+    }
+
+    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+    if ( level >= REEXECUTION_MAX_DEPTH )
+    {
+        ret = -1;
+        leave = 1;
+        goto release_and_exit;
+    }
+
+    /*
+     * Step 2: Pause all the VCPUs, except self. Note that we have to do
+     * this only if we're at nesting level 0; if we're at a higher level
+     * of nested re-exec, the vcpus are already paused.
+     */
+    if ( level == 0 )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, NO SYNC! We're gonna do our own syncing. */
+            vcpu_pause_nosync(a);
+        }
+
+        /*
+         * Step 3: Wait for all the paused VCPUs to actually leave the VMX
+         * non-root realm and enter VMX root.
+         */
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, synced. */
+            while ( !a->arch.in_host )
+                cpu_relax();
+        }
+    }
+
+    /* Update the re-execution nesting level. */
+    v->arch.rexec_level++;
+
+release_and_exit:
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    /* If we've got errors so far, return. */
+    if ( leave )
+        return ret;
+
+    /*
+     * Step 4: Save the current gpa & old access rights. Also, check if this
+     * is a "double-fault" on the exact same GPA, in which case, we will
+     * promote the rights of this particular GPA, and try again.
+     */
+    for ( i = 0; i < level; i++ )
+    {
+        if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) ==
+             (gpa >> PAGE_SHIFT) )
+        {
+            /* This GPA is already in the queue. */
+            found = 1;
+
+            switch (v->arch.rexec_context[i].cur_access) {
+                case XENMEM_access_r: r = 1; break;
+                case XENMEM_access_w: w = 1; break;
+                case XENMEM_access_x: x = 1; break;
+                case XENMEM_access_rx: r = x = 1; break;
+                case XENMEM_access_wx: w = x = 1;  break;
+                case XENMEM_access_rw: r = w = 1; break;
+                case XENMEM_access_rwx: r = w = x = 1; break;
+                default: break; /* We don't care about any other case. */
+            }
+        }
+    }
+
+    /*
+     * Get the current EPT access rights. They will be restored when we're done.
+     * Note that the restoration is done in reverse-order, in order to ensure
+     * that the original access rights are restored correctly. Otherwise, we may
+     * restore whatever access rights were modified by another re-execution
+     * request, and that would be bad.
+     */
+    if ( p2m_get_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            &old_access, altp2m_idx) != 0 )
+        return -1;
+
+    v->arch.rexec_context[level].gpa = gpa;
+    v->arch.rexec_context[level].old_access = old_access;
+    v->arch.rexec_context[level].old_single_step = v->arch.hvm.single_step;
+
+    /*
+     * Step 5: Mark the GPA with the required access, so we can re-execute
+     * the instruction.
+     */
+    switch ( required_access )
+    {
+        case XENMEM_access_r: r = 1; break;
+        case XENMEM_access_w: w = 1; break;
+        case XENMEM_access_x: x = 1; break;
+        case XENMEM_access_rx: r = x = 1; break;
+        case XENMEM_access_wx: w = x = 1;  break;
+        case XENMEM_access_rw: r = w = 1; break;
+        case XENMEM_access_rwx: r = w = x = 1; break;
+        default: break; /* We don't care about any other case. */
+    }
+
+    /* Now transform our RWX values in a XENMEM_access_* constant. */
+    if ( r == 0 && w == 0 && x == 0 )
+        new_access = XENMEM_access_n;
+    else if ( r == 0 && w == 0 && x == 1 )
+        new_access = XENMEM_access_x;
+    else if ( r == 0 && w == 1 && x == 0 )
+        new_access = XENMEM_access_w;
+    else if ( r == 0 && w == 1 && x == 1 )
+        new_access = XENMEM_access_wx;
+    else if ( r == 1 && w == 0 && x == 0 )
+        new_access = XENMEM_access_r;
+    else if ( r == 1 && w == 0 && x == 1 )
+        new_access = XENMEM_access_rx;
+    else if ( r == 1 && w == 1 && x == 0 )
+        new_access = XENMEM_access_rw;
+    else if ( r == 1 && w == 1 && x == 1 )
+        new_access = XENMEM_access_rwx;
+    else
+        new_access = required_access; /* Should never get here. */
+
+    /* And save the current access rights. */
+    v->arch.rexec_context[level].cur_access = new_access;
+
+    /* Apply the changes inside the EPT. */
+    if ( p2m_set_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            1, 0, MEMOP_CMD_MASK, new_access,
+                            altp2m_idx) != 0 )
+        return -1;
+
+    /*
+     * Step 6: Reconfigure the VMCS, so it suits our needs. We want a
+     * VM-exit to be generated after the instruction has been
+     * successfully re-executed.
+     */
+    if ( level == 0 )
+        v->arch.hvm.single_step = 1;
+
+    /* Step 8: We should be done! */
+
+    return ret;
+}
+
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    if ( v->arch.rexec_level == 0 )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA. */
+    for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+    {
+        if ( v->arch.rexec_context[i].gpa != mfn_x(INVALID_MFN) &&
+             p2m_set_mem_access(v->domain,
+                                _gfn(v->arch.rexec_context[i].gpa >> PAGE_SHIFT),
+                                1, 0, MEMOP_CMD_MASK,
+                                v->arch.rexec_context[i].old_access,
+                                altp2m_idx) != 0 )
+        {
+            ret = -1;
+            return ret;
+        }
+
+        v->arch.rexec_context[i].gpa = 0;
+        v->arch.hvm.single_step = v->arch.rexec_context[i].old_single_step;
+    }
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->arch.rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /*
+     * Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit.
+     */
+
+    /* Step 5: We're done! */
+
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    return ret;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -2324,6 +2573,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .invlpg               = vmx_invlpg,
     .cpu_up               = vmx_cpu_up,
     .cpu_down             = vmx_cpu_down,
+    .start_reexecute_instruction = vmx_start_reexecute_instruction,
     .wbinvd_intercept     = vmx_wbinvd_intercept,
     .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
     .msr_read_intercept   = vmx_msr_read_intercept,
@@ -3590,6 +3840,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     unsigned int vector = 0, mode;
     struct vcpu *v = current;
 
+    v->arch.in_host = 1;
+
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
     __vmread(GUEST_RFLAGS, &regs->rflags);
@@ -4112,6 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm.vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
+        vmx_stop_reexecute_instruction(v);
         if ( v->arch.hvm.single_step )
         {
             hvm_monitor_debug(regs->rip,
@@ -4330,6 +4583,8 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(curr->arch.hvm.vmx.lbr_flags & LBR_FIXUP_MASK) )
         lbr_fixup();
 
+    curr->arch.in_host = 0;
+
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
     __vmwrite(GUEST_RIP,    regs->rip);
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index 2f1295e56a..5ae3a61b5c 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -212,10 +212,11 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
     }
     if ( vm_event_check_ring(d->vm_event_monitor) &&
          d->arch.monitor.inguest_pagefault_disabled &&
-         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+         npfec.kind != npfec_kind_with_gla &&
+         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
     {
-        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
-
+        v->arch.vm_event->emulate_flags = 0;
+        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
         return true;
     }
 
@@ -226,6 +227,7 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         *req_ptr = req;
 
         req->reason = VM_EVENT_REASON_MEM_ACCESS;
+
         req->u.mem_access.gfn = gfn_x(gfn);
         req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
 
@@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
     p2m_access_t a;
     unsigned long gfn_l;
     long rc = 0;
+    struct vcpu *v;
+    int i;
 
     /* altp2m view 0 is treated as the hostp2m */
 #ifdef CONFIG_HVM
@@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
         if ( rc )
             break;
 
+        for_each_vcpu(d, v)
+        {
+            if ( !v->arch.rexec_level )
+                continue;
+
+            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
+                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);
+        }
+
         /* Check for continuation if it's not the last iteration. */
         if ( nr > ++start && !(start & mask) && hypercall_preempt_check() )
         {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 277f99f633..dbb68f108a 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -438,6 +438,8 @@ struct arch_domain
 
     /* Emulated devices enabled bitmap. */
     uint32_t emulation_flags;
+
+    spinlock_t rexec_lock;
 } __cacheline_aligned;
 
 #ifdef CONFIG_HVM
@@ -629,6 +631,22 @@ struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
+#define REEXECUTION_MAX_DEPTH 8
+    struct rexec_context_t {
+        unsigned long gpa;
+        xenmem_access_t old_access;
+        xenmem_access_t cur_access;
+        bool_t old_single_step;
+    } rexec_context[REEXECUTION_MAX_DEPTH];
+
+    int rexec_level;
+
+    /*
+     *  Will be true when the vcpu is in VMX root,
+     * false when it is not.
+     */
+    bool_t in_host;
+
     struct arch_vm_event *vm_event;
 
     struct vcpu_msrs *msrs;
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 3d3250dff0..1f5d43a98d 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -167,6 +167,8 @@ struct hvm_function_table {
 
     int  (*cpu_up)(void);
     void (*cpu_down)(void);
+    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
+                                        xenmem_access_t required_access);
 
     /* Copy up to 15 bytes from cached instruction bytes at current rIP. */
     unsigned int (*get_insn_bytes)(struct vcpu *v, uint8_t *buf);
-- 
2.17.1


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 10:06 [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults Alexandru Stefan ISAILA
@ 2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
                     ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-16 17:04 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

On Fri, Nov 16, 2018 at 10:06:36AM +0000, Alexandru Stefan ISAILA wrote:
> A new mechanism has been added which is able to generically re-execute
> instructions, by temporarily granting permissions inside the EPT and
> re-executing the instruction with all other vcpus paused and with the
> monitor trap flag set. The mechanism is re-entrant, meaning that is
> capable of handling different violations caused by the same instruction.
> Usually, a security appliance will decide when and what instructions
> must be re-executed this way instructions that lie in non-executable
> pages and instructions that cause the setting of Accessed and/or Dirty
> flags inside page tables are two examples.
> 
> Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>
> Signed-off-by: Andrei Lutas <vlutas@bitdefender.com>
> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> Signed-off-by: Anshul Makkar <anshul.makkar@citrix.com>
> ---
>  xen/arch/x86/domain.c         |   3 +
>  xen/arch/x86/hvm/vmx/vmx.c    | 255 ++++++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/mem_access.c  |  20 ++-
>  xen/include/asm-x86/domain.h  |  18 +++
>  xen/include/asm-x86/hvm/hvm.h |   2 +
>  5 files changed, 295 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 295b10c48c..b0680a76f1 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -343,6 +343,7 @@ int arch_vcpu_create(struct vcpu *v)
>      int rc;
>  
>      v->arch.flags = TF_kernel_mode;
> +    v->arch.in_host = 1;

This should be a bool (as proposed below), so please use true/false
then.

>  
>      rc = mapcache_vcpu_init(v);
>      if ( rc )
> @@ -482,6 +483,8 @@ int arch_domain_create(struct domain *d,
>      spin_lock_init(&d->arch.e820_lock);
>      spin_lock_init(&d->arch.vtsc_lock);
>  

AFAICT, there's no need to add a newline here.

> +    spin_lock_init(&d->arch.rexec_lock);
> +
>      /* Minimal initialisation for the idle domain. */
>      if ( unlikely(is_idle_domain(d)) )
>      {
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 365eeb2886..84f8648fc0 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2289,6 +2289,255 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
>      return true;
>  }
>  
> +static int vmx_start_reexecute_instruction(struct vcpu *v, unsigned long gpa,
> +                                           xenmem_access_t required_access)
> +{
> +    /*
> +     * NOTE: Some required_accesses may be invalid. For example, one
> +     * cannot grant only write access on a given page; read/write
> +     * access must be granted instead. These inconsistencies are NOT
> +     * checked here. The caller must ensure that "required_access" is
> +     * an allowed combination.
> +     */
> +
> +    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;

There are a bunch of variables that need to be of different type here.

i likely wants to be unsigned, same with level.

found, r, w, x and leave should be bools.
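
I.e. something like:

    unsigned int i, level = 0;
    bool found = false, r = false, w = false, x = false, leave = false;
    int ret = 0;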

> +    xenmem_access_t old_access, new_access;
> +    struct vcpu *a;
> +    unsigned int altp2m_idx =
> +        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
> +
> +    spin_lock(&v->domain->arch.rexec_lock);
> +
> +    level = v->arch.rexec_level;
> +
> +    /*
> +     * Step 1: Make sure someone else didn't get to start an
> +     * instruction re-execution.
> +     */
> +    for_each_vcpu ( v->domain, a )
> +    {
> +        /* We're interested in pausing all the VCPUs except self/v. */

But there's no pause done here AFAICT?

> +        if ( a == v )
> +            continue;
> +
> +        /*
> +         * Check if "a" started an instruction re-execution. If so,
> +         * return success, as we'll re-execute our instruction later.
> +         */
> +        if ( a->arch.rexec_level != 0 )
> +        {
> +            /* We should be paused. */
> +            ret = 0;
> +            leave = 1;
> +            goto release_and_exit;
> +        }
> +    }
> +
> +    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
> +    if ( level >= REEXECUTION_MAX_DEPTH )
> +    {
> +        ret = -1;

Please return a proper errno value here.

> +        leave = 1;
> +        goto release_and_exit;
> +    }
> +
> +    /*
> +     * Step 2: Pause all the VCPUs, except self. Note that we have to do
> +     * this only if we're at nesting level 0; if we're at a higher level
> +     * of nested re-exec, the vcpus are already paused.
> +     */
> +    if ( level == 0 )
> +    {
> +        for_each_vcpu ( v->domain, a )
> +        {
> +            /* We're interested in pausing all the VCPUs except self/v. */
> +            if ( a == v )
> +                continue;
> +
> +            /* Pause, NO SYNC! We're gonna do our own syncing. */
> +            vcpu_pause_nosync(a);
> +        }
> +
> +        /*
> +         * Step 3: Wait for all the paused VCPUs to actually leave the VMX
> +         * non-root realm and enter VMX root.
> +         */
> +        for_each_vcpu ( v->domain, a )
> +        {
> +            /* We're interested in pausing all the VCPUs except self/v. */

It's the 3rd time this comment has been repeated.

> +            if ( a == v )
> +                continue;
> +
> +            /* Pause, synced. */
> +            while ( !a->arch.in_host )

Why not use a->is_running as a way to know whether the vCPU is
running?

I think the logic of using vcpu_pause and expecting the running vcpu
to take a vmexit and thus set in_host is wrong because a vcpu that
wasn't running when vcpu_pause_nosync is called won't get scheduled
anymore, thus not taking a vmexit, and this function will lock up.

I don't think you need the in_host boolean at all.
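
I.e. (untested, just to illustrate the idea):

            /* Wait for the remote vCPU to actually be descheduled. */
            while ( a->is_running )
                cpu_relax();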

> +                cpu_relax();

Is this really better than using vcpu_pause?

I assume this is done to avoid waiting on each vcpu, and instead doing
it here likely means less wait time?

> +        }
> +    }
> +
> +    /* Update the rexecution nexting level. */
> +    v->arch.rexec_level++;
> +
> +release_and_exit:
> +    spin_unlock(&v->domain->arch.rexec_lock);
> +
> +    /* If we've got errors so far, return. */
> +    if ( leave )
> +        return ret;
> +
> +    /*
> +     * Step 4: Save the current gpa & old access rights. Also, check if this
> +     * is a "double-fault" on the exact same GPA, in which case, we will
> +     * promote the rights of this particular GPA, and try again.
> +     */
> +    for ( i = 0; i < level; i++ )
> +    {
> +        if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) ==
> +             (gpa >> PAGE_SHIFT) )
> +        {
> +            /* This GPA is already in the queue. */
> +            found = 1;
> +
> +            switch (v->arch.rexec_context[i].cur_access) {
> +                case XENMEM_access_r: r = 1; break;
> +                case XENMEM_access_w: w = 1; break;
> +                case XENMEM_access_x: x = 1; break;
> +                case XENMEM_access_rx: r = x = 1; break;
> +                case XENMEM_access_wx: w = x = 1;  break;
> +                case XENMEM_access_rw: r = w = 1; break;
> +                case XENMEM_access_rwx: r = w = x = 1; break;
> +                default: break; /* We don't care about any other case. */

The above chunk needs proper formatting, and I would argue that you
need to add an assert to the default case at least?
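
Something along these lines, for instance (assuming the default case
really is unreachable):

            switch ( v->arch.rexec_context[i].cur_access )
            {
            case XENMEM_access_r:   r = 1; break;
            case XENMEM_access_w:   w = 1; break;
            case XENMEM_access_x:   x = 1; break;
            case XENMEM_access_rx:  r = x = 1; break;
            case XENMEM_access_wx:  w = x = 1; break;
            case XENMEM_access_rw:  r = w = 1; break;
            case XENMEM_access_rwx: r = w = x = 1; break;

            default:
                ASSERT_UNREACHABLE();
                break;
            }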

> +            }
> +        }
> +    }
> +
> +    /*
> +     * Get the current EPT access rights. They will be restored when we're done.
> +     * Note that the restoration is done in reverse-order, in order to ensure
> +     * that the original access rights are restore correctly. Otherwise, we may
> +     * restore whatever access rights were modified by another re-execution
> +     * request, and that would be bad.
> +     */
> +    if ( p2m_get_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
> +                            &old_access, altp2m_idx) != 0 )
> +        return -1;
> +
> +    v->arch.rexec_context[level].gpa = gpa;
> +    v->arch.rexec_context[level].old_access = old_access;
> +    v->arch.rexec_context[level].old_single_step = v->arch.hvm.single_step;
> +
> +    /*
> +     * Step 5: Make the GPA with the required access, so we can re-execute
> +     * the instruction.
> +     */
> +    switch ( required_access )
> +    {
> +        case XENMEM_access_r: r = 1; break;
> +        case XENMEM_access_w: w = 1; break;
> +        case XENMEM_access_x: x = 1; break;
> +        case XENMEM_access_rx: r = x = 1; break;
> +        case XENMEM_access_wx: w = x = 1;  break;
> +        case XENMEM_access_rw: r = w = 1; break;
> +        case XENMEM_access_rwx: r = w = x = 1; break;
> +        default: break; /* We don't care about any other case. */
> +    }
> +
> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> +    if ( r == 0 && w == 0 && x == 0 )
> +        new_access = XENMEM_access_n;
> +    else if ( r == 0 && w == 0 && x == 1 )
> +        new_access = XENMEM_access_x;
> +    else if ( r == 0 && w == 1 && x == 0 )
> +        new_access = XENMEM_access_w;
> +    else if ( r == 0 && w == 1 && x == 1 )
> +        new_access = XENMEM_access_wx;
> +    else if ( r == 1 && w == 0 && x == 0 )
> +        new_access = XENMEM_access_r;
> +    else if ( r == 1 && w == 0 && x == 1 )
> +        new_access = XENMEM_access_rx;
> +    else if ( r == 1 && w == 1 && x == 0 )
> +        new_access = XENMEM_access_rw;
> +    else if ( r == 1 && w == 1 && x == 1 )
> +        new_access = XENMEM_access_rwx;
> +    else
> +        new_access = required_access; /* Should never get here. */

There seems to be a lot of translation from xenmem_access_t to bool
fields and then to xenmem_access_t again. Can't you just avoid the
booleans?

> +
> +    /* And save the current access rights. */
> +    v->arch.rexec_context[level].cur_access = new_access;
> +
> +    /* Apply the changes inside the EPT. */
> +    if ( p2m_set_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
> +                            1, 0, MEMOP_CMD_MASK, new_access,
> +                            altp2m_idx) != 0 )
> +        return -1;

Again you should return proper errno values.

> +
> +    /*
> +     * Step 6: Reconfigure the VMCS, so it suits our needs. We want a
> +     * VM-exit to be generated after the instruction has been
> +     * successfully re-executed.
> +     */
> +    if ( level == 0 )
> +        v->arch.hvm.single_step = 1;
> +
> +    /* Step 8: We should be done! */
> +
> +    return ret;
> +}
> +
> +static int vmx_stop_reexecute_instruction(struct vcpu *v)
> +{
> +    int ret = 0, i;
> +    struct vcpu *a;
> +    unsigned int altp2m_idx =
> +        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
> +
> +    if ( v->arch.rexec_level == 0 )
> +        return 0;
> +
> +    /* Step 1: Restore original EPT access rights for each GPA. */
> +    for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> +    {
> +        if ( v->arch.rexec_context[i].gpa != mfn_x(INVALID_MFN) &&
> +             p2m_set_mem_access(v->domain,
> +                                _gfn(v->arch.rexec_context[i].gpa >> PAGE_SHIFT),
> +                                1, 0, MEMOP_CMD_MASK,
> +                                v->arch.rexec_context[i].old_access,
> +                                altp2m_idx) != 0 )
> +        {
> +            ret = -1;
> +            return ret;
> +        }
> +
> +        v->arch.rexec_context[i].gpa = 0;
> +        v->arch.hvm.single_step = v->arch.rexec_context[i].old_single_step;
> +    }
> +
> +    spin_lock(&v->domain->arch.rexec_lock);
> +
> +    /* Step 2: Reset the nesting level to zero. */
> +    v->arch.rexec_level = 0;
> +
> +    /* Step 3: Resume all other VCPUs. */
> +    for_each_vcpu ( v->domain, a )
> +    {
> +        if ( a == v )
> +            continue;
> +
> +        /* Unpause the VCPU. */
> +        vcpu_unpause(a);
> +    }
> +
> +    /*
> +     * Step 4: Remove the MONITOR trap flag.
> +     * - this is already done when handling the exit.
> +     */
> +
> +    /* Step 5: We're done! */
> +
> +    spin_unlock(&v->domain->arch.rexec_lock);
> +
> +    return ret;
> +}
> +
>  static struct hvm_function_table __initdata vmx_function_table = {
>      .name                 = "VMX",
>      .cpu_up_prepare       = vmx_cpu_up_prepare,
> @@ -2324,6 +2573,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
>      .invlpg               = vmx_invlpg,
>      .cpu_up               = vmx_cpu_up,
>      .cpu_down             = vmx_cpu_down,
> +    .start_reexecute_instruction = vmx_start_reexecute_instruction,
>      .wbinvd_intercept     = vmx_wbinvd_intercept,
>      .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
>      .msr_read_intercept   = vmx_msr_read_intercept,
> @@ -3590,6 +3840,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      unsigned int vector = 0, mode;
>      struct vcpu *v = current;
>  
> +    v->arch.in_host = 1;
> +
>      __vmread(GUEST_RIP,    &regs->rip);
>      __vmread(GUEST_RSP,    &regs->rsp);
>      __vmread(GUEST_RFLAGS, &regs->rflags);
> @@ -4112,6 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      case EXIT_REASON_MONITOR_TRAP_FLAG:
>          v->arch.hvm.vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
>          vmx_update_cpu_exec_control(v);
> +        vmx_stop_reexecute_instruction(v);
>          if ( v->arch.hvm.single_step )
>          {
>              hvm_monitor_debug(regs->rip,
> @@ -4330,6 +4583,8 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
>      if ( unlikely(curr->arch.hvm.vmx.lbr_flags & LBR_FIXUP_MASK) )
>          lbr_fixup();
>  
> +    curr->arch.in_host = 0;
> +
>      HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
>  
>      __vmwrite(GUEST_RIP,    regs->rip);
> diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
> index 2f1295e56a..5ae3a61b5c 100644
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -212,10 +212,11 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>      }
>      if ( vm_event_check_ring(d->vm_event_monitor) &&
>           d->arch.monitor.inguest_pagefault_disabled &&
> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> +         npfec.kind != npfec_kind_with_gla &&
> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>      {
> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> -
> +        v->arch.vm_event->emulate_flags = 0;
> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>          return true;
>      }

Don't you need to fallback to using hvm_emulate_one_vm_event if
start_reexecute_instruction is not available?
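
I.e. something like (untested sketch):

        v->arch.vm_event->emulate_flags = 0;

        if ( hvm_funcs.start_reexecute_instruction )
            hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
        else
            hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op,
                                     X86_EVENT_NO_EC);

        return true;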

>  
> @@ -226,6 +227,7 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
>          *req_ptr = req;
>  
>          req->reason = VM_EVENT_REASON_MEM_ACCESS;
> +

Unrelated change?

>          req->u.mem_access.gfn = gfn_x(gfn);
>          req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
>  
> @@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>      p2m_access_t a;
>      unsigned long gfn_l;
>      long rc = 0;
> +    struct vcpu *v;
> +    int i;
>  
>      /* altp2m view 0 is treated as the hostp2m */
>  #ifdef CONFIG_HVM
> @@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>          if ( rc )
>              break;
>  
> +        for_each_vcpu(d, v)
> +        {
> +            if ( !v->arch.rexec_level )
> +                continue;
> +
> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )

Is there any reason this has to be done backwards?

If you do it from 0 to v->arch.rexec_level you could use an unsigned
int as the index.

> +                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )

PFN_DOWN instead of the right shift, and maybe use gfn_eq instead of
converting gfn.

> +                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);

This is a guest physical address (given the field name), but you are
using the invalid machine frame number in order to set it. You likely
want to use INVALID_PADDR or gfn_x(INVALID_GFN) << PAGE_SHIFT.
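
E.g.:

                    v->arch.rexec_context[i].gpa =
                        gfn_x(INVALID_GFN) << PAGE_SHIFT;

(with the check in vmx_stop_reexecute_instruction() adjusted to compare
against the same value).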

> +        }
> +
>          /* Check for continuation if it's not the last iteration. */
>          if ( nr > ++start && !(start & mask) && hypercall_preempt_check() )
>          {
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index 277f99f633..dbb68f108a 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -438,6 +438,8 @@ struct arch_domain
>  
>      /* Emulated devices enabled bitmap. */
>      uint32_t emulation_flags;
> +
> +    spinlock_t rexec_lock;
>  } __cacheline_aligned;
>  
>  #ifdef CONFIG_HVM
> @@ -629,6 +631,22 @@ struct arch_vcpu
>      /* A secondary copy of the vcpu time info. */
>      XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
>  
> +#define REEXECUTION_MAX_DEPTH 8
> +    struct rexec_context_t {
> +        unsigned long gpa;
> +        xenmem_access_t old_access;
> +        xenmem_access_t cur_access;
> +        bool_t old_single_step;

bool please

> +    } rexec_context[REEXECUTION_MAX_DEPTH];

This is a fairly big amount of data that's only used if vm events are
enabled, could this be allocated on a per-guest basis?

> +
> +    int rexec_level;
> +
> +    /*
> +     *  Will be true when the vcpu is in VMX root,
> +     * false when it is not.
> +     */
> +    bool_t in_host;

bool.

> +
>      struct arch_vm_event *vm_event;
>  
>      struct vcpu_msrs *msrs;
> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
> index 3d3250dff0..1f5d43a98d 100644
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -167,6 +167,8 @@ struct hvm_function_table {
>  
>      int  (*cpu_up)(void);
>      void (*cpu_down)(void);
> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
> +                                        xenmem_access_t required_access);

I would name this reexecute_instruction, I don't think the start_
prefix adds any value to the handler.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
@ 2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 14:26     ` Jan Beulich
  2018-11-19 15:08     ` Roger Pau Monné
  2018-11-19 13:33   ` Jan Beulich
  2018-11-21 18:55   ` Razvan Cojocaru
  2 siblings, 2 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-19 13:30 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, rcojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>> +    if ( r == 0 && w == 0 && x == 0 )
>> +        new_access = XENMEM_access_n;
>> +    else if ( r == 0 && w == 0 && x == 1 )
>> +        new_access = XENMEM_access_x;
>> +    else if ( r == 0 && w == 1 && x == 0 )
>> +        new_access = XENMEM_access_w;
>> +    else if ( r == 0 && w == 1 && x == 1 )
>> +        new_access = XENMEM_access_wx;
>> +    else if ( r == 1 && w == 0 && x == 0 )
>> +        new_access = XENMEM_access_r;
>> +    else if ( r == 1 && w == 0 && x == 1 )
>> +        new_access = XENMEM_access_rx;
>> +    else if ( r == 1 && w == 1 && x == 0 )
>> +        new_access = XENMEM_access_rw;
>> +    else if ( r == 1 && w == 1 && x == 1 )
>> +        new_access = XENMEM_access_rwx;
>> +    else
>> +        new_access = required_access; /* Should never get here. */
> 
> There seems to be a lot of translation from xenmem_access_t to bool
> fields and then to xenmem_access_t again. Can't you just avoid the
> booleans?

The translation is done because the rights are cumulative and I think 
this is the clearest way to do it.


>>       if ( vm_event_check_ring(d->vm_event_monitor) &&
>>            d->arch.monitor.inguest_pagefault_disabled &&
>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>> +         npfec.kind != npfec_kind_with_gla &&
>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>       {
>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>> -
>> +        v->arch.vm_event->emulate_flags = 0;
>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>           return true;
>>       }
> 
> Don't you need to fallback to using hvm_emulate_one_vm_event if
> start_reexecute_instruction is not available?

Falling back to hvm_emulate_one_vm_event can result in losing events.

>> +        for_each_vcpu(d, v)
>> +        {
>> +            if ( !v->arch.rexec_level )
>> +                continue;
>> +
>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> 
> Is there any reason this has to be done backwards?
> 
> If you do it from 0 to v->arch.rexec_level you could use an unsigned
> int as the index.

This is done backwards because of the corresponding code in 
vmx_stop_reexecute_instruction(), but here it can be turned the other way 
if you insist on i being unsigned.

>> +#define REEXECUTION_MAX_DEPTH 8
>> +    struct rexec_context_t {
>> +        unsigned long gpa;
>> +        xenmem_access_t old_access;
>> +        xenmem_access_t cur_access;
>> +        bool_t old_single_step;
> 
> bool please
> 
>> +    } rexec_context[REEXECUTION_MAX_DEPTH];
> 
> This is fairly big amount of data that's only used if vm events are
> enabled, could this be allocated on a per-guest basis?

Yes, this can be moved to d->arch.monitor in the next version.

> 
>> +
>> +    int rexec_level;
>> +
>> +    /*
>> +     *  Will be true when the vcpu is in VMX root,
>> +     * false when it is not.
>> +     */
>> +    bool_t in_host;
> 
> bool.
> 
>> +
>>       struct arch_vm_event *vm_event;
>>   
>>       struct vcpu_msrs *msrs;
>> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
>> index 3d3250dff0..1f5d43a98d 100644
>> --- a/xen/include/asm-x86/hvm/hvm.h
>> +++ b/xen/include/asm-x86/hvm/hvm.h
>> @@ -167,6 +167,8 @@ struct hvm_function_table {
>>   
>>       int  (*cpu_up)(void);
>>       void (*cpu_down)(void);
>> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
>> +                                        xenmem_access_t required_access);
> 
> I would name this reexecute_instruction, I don't think the start_
> prefix adds any value to the handler.

Sure, I will drop the start_ prefix in the next version.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
@ 2018-11-19 13:33   ` Jan Beulich
  2018-11-21 18:55   ` Razvan Cojocaru
  2 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-19 13:33 UTC (permalink / raw)
  To: aisaila, Roger Pau Monne
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar

>>> On 16.11.18 at 18:04, <roger.pau@citrix.com> wrote:
> On Fri, Nov 16, 2018 at 10:06:36AM +0000, Alexandru Stefan ISAILA wrote:
>> @@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>>      p2m_access_t a;
>>      unsigned long gfn_l;
>>      long rc = 0;
>> +    struct vcpu *v;
>> +    int i;
>>  
>>      /* altp2m view 0 is treated as the hostp2m */
>>  #ifdef CONFIG_HVM
>> @@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
>>          if ( rc )
>>              break;
>>  
>> +        for_each_vcpu(d, v)
>> +        {
>> +            if ( !v->arch.rexec_level )
>> +                continue;
>> +
>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> 
> Is there any reason this has to be done backwards?
> 
> If you do it from 0 to v->arch.rexec_level you could use an unsigned
> int as the index.

And even if there's need for this going backwards the variable should
still be unsigned (using "for ( i = v->arch.rexec_level; i--; )" then,
presumably allowing the if() above to be dropped altogether).
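
I.e. roughly (untested, with i being unsigned int):

        for_each_vcpu(d, v)
            for ( i = v->arch.rexec_level; i--; )
                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);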

>> +                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
> 
> PFN_DOWN instead of the right shift, and maybe use gfn_eq instead of
> converting gfn.

ITYM gaddr_to_gfn() instead of PFN_DOWN.
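
I.e. something like (untested):

                if ( gfn_eq(gaddr_to_gfn(v->arch.rexec_context[i].gpa), gfn) )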

>> --- a/xen/include/asm-x86/hvm/hvm.h
>> +++ b/xen/include/asm-x86/hvm/hvm.h
>> @@ -167,6 +167,8 @@ struct hvm_function_table {
>>  
>>      int  (*cpu_up)(void);
>>      void (*cpu_down)(void);
>> +    int  (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
>> +                                        xenmem_access_t required_access);
> 
> I would name this reexecute_instruction, I don't think the start_
> prefix adds any value to the handler.

Or even just rexec_insn, to cut down on name length. I also
dislike the insertion point: This should live amidst the less "core"
hooks further down - there's already a block of three
introspection related hooks where this one would likely be a
good fit.

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
@ 2018-11-19 14:26     ` Jan Beulich
  2018-11-19 15:08     ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-19 14:26 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 19.11.18 at 14:30, <aisaila@bitdefender.com> wrote:
>> > +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>> +    if ( r == 0 && w == 0 && x == 0 )
>>> +        new_access = XENMEM_access_n;
>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>> +        new_access = XENMEM_access_x;
>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>> +        new_access = XENMEM_access_w;
>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>> +        new_access = XENMEM_access_wx;
>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>> +        new_access = XENMEM_access_r;
>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>> +        new_access = XENMEM_access_rx;
>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>> +        new_access = XENMEM_access_rw;
>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>> +        new_access = XENMEM_access_rwx;
>>> +    else
>>> +        new_access = required_access; /* Should never get here. */
>> 
>> There seems to be a lot of translation from xenmem_access_t to bool
>> fields and then to xenmem_access_t again. Can't you just avoid the
>> booleans?
> 
> The translation is done because the rights are cumulative and I think 
> this is the clear way to do this.

But then at the very least don't use == 0 and == 1, but
simple boolean tests.
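
I.e.:

    if ( !r && !w && !x )
        new_access = XENMEM_access_n;
    else if ( !r && !w && x )
        new_access = XENMEM_access_x;
    /* ... and so on for the remaining combinations. */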

>>>       if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>            d->arch.monitor.inguest_pagefault_disabled &&
>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>> +         npfec.kind != npfec_kind_with_gla &&
>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>       {
>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>> -
>>> +        v->arch.vm_event->emulate_flags = 0;
>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>           return true;
>>>       }
>> 
>> Don't you need to fallback to using hvm_emulate_one_vm_event if
>> start_reexecute_instruction is not available?
> 
> Fallback with hvm_emulate_one_vm_event can result in loosing events.

But is not doing anything at all going to result in even worse a
situation?

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 14:26     ` Jan Beulich
@ 2018-11-19 15:08     ` Roger Pau Monné
  2018-11-19 15:56       ` Alexandru Stefan ISAILA
  1 sibling, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-19 15:08 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
> >> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> >> +    if ( r == 0 && w == 0 && x == 0 )
> >> +        new_access = XENMEM_access_n;
> >> +    else if ( r == 0 && w == 0 && x == 1 )
> >> +        new_access = XENMEM_access_x;
> >> +    else if ( r == 0 && w == 1 && x == 0 )
> >> +        new_access = XENMEM_access_w;
> >> +    else if ( r == 0 && w == 1 && x == 1 )
> >> +        new_access = XENMEM_access_wx;
> >> +    else if ( r == 1 && w == 0 && x == 0 )
> >> +        new_access = XENMEM_access_r;
> >> +    else if ( r == 1 && w == 0 && x == 1 )
> >> +        new_access = XENMEM_access_rx;
> >> +    else if ( r == 1 && w == 1 && x == 0 )
> >> +        new_access = XENMEM_access_rw;
> >> +    else if ( r == 1 && w == 1 && x == 1 )
> >> +        new_access = XENMEM_access_rwx;
> >> +    else
> >> +        new_access = required_access; /* Should never get here. */
> > 
> > There seems to be a lot of translation from xenmem_access_t to bool
> > fields and then to xenmem_access_t again. Can't you just avoid the
> > booleans?
> 
> The translation is done because the rights are cumulative and I think 
> this is the clear way to do this.

So the switch converts required_access using the following relation:

_r   -> r = 1 w = 0 x = 0
_w   -> r = 0 w = 1 x = 0
_x   -> r = 0 w = 0 x = 1
_rx  -> r = 1 w = 0 x = 1
_wx  -> r = 0 w = 1 x = 1
_rw  -> r = 1 w = 1 x = 0
_rwx -> r = 1 w = 1 x = 1

Then the if below performs the following transformation:

r = 0 w = 0 x = 0 -> _n
r = 1 w = 0 x = 0 -> _r
r = 0 w = 1 x = 0 -> _w
r = 0 w = 0 x = 1 -> _x
r = 1 w = 1 x = 0 -> _rw
r = 0 w = 1 x = 1 -> _wx
r = 1 w = 0 x = 1 -> _rx
r = 1 w = 1 x = 1 -> _rwx

I'm not sure I understand this chunk of code, because you end up
getting exactly the same type that you have as the input, and a type
not listed here is just silently passed through, so I don't see the
point in doing this transformation.

> 
> >>       if ( vm_event_check_ring(d->vm_event_monitor) &&
> >>            d->arch.monitor.inguest_pagefault_disabled &&
> >> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> >> +         npfec.kind != npfec_kind_with_gla &&
> >> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
> >>       {
> >> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> >> -
> >> +        v->arch.vm_event->emulate_flags = 0;
> >> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
> >>           return true;
> >>       }
> > 
> > Don't you need to fallback to using hvm_emulate_one_vm_event if
> > start_reexecute_instruction is not available?
> 
> Fallback with hvm_emulate_one_vm_event can result in loosing events.

But by changing this here unconditionally you are removing this
functionality on AMD hardware, which it used to have before by making
use of hvm_emulate_one_vm_event.

I think this needs to at least be written in the commit message.

> >> +        for_each_vcpu(d, v)
> >> +        {
> >> +            if ( !v->arch.rexec_level )
> >> +                continue;
> >> +
> >> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
> > 
> > Is there any reason this has to be done backwards?
> > 
> > If you do it from 0 to v->arch.rexec_level you could use an unsigned
> > int as the index.
> 
> This is done backwards because of the corresponding code in 
> vmx_stop_reexecute_instruction() but here it can be turned the other way 
> if you insist on i to be unsigned.

Yes, Jan has also suggested a way to make i unsigned while keeping the
loop backwards, but I don't see the point of performing the loop
backwards if there's no need.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 15:08     ` Roger Pau Monné
@ 2018-11-19 15:56       ` Alexandru Stefan ISAILA
  2018-11-21  9:56         ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-19 15:56 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 19.11.2018 17:08, Roger Pau Monné wrote:
> On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
>>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>>> +    if ( r == 0 && w == 0 && x == 0 )
>>>> +        new_access = XENMEM_access_n;
>>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>>> +        new_access = XENMEM_access_x;
>>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>>> +        new_access = XENMEM_access_w;
>>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>>> +        new_access = XENMEM_access_wx;
>>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>>> +        new_access = XENMEM_access_r;
>>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>>> +        new_access = XENMEM_access_rx;
>>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>>> +        new_access = XENMEM_access_rw;
>>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>>> +        new_access = XENMEM_access_rwx;
>>>> +    else
>>>> +        new_access = required_access; /* Should never get here. */
>>>
>>> There seems to be a lot of translation from xenmem_access_t to bool
>>> fields and then to xenmem_access_t again. Can't you just avoid the
>>> booleans?
>>
>> The translation is done because the rights are cumulative and I think
>> this is the clear way to do this.
> 
> So the switch converts required_access using the following relation:
> 
> _r   -> r = 1 w = 0 x = 0
> _w   -> r = 0 w = 1 x = 0
> _x   -> r = 0 w = 0 x = 1
> _rx  -> r = 1 w = 1 x = 0
> _wx  -> r = 0 w = 1 x = 1
> _rw  -> r = 1 w = 1 x = 0
> _rwx -> r = 1 w = 1 x = 1
> 
> Then the if below performs the following transformation:
> 
> r = 0 w = 0 x = 0 -> _n
> r = 1 w = 0 x = 0 -> _r
> r = 0 w = 1 x = 0 -> _w
> r = 0 w = 0 x = 1 -> _x
> r = 1 w = 1 x = 0 -> _rw
> r = 0 w = 1 x = 1 -> _wx
> r = 1 w = 1 x = 0 -> _rw
> r = 1 w = 1 x = 1 -> _rwx
> 
> I'm not sure I understand this chunk of code, because you end up
> getting exactly the same type that you have as the input, and a type
> not listed here is just silently passed through, so I don't see the
> point in doing this transformation.

The first switch is for cur_access and sets r, w, x accordingly;
the second switch is for required_access, where further r, w, x bits are
accumulated; and then in the final if()/else chain new_access is assigned
according to the resulting r, w, x values.

> 
>>
>>>>        if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>>             d->arch.monitor.inguest_pagefault_disabled &&
>>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>> +         npfec.kind != npfec_kind_with_gla &&
>>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>>        {
>>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>>> -
>>>> +        v->arch.vm_event->emulate_flags = 0;
>>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>>            return true;
>>>>        }
>>>
>>> Don't you need to fallback to using hvm_emulate_one_vm_event if
>>> start_reexecute_instruction is not available?
>>
>> Fallback with hvm_emulate_one_vm_event can result in loosing events.
> 
> But by changing this here unconditionally you are removing this
> functionality on AMD hardware, which it used to have before by making
> use of hvm_emulate_one_vm_event.
> 
> I think this needs to at least be written in the commit message.

For AMD I could add an if ( cpu_has_svm() ) check and call 
emulate_one_vm_event. Introspection-wise, losing valuable info is not a 
good thing, because it could result in a security breach.

> 
>>>> +        for_each_vcpu(d, v)
>>>> +        {
>>>> +            if ( !v->arch.rexec_level )
>>>> +                continue;
>>>> +
>>>> +            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
>>>
>>> Is there any reason this has to be done backwards?
>>>
>>> If you do it from 0 to v->arch.rexec_level you could use an unsigned
>>> int as the index.
>>
>> This is done backwards because of the corresponding code in
>> vmx_stop_reexecute_instruction() but here it can be turned the other way
>> if you insist on i to be unsigned.
> 
> Yes, Jan has also suggested a way to make i unsigned while keeping the
> loop backwards, but I don't see the point of performing the loop
> backwards if there's no need.
> 

There is no problem here, I can change it in the next version.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-19 15:56       ` Alexandru Stefan ISAILA
@ 2018-11-21  9:56         ` Roger Pau Monné
  2018-11-21 10:28           ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-21  9:56 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
> 
> 
> On 19.11.2018 17:08, Roger Pau Monné wrote:
> > On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
> >>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
> >>>> +    if ( r == 0 && w == 0 && x == 0 )
> >>>> +        new_access = XENMEM_access_n;
> >>>> +    else if ( r == 0 && w == 0 && x == 1 )
> >>>> +        new_access = XENMEM_access_x;
> >>>> +    else if ( r == 0 && w == 1 && x == 0 )
> >>>> +        new_access = XENMEM_access_w;
> >>>> +    else if ( r == 0 && w == 1 && x == 1 )
> >>>> +        new_access = XENMEM_access_wx;
> >>>> +    else if ( r == 1 && w == 0 && x == 0 )
> >>>> +        new_access = XENMEM_access_r;
> >>>> +    else if ( r == 1 && w == 0 && x == 1 )
> >>>> +        new_access = XENMEM_access_rx;
> >>>> +    else if ( r == 1 && w == 1 && x == 0 )
> >>>> +        new_access = XENMEM_access_rw;
> >>>> +    else if ( r == 1 && w == 1 && x == 1 )
> >>>> +        new_access = XENMEM_access_rwx;
> >>>> +    else
> >>>> +        new_access = required_access; /* Should never get here. */
> >>>
> >>> There seems to be a lot of translation from xenmem_access_t to bool
> >>> fields and then to xenmem_access_t again. Can't you just avoid the
> >>> booleans?
> >>
> >> The translation is done because the rights are cumulative and I think
> >> this is the clear way to do this.
> > 
> > So the switch converts required_access using the following relation:
> > 
> > _r   -> r = 1 w = 0 x = 0
> > _w   -> r = 0 w = 1 x = 0
> > _x   -> r = 0 w = 0 x = 1
> > _rx  -> r = 1 w = 0 x = 1
> > _wx  -> r = 0 w = 1 x = 1
> > _rw  -> r = 1 w = 1 x = 0
> > _rwx -> r = 1 w = 1 x = 1
> > 
> > Then the if below performs the following transformation:
> > 
> > r = 0 w = 0 x = 0 -> _n
> > r = 1 w = 0 x = 0 -> _r
> > r = 0 w = 1 x = 0 -> _w
> > r = 0 w = 0 x = 1 -> _x
> > r = 1 w = 1 x = 0 -> _rw
> > r = 0 w = 1 x = 1 -> _wx
> > r = 1 w = 0 x = 1 -> _rx
> > r = 1 w = 1 x = 1 -> _rwx
> > 
> > I'm not sure I understand this chunk of code, because you end up
> > getting exactly the same type that you have as the input, and a type
> > not listed here is just silently passed through, so I don't see the
> > point in doing this transformation.
> 
> The first switch is for cur_access and sets r, w, x accordingly; the
> second switch is for required_access, where the r, w, x bits are
> accumulated; then, in the final if() chain, new_access is assigned
> according to the resulting r, w, x values.

I would move the code that converts xenmem_access_t into a separate
helper (as it's used in two different places), and use a bitmap
instead of 3 boolean variables, so you can do:

void convert_access(xenmem_access_t *access, unsigned int *attr)

And don't need to repeat the switch in two different places.
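As a rough illustration only -- the ACCESS_* bit names are invented for
the example, and I've split it into two small helpers rather than the
single convert_access() prototype above:

/* Illustrative flag names only. */
#define ACCESS_R (1u << 0)
#define ACCESS_W (1u << 1)
#define ACCESS_X (1u << 2)

static void xenmem_access_to_bits(xenmem_access_t access, unsigned int *attr)
{
    switch ( access )
    {
    case XENMEM_access_r:   *attr |= ACCESS_R; break;
    case XENMEM_access_w:   *attr |= ACCESS_W; break;
    case XENMEM_access_x:   *attr |= ACCESS_X; break;
    case XENMEM_access_rw:  *attr |= ACCESS_R | ACCESS_W; break;
    case XENMEM_access_rx:  *attr |= ACCESS_R | ACCESS_X; break;
    case XENMEM_access_wx:  *attr |= ACCESS_W | ACCESS_X; break;
    case XENMEM_access_rwx: *attr |= ACCESS_R | ACCESS_W | ACCESS_X; break;
    default:                break;
    }
}

static xenmem_access_t bits_to_xenmem_access(unsigned int attr)
{
    static const xenmem_access_t map[] = {
        [0]                              = XENMEM_access_n,
        [ACCESS_R]                       = XENMEM_access_r,
        [ACCESS_W]                       = XENMEM_access_w,
        [ACCESS_X]                       = XENMEM_access_x,
        [ACCESS_R | ACCESS_W]            = XENMEM_access_rw,
        [ACCESS_R | ACCESS_X]            = XENMEM_access_rx,
        [ACCESS_W | ACCESS_X]            = XENMEM_access_wx,
        [ACCESS_R | ACCESS_W | ACCESS_X] = XENMEM_access_rwx,
    };

    return map[attr & (ACCESS_R | ACCESS_W | ACCESS_X)];
}

Both call sites would then just OR bits into the same attr and convert
back to a xenmem_access_t once at the end.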

> > 
> >>
> >>>>        if ( vm_event_check_ring(d->vm_event_monitor) &&
> >>>>             d->arch.monitor.inguest_pagefault_disabled &&
> >>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
> >>>> +         npfec.kind != npfec_kind_with_gla &&
> >>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
> >>>>        {
> >>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
> >>>> -
> >>>> +        v->arch.vm_event->emulate_flags = 0;
> >>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
> >>>>            return true;
> >>>>        }
> >>>
> >>> Don't you need to fall back to using hvm_emulate_one_vm_event if
> >>> start_reexecute_instruction is not available?
> >>
> >> Falling back to hvm_emulate_one_vm_event can result in losing events.
> > 
> > But by changing this here unconditionally you are removing this
> > functionality on AMD hardware, which it used to have before by making
> > use of hvm_emulate_one_vm_event.
> > 
> > I think this needs to at least be written in the commit message.
> 
> For AMD I could add if (cpu_has_svm()) and call emulate_one_vm_event. 

I would just use hvm_emulate_one_vm_event if
hvm_funcs.start_reexecute_instruction is unset, or else an explanation
needs to be added to the commit message about why
hvm_emulate_one_vm_event is not suitable.

Also, after looking at the code I'm not sure I see why this needs to
be VMX specific, AFAICT it doesn't directly call any VMX functions?

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21  9:56         ` Roger Pau Monné
@ 2018-11-21 10:28           ` Alexandru Stefan ISAILA
  2018-11-21 11:41             ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-21 10:28 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 21.11.2018 11:56, Roger Pau Monné wrote:
> On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
>>
>>
>> On 19.11.2018 17:08, Roger Pau Monné wrote:
>>> On Mon, Nov 19, 2018 at 01:30:09PM +0000, Alexandru Stefan ISAILA wrote:
>>>>>> +    /* Now transform our RWX values in a XENMEM_access_* constant. */
>>>>>> +    if ( r == 0 && w == 0 && x == 0 )
>>>>>> +        new_access = XENMEM_access_n;
>>>>>> +    else if ( r == 0 && w == 0 && x == 1 )
>>>>>> +        new_access = XENMEM_access_x;
>>>>>> +    else if ( r == 0 && w == 1 && x == 0 )
>>>>>> +        new_access = XENMEM_access_w;
>>>>>> +    else if ( r == 0 && w == 1 && x == 1 )
>>>>>> +        new_access = XENMEM_access_wx;
>>>>>> +    else if ( r == 1 && w == 0 && x == 0 )
>>>>>> +        new_access = XENMEM_access_r;
>>>>>> +    else if ( r == 1 && w == 0 && x == 1 )
>>>>>> +        new_access = XENMEM_access_rx;
>>>>>> +    else if ( r == 1 && w == 1 && x == 0 )
>>>>>> +        new_access = XENMEM_access_rw;
>>>>>> +    else if ( r == 1 && w == 1 && x == 1 )
>>>>>> +        new_access = XENMEM_access_rwx;
>>>>>> +    else
>>>>>> +        new_access = required_access; /* Should never get here. */
>>>>>
>>>>> There seems to be a lot of translation from xenmem_access_t to bool
>>>>> fields and then to xenmem_access_t again. Can't you just avoid the
>>>>> booleans?
>>>>
>>>> The translation is done because the rights are cumulative and I think
>>>> this is the clear way to do this.
>>>
>>> So the switch converts required_access using the following relation:
>>>
>>> _r   -> r = 1 w = 0 x = 0
>>> _w   -> r = 0 w = 1 x = 0
>>> _x   -> r = 0 w = 0 x = 1
>>> _rx  -> r = 1 w = 0 x = 1
>>> _wx  -> r = 0 w = 1 x = 1
>>> _rw  -> r = 1 w = 1 x = 0
>>> _rwx -> r = 1 w = 1 x = 1
>>>
>>> Then the if below performs the following transformation:
>>>
>>> r = 0 w = 0 x = 0 -> _n
>>> r = 1 w = 0 x = 0 -> _r
>>> r = 0 w = 1 x = 0 -> _w
>>> r = 0 w = 0 x = 1 -> _x
>>> r = 1 w = 1 x = 0 -> _rw
>>> r = 0 w = 1 x = 1 -> _wx
>>> r = 1 w = 0 x = 1 -> _rx
>>> r = 1 w = 1 x = 1 -> _rwx
>>>
>>> I'm not sure I understand this chunk of code, because you end up
>>> getting exactly the same type that you have as the input, and a type
>>> not listed here is just silently passed through, so I don't see the
>>> point in doing this transformation.
>>
>> The first switch is for cur_access and sets r, w, x accordingly; the
>> second switch is for required_access, where the r, w, x bits are
>> accumulated; then, in the final if() chain, new_access is assigned
>> according to the resulting r, w, x values.
> 
> I would move the code that converts xenmem_access_t into a separate
> helper (as it's used in two different places), and use a bitmap
> instead of 3 boolean variables, so you can do:
> 
> void convert_access(xenmem_access_t *access, unsigned int *attr)
> 
> And don't need to repeat the switch in two different places.

This is a good idea, and with it I can also remove the new_access 
assignment.

> 
>>>
>>>>
>>>>>>         if ( vm_event_check_ring(d->vm_event_monitor) &&
>>>>>>              d->arch.monitor.inguest_pagefault_disabled &&
>>>>>> -         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
>>>>>> +         npfec.kind != npfec_kind_with_gla &&
>>>>>> +         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
>>>>>>         {
>>>>>> -        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
>>>>>> -
>>>>>> +        v->arch.vm_event->emulate_flags = 0;
>>>>>> +        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
>>>>>>             return true;
>>>>>>         }
>>>>>
>>>>> Don't you need to fall back to using hvm_emulate_one_vm_event if
>>>>> start_reexecute_instruction is not available?
>>>>
>>>> Falling back to hvm_emulate_one_vm_event can result in losing events.
>>>
>>> But by changing this here unconditionally you are removing this
>>> functionality on AMD hardware, which it used to have before by making
>>> use of hvm_emulate_one_vm_event.
>>>
>>> I think this needs to at least be written in the commit message.
>>
>> For AMD I could add if (cpu_has_svm()) and call emulate_one_vm_event.
> 
> I would just use hvm_emulate_one_vm_event if
> hvm_funcs.start_reexecute_instruction is unset, or else an explanation
> needs to be added to the commit message about why
> hvm_emulate_one_vm_event is not suitable.

Yes, that is what I was about to add in v2. I will add a note in the 
commit message as well.
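Roughly like this (just an untested sketch of the v2 hunk):

        if ( vm_event_check_ring(d->vm_event_monitor) &&
             d->arch.monitor.inguest_pagefault_disabled &&
             npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
        {
            if ( hvm_funcs.start_reexecute_instruction )
            {
                v->arch.vm_event->emulate_flags = 0;
                hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
            }
            else
                hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op,
                                         X86_EVENT_NO_EC);

            return true;
        }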


> Also, after looking at the code I'm not sure I see why this needs to
> be VMX specific, AFAICT it doesn't directly call any VMX functions?
> 

It is VMX specific because SVM does not have single step. We talked 
about this in the past and it turned out that it was too much trouble 
to make a custom single step.

Regards,
Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 10:28           ` Alexandru Stefan ISAILA
@ 2018-11-21 11:41             ` Roger Pau Monné
  2018-11-21 12:00               ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-21 11:41 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On Wed, Nov 21, 2018 at 10:28:18AM +0000, Alexandru Stefan ISAILA wrote:
> 
> 
> On 21.11.2018 11:56, Roger Pau Monné wrote:
> > On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
> >> On 19.11.2018 17:08, Roger Pau Monné wrote:
> > Also, after looking at the code I'm not sure I see why this needs to
> > be VMX specific, AFAICT it doesn't directly call any VMX functions?
> > 
> 
> It is VMX specific because SVM does not have single step. We talked 
> about this in the past and it turned out that it was too much trouble 
> to make a custom single step.

I still think this shouldn't be VMX specific, and you should just
return -EOPNOTSUPP if single stepping is not supported, just like
hvm_debug_op does. In fact I'm missing a helper to set single
stepping, which would be the right place to return -EOPNOTSUPP.

Then your rexec wouldn't need to know anything about the hardware and
would just attempt to set single stepping, failing if it cannot be
enabled.
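Something along these lines (only a sketch -- the helper name is made
up, and the flag it sets should be whatever hvm_debug_op() uses):

static int hvm_enable_singlestep(struct vcpu *v)
{
    if ( !hvm_is_singlestep_supported() )
        return -EOPNOTSUPP;

    /* Same flag hvm_debug_op() flips for XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON. */
    v->arch.hvm.single_step = true;

    return 0;
}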

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 11:41             ` Roger Pau Monné
@ 2018-11-21 12:00               ` Alexandru Stefan ISAILA
  0 siblings, 0 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-21 12:00 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, rcojocaru, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar



On 21.11.2018 13:41, Roger Pau Monné wrote:
> On Wed, Nov 21, 2018 at 10:28:18AM +0000, Alexandru Stefan ISAILA wrote:
>>
>>
>> On 21.11.2018 11:56, Roger Pau Monné wrote:
>>> On Mon, Nov 19, 2018 at 03:56:14PM +0000, Alexandru Stefan ISAILA wrote:
>>>> On 19.11.2018 17:08, Roger Pau Monné wrote:
>>> Also, after looking at the code I'm not sure I see why this needs to
>>> be VMX specific, AFAICT it doesn't directly call any VMX functions?
>>>
>>
>> It is VMX specific because SVM does not have single step. We talked
>> about this in the past and it turned out that it was too much trouble
>> to make a custom single step.
> 
> I still think this shouldn't be VMX specific, and you should just
> return -EOPNOTSUPP if single stepping is not supported, just like
> hvm_debug_op does. In fact I'm missing a helper to set single
> stepping, which would be the right place to return -EOPNOTSUPP.
> 
> Then your rexec wouldn't need to know anything about the hardware and
> would just attempt to set single stepping, failing if it cannot be
> enabled.
> 

There is a helper function for single step, hvm_is_singlestep_supported(), 
and we have the d->arch.monitor.inguest_pagefault_disabled monitor flag. 
I can return not supported from the xc_monitor_inguest_pagefault call and 
save time in the process.
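I.e. in whichever handler ends up enabling inguest_pagefault_disabled, 
something as simple as (sketch):

    if ( !hvm_is_singlestep_supported() )
        return -EOPNOTSUPP;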

~Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-16 17:04 ` Roger Pau Monné
  2018-11-19 13:30   ` Alexandru Stefan ISAILA
  2018-11-19 13:33   ` Jan Beulich
@ 2018-11-21 18:55   ` Razvan Cojocaru
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:05     ` Roger Pau Monné
  2 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-21 18:55 UTC (permalink / raw)
  To: Roger Pau Monné, Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar

On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>> +            if ( a == v )
>> +                continue;
>> +
>> +            /* Pause, synced. */
>> +            while ( !a->arch.in_host )
> Why not use a->is_running as a way to know whether the vCPU is
> running?
> 
> I think the logic of using vcpu_pause and expecting the running vcpu
> to take a vmexit and thus set in_host is wrong because a vcpu that
> wasn't running when vcpu_pause_nosync is called won't get scheduled
> anymore, thus not taking a vmexit and this function will lockup.
> 
> I don't think you need the in_host boolean at all.
> 
>> +                cpu_relax();
> Is this really better than using vcpu_pause?
> 
> I assume this is done to avoid waiting on each vcpu, and instead doing
> it here likely means less wait time?

The problem with plain vcpu_pause() is that we weren't able to use it,
for the same reason (which remains unclear as of yet) that we couldn't
use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
one that uses the same logic, but loops on a->is_running instead of
!a->arch.in_host:

(XEN) [ 3663.19(XEN) [ 3667.995061] Watchdog timer detects that CPU0 is
stuck!
(XEN) [ 3668.000694] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.007108] CPU:    0
(XEN) [ 3668.009882] RIP:    e008:[<ffff82d0801327d2>]
vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
(XEN) [ 3668.023575] rax: 0000000000000001   rbx: ffff83007ccfc000
rcx: ffff83007ccfc128
(XEN) [ 3668.031548] rdx: 0000000000000000   rsi: 0000000000000246
rdi: ffff830c52984148
(XEN) [ 3668.039522] rbp: ffff83007cf2fcd8   rsp: ffff83007cf2fcc8   r8:
 0000000000000003
(XEN) [ 3668.047495] r9:  0000000000000000   r10: ffff82d080348460
r11: 0000000000000000
(XEN) [ 3668.055465] r12: ffff82d080132792   r13: ffff830b172b4000
r14: ffff82c000225000
(XEN) [ 3668.063439] r15: 00000000000f0000   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3668.071415] cr3: 0000000b4ba94000   cr2: 00007f6161714f70
(XEN) [ 3668.077308] fsb: 00007f9164f088c0   gsb: ffff880276c00000
gss: 0000000000000000
(XEN) [ 3668.085280] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
e010   cs: e008
(XEN) [ 3668.092731] Xen code around <ffff82d0801327d2>
(vcpu_sleep_sync+0x40/0x71):
(XEN) [ 3668.100186]  01 00 00 00 74 24 f3 90 <8b> 11 48 8b 43 10 8b 80
dc 01 00 00 09 d0 48 98
(XEN) [ 3668.108593] Xen stack trace from rsp=ffff83007cf2fcc8:
(XEN) [ 3668.114223]    0000000000000240 ffff83007ccfc000
ffff83007cf2fd08 ffff82d08010735b
(XEN) [ 3668.122282]    ffff82d0801358ad ffff830b172b4000
0000000000000240 0000000000000048
(XEN) [ 3668.130346]    ffff83007cf2fd18 ffff82d08010879a
ffff83007cf2fd88 ffff82d080245e69
(XEN) [ 3668.138402]    ffff83007d615000 ffff830b172b4658
ffff83007cf2fd48 00000000000f0000
(XEN) [ 3668.146464]    00007f9164fb8004 0000000000000048
ffff830c52974000 0000000000000006
(XEN) [ 3668.154523]    ffffffffffffffff ffffffffffffffea
00007f9164fb1004 0000000000000000
(XEN) [ 3668.162584]    ffff83007cf2fe48 ffff82d0801dd8f5
ffff82d080374d58 ffff82d08024b308
(XEN) [ 3668.170643]    ffff83007cf2fdc8 ffff83007cf2ffff
ffff83007cf2fdc8 ffff830b172b4000
(XEN) [ 3668.178704]    0000024000000001 00000000000f0000
00007f9164fb8004 fffffffffffffffc
(XEN) [ 3668.186763]    0000000000000293 00007f91631f85d3
ffff82d080250834 ffff82d080250828
(XEN) [ 3668.194820]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff83007cf2fef8
(XEN) [ 3668.202882]    0000000000000022 ffff82d0801dc037
deadbeefdeadf00d ffffffff8100144a
(XEN) [ 3668.210942]    ffff83007cf2fee8 ffff82d080172aca
02ff82d080250834 0000000000000006
(XEN) [ 3668.219000]    00007f9164fb1004 deadbeefdeadf00d
deadbeefdeadf00d deadbeefdeadf00d
(XEN) [ 3668.227062]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff82d080250828
(XEN) [ 3668.235121]    ffff82d080250834 ffff82d080250828
ffff82d080250834 ffff83007d615000
(XEN) [ 3668.243180]    0000000000000000 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.251240]    00007cff830d00e7 ffff82d080250899
00007ffef6baf1d0 0000000000305000
(XEN) [ 3668.259298]    ffff88022740b900 fffffffffffffff2
ffff88022b31fe98 ffff88026f3374d8
(XEN) [ 3668.267361]    0000000000000282 0000000000000000
ffff88007c995080 0000000000000000
(XEN) [ 3668.275417] Xen call trace:
(XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.284952]    [<ffff82d08010735b>]
domain.c#do_domain_pause+0x33/0x4f
(XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
(XEN) [ 3668.297952]    [<ffff82d080245e69>]
hap_track_dirty_vram+0x2c1/0x4a7
(XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
(XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
(XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
(XEN) [ 3668.323689]
(XEN) [ 3668.325685]
(XEN) [ 3668.327678] ****************************************
(XEN) [ 3668.333138] Panic on CPU 0:
(XEN) [ 3668.336428] FATAL TRAP: vector = 2 (nmi)
(XEN) [ 3668.340850] [error_code=0000]
(XEN) [ 3668.344404] ****************************************
(XEN) [ 3668.349863]
(XEN) [ 3668.351854] Reboot in five seconds...
(XEN) [ 3668.356017] Dumping other CPUs
(XEN) [ 3668.359567] *** Dumping CPU1 host state: ***
(XEN) [ 3668.364337] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.370750] CPU:    1
(XEN) [ 3668.373522] RIP:    e008:[<ffff82d08016b5a6>]
domain.c#default_idle+0xa2/0xb5
(XEN) [ 3668.381149] RFLAGS: 0000000000000202   CONTEXT: hypervisor
(XEN) [ 3668.387128] rax: 0000000000000000   rbx: ffff830c529b7fff
rcx: 0000000000000048
(XEN) [ 3668.395101] rdx: 0000000000000000   rsi: ffff830c529b7fff
rdi: ffff830c529b7ef8
(XEN) [ 3668.403076] rbp: ffff830c529b7ed0   rsp: ffff830c529b7ed0   r8:
 ffff830c529fe4a8
(XEN) [ 3668.411048] r9:  ffff830c529bac20   r10: ffff830c529fe490
r11: ffff830c529ba148
(XEN) [ 3668.419019] r12: ffff830c529ba140   r13: ffff83007cf75000
r14: 000003540fd7cd6b
(XEN) [ 3668.426994] r15: ffffffffffffffff   cr0: 000000008005003b
cr4: 00000000003526e0
(XEN) [ 3668.434964] cr3: 000000007cf1d000   cr2: 0000000000000000
(XEN) [ 3668.440861] fsb: 0000000000000000   gsb: 0000000000000000
gss: 0000000000000000
(XEN) [ 3668.448832] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3668.456285] Xen code around <ffff82d08016b5a6>
(domain.c#default_idle+0xa2/0xb5):
(XEN) [ 3668.464260]  00 00 00 0f 30 90 fb f4 <0f> b6 46 f5 80 a7 fd 00
00 00 fe 0f 30 90 eb 01
(XEN) [ 3668.472663] Xen stack trace from rsp=ffff830c529b7ed0:
(XEN) [ 3668.478297]    ffff830c529b7ef0 ffff82d08016b628
ffff82d080134ffe ffff83007cf75000
(XEN) [ 3668.486358]    ffff830c529b7df0 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.494417]    0000000000000000 00000000001c3a38
0000000000000000 0000000000000000
(XEN) [ 3668.502478]    0000000000000000 0000000000000000
0000000000000000 0000000000000000
(XEN) [ 3668.510538]    00000000000002ff 00000000001c00e9
0000000000000000 0000000000000000
(XEN) [ 3668.518595]    0000beef0000beef 0000000000103f15
000000bf0000beef 0000000000000046
(XEN) [ 3668.526656]    00000000001c3a38 000000000000beef
ffffea000d5bbeef ffffea000d5bbeef
(XEN) [ 3668.534715]    000000000000beef 000000000000beef
017fffc000000001 ffff83007cf75000
(XEN) [ 3668.542775]    0000003bd2646380 00000000003526e0
0000000000000000 0000000c5299e000
(XEN) [ 3668.550837]    0000070100000000 0000000000000000
(XEN) [ 3668.555948] Xen call trace:
(XEN) [ 3668.559242]    [<ffff82d08016b5a6>] domain.c#default_idle+0xa2/0xb5
(XEN) [ 3668.566000]    [<ffff82d08016b628>] domain.c#idle_loop+0x57/0x6e
(XEN) [ 3668.572502]
(XEN) [ 3668.574494] *** Dumping CPU2 host state: ***
(XEN) [ 3668.579261] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.585675] CPU:    2
(XEN) [ 3668.588449] RIP:    e008:[<ffff82d080127880>]
queue_read_lock_slowpath+0x27/0x4d
(XEN) [ 3668.596332] RFLAGS: 0000000000000286   CONTEXT: hypervisor (d1v1)
(XEN) [ 3668.602919] rax: 00000000000000ff   rbx: ffff830b1b2b6980
rcx: 0000000000000000
(XEN) [ 3668.610893] rdx: ffff830c52997fff   rsi: 0000000000000009
rdi: ffff830b1b2b698a
(XEN) [ 3668.618865] rbp: ffff830c52997a68   rsp: ffff830c52997a58   r8:
 0000000000000000
(XEN) [ 3668.626837] r9:  0000000000000003   r10: 0000000000000000
r11: 0000000000000000
(XEN) [ 3668.634812] r12: ffff830b1b2b6984   r13: ffff830c52997aa4
r14: ffff830c52997c34
(XEN) [ 3668.642786] r15: 00000000000001aa   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3668.650759] cr3: 0000000b105ef000   cr2: 00000190068c3000
(XEN) [ 3668.656650] fsb: 0000000000000000   gsb: 0000000000000000
gss: 0000004f58bd3000
(XEN) [ 3668.664624] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3668.672077] Xen code around <ffff82d080127880>
(queue_read_lock_slowpath+0x27/0x4d):
(XEN) [ 3668.680309]  84 c0 74 08 f3 90 8b 03 <84> c0 75 f8 b8 00 01 00
00 f0 0f c1 03 3c ff 75
(XEN) [ 3668.688717] Xen stack trace from rsp=ffff830c52997a58:
(XEN) [ 3668.694351]    ffff830b1b2b6980 ffff830c52997b54
ffff830c52997ad8 ffff82d08020c1df
(XEN) [ 3668.702411]    ffff830c52997b08 ffff82d080217db4
ffff830b172b4000 0000000352997c44
(XEN) [ 3668.710468]    000000000db12f43 0000000000000000
ffff830c00000000 00000000000001aa
(XEN) [ 3668.718529]    ffff830b1b2b6980 fffff801a1e18d03
ffff830c52997c34 ffff830078ba7000
(XEN) [ 3668.726591]    ffff830c52997b88 ffff82d080247208
ffff830b1b2b6980 ffff830c52997c44
(XEN) [ 3668.734648]    0000000000000000 fffff801a1e18d03
ffff830c52997b68 ffff82d08020bf20
(XEN) [ 3668.742707]    0000000000000000 0000000208a008e3
ffff830c52997b58 0000000400000000
(XEN) [ 3668.750768]    0000000000008000 0000000000000000
ffff830c52997be0 0000000000000000
(XEN) [ 3668.758826]    0000000000000000 ffff830078ba7000
ffff830c52997c34 fffff801a1e18d03
(XEN) [ 3668.766888]    ffff830b1b2b6980 ffff82d080311520
ffff830c52997b98 ffff82d080247475
(XEN) [ 3668.774945]    ffff830c52997be8 ffff82d080212751
0000000000008000 ffffef07c38b76b0
(XEN) [ 3668.783006]    0000000000000010 fffff801a1e18d03
fffff801a1e18d03 0000000000000d03
(XEN) [ 3668.791067]    000fffff801a1e18 ffff830c52997ef8
ffff830c52997c78 ffff82d0801d66a0
(XEN) [ 3668.799128]    ffffef07c38b7708 ffff830c52997c44
ffff830c52997c34 0000000000000004
(XEN) [ 3668.807188]    ffff830c52997d38 0000001000000004
ffff830078ba7000 0000001100000010
(XEN) [ 3668.815244]    ffffea000d59beef ffffea000d59beef
000000000000beef ffff830c52997d10
(XEN) [ 3668.823304]    ffff830078ba7000 0000000000000001
0000000000000000 ffff830c52997ef8
(XEN) [ 3668.831363]    ffff830c52997c88 ffff82d0801d844d
ffff830c52997ce8 ffff82d0801d13da
(XEN) [ 3668.839423]    ffff830c52997d38 ffff82d0803107e0
0000000000000000 fffff801a1e18d03
(XEN) [ 3668.847484]    ffff830c52997cd8 ffff830078ba7000
ffff830c52997d10 000000000000002c
(XEN) [ 3668.855544] Xen call trace:
(XEN) [ 3668.858838]    [<ffff82d080127880>]
queue_read_lock_slowpath+0x27/0x4d
(XEN) [ 3668.865857]    [<ffff82d08020c1df>]
get_page_from_gfn_p2m+0x14e/0x3b0
(XEN) [ 3668.872792]    [<ffff82d080247208>]
hap_p2m_ga_to_gfn_4_levels+0x48/0x299
(XEN) [ 3668.880071]    [<ffff82d080247475>]
hap_gva_to_gfn_4_levels+0x1c/0x1e
(XEN) [ 3668.887004]    [<ffff82d080212751>] paging_gva_to_gfn+0x10e/0x11d
(XEN) [ 3668.893590]    [<ffff82d0801d66a0>] hvm.c#__hvm_copy+0x98/0x37f
(XEN) [ 3668.900003]    [<ffff82d0801d844d>]
hvm_fetch_from_guest_virt_nofault+0x14/0x16
(XEN) [ 3668.907801]    [<ffff82d0801d13da>]
emulate.c#_hvm_emulate_one+0x118/0x2bc
(XEN) [ 3668.915168]    [<ffff82d0801d1674>] hvm_emulate_one+0x10/0x12
(XEN) [ 3668.921409]    [<ffff82d0801e08c2>] handle_mmio+0x52/0xc9
(XEN) [ 3668.927303]    [<ffff82d0802034a2>]
vmx_vmexit_handler+0x1e0e/0x1e45
(XEN) [ 3668.934149]    [<ffff82d08020820c>]
vmx_asm_vmexit_handler+0xec/0x250
(XEN) [ 3668.941079]
(XEN) [ 3668.943072] *** Dumping CPU2 guest state (d1v1): ***
(XEN) [ 3668.948533] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3668.954948] CPU:    2
(XEN) [ 3668.957719] RIP:    0010:[<fffff801a1e18d03>]
(XEN) [ 3668.962572] RFLAGS: 0000000000010046   CONTEXT: hvm guest (d1v1)
(XEN) [ 3668.969075] rax: fffff78880009000   rbx: 000000000004002f
rcx: fffff801a1e19300
(XEN) [ 3668.977045] rdx: ffffef07c38b76b8   rsi: ffffef07c38b7708
rdi: 0000000000000000
(XEN) [ 3668.985018] rbp: ffffef07c38b76b0   rsp: ffffef07c38b75f0   r8:
 ffffef07c38b7708
(XEN) [ 3668.992991] r9:  000000000000002f   r10: 0000000000000001
r11: 0000000000000001
(XEN) [ 3669.000966] r12: 0000000000000001   r13: 0000000000000000
r14: 0000000000000001
(XEN) [ 3669.008938] r15: 000000000000002f   cr0: 0000000080050031
cr4: 0000000000170678
(XEN) [ 3669.016913] cr3: 00000000001aa002   cr2: 00000190068c3000
(XEN) [ 3669.022806] fsb: 0000000000000000   gsb: ffffc9814c820000
gss: 0000000473bfe000
(XEN) [ 3669.030776] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
0000   cs: 0010
(XEN) [ 3669.038229]
(XEN) [ 3669.040223] *** Dumping CPU3 host state: ***
(XEN) [ 3669.044988] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3669.051403] CPU:    3
(XEN) [ 3669.054177] RIP:    e008:[<ffff82d08021006a>]
vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
(XEN) [ 3669.069431] rax: ffff830078ba7000   rbx: ffff83007ccfc000
rcx: 0000000000000002
(XEN) [ 3669.077404] rdx: ffff830c5297ffff   rsi: 0000000000000246
rdi: ffff830c52998148
(XEN) [ 3669.085377] rbp: ffff830c5297fd18   rsp: ffff830c5297fcb8   r8:
 0000000000000002
(XEN) [ 3669.093349] r9:  0000000000000006   r10: 000000000003d976
r11: 0000000000000006
(XEN) [ 3669.101320] r12: 0000000000000000   r13: ffff82d08028a3e4
r14: 0000000000000000
(XEN) [ 3669.109296] r15: 0000000113f007f8   cr0: 0000000080050033
cr4: 00000000003526e0
(XEN) [ 3669.117269] cr3: 0000000b10380000   cr2: 0000000000000000
(XEN) [ 3669.123163] fsb: 0000000000000000   gsb: 0000000000000000
gss: fffff801a129e000
(XEN) [ 3669.131132] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
0000   cs: e008
(XEN) [ 3669.138586] Xen code around <ffff82d08021006a>
(vmx_start_reexecute_instruction+0x107/0x68a):
(XEN) [ 3669.147598]  90 80 b8 0b 01 00 00 00 <75> f5 48 8b 40 18 48 85
c0 75 de e9 19 05 00 00
(XEN) [ 3669.156005] Xen stack trace from rsp=ffff830c5297fcb8:
(XEN) [ 3669.161640]    ffff830b17342800 0000000300000009
0000000000000100 ffff830b17342000
(XEN) [ 3669.169697]    ffff830c00000000 ffff830b1b2b6980
ffff830b172b4000 ffff830b1b2b6980
(XEN) [ 3669.177761]    000000001b2b6801 0000000000000002
ffff83007ccfc000 000000000000003b
(XEN) [ 3669.185818]    ffff830c5297fda8 ffff82d080210b3e
0000000000113f00 0000000000000000
(XEN) [ 3669.193877]    00007ff91cd34d60 0000000113f007f8
0000000000000000 ffff830c5297fdf0
(XEN) [ 3669.201937]    0000000000113f00 0000000000000000
ffff83007ccfc000 0000000000000005
(XEN) [ 3669.209997]    ffff83007ccfc000 ffff830b172b4000
ffff83007ccfc000 ffff83007ccfc000
(XEN) [ 3669.218056]    0000000000113f00 0000000000000000
ffff830c5297fe38 ffff82d0801dee9e
(XEN) [ 3669.226116]    0000000000913f00 0000000000000000
00007ff91cd34d60 ffff830b1b2b6980
(XEN) [ 3669.234177]    0000003b5297fe38 0000000113f007f8
0000000000000296 0000000000000000
(XEN) [ 3669.242236]    ffff830b1b2b6980 0000000000000005
ffff82d0802081d1 ffff830c5297fef8
(XEN) [ 3669.250295]    ffff83007ccfc000 00000000000006ab
000000000000001b 0000000113f007f8
(XEN) [ 3669.258354]    ffff830c5297fee8 ffff82d080202c00
ffff82d0802081d1 0000000000000080
(XEN) [ 3669.266417]    0000000000000000 0000000000000002
ffff830b172b4000 0000000000113f00
(XEN) [ 3669.274474]    00007ff91cd34d60 000000000000003b
ffff82d0802081d1 ffff82d0802081c5
(XEN) [ 3669.282537]    ffff82d0802081d1 ffff82d0802081c5
ffff82d0802081d1 ffff82d0802081c5
(XEN) [ 3669.290596]    ffff82d0802081d1 ffff83007ccfc000
0000000000000000 0000000000000000
(XEN) [ 3669.298655]    0000000000000000 0000000000000000
00007cf3ad6800e7 ffff82d08020820c
(XEN) [ 3669.306712]    00007ff91cd34d60 0000019285c42a50
00000192858a8eb0 0000000000000000
(XEN) [ 3669.314772]    0000019285894438 41c64e6da3bd2845
0000104000000000 00000fff239a69ac
(XEN) [ 3669.322832] Xen call trace:
(XEN) [ 3669.326128]    [<ffff82d08021006a>]
vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.333925]    [<ffff82d080210b3e>]
p2m_mem_access_check+0x551/0x64d
(XEN) [ 3669.340774]    [<ffff82d0801dee9e>]
hvm_hap_nested_page_fault+0x2f2/0x631
(XEN) [ 3669.348051]    [<ffff82d080202c00>]
vmx_vmexit_handler+0x156c/0x1e45
(XEN) [ 3669.354899]    [<ffff82d08020820c>]
vmx_asm_vmexit_handler+0xec/0x250
(XEN) [ 3669.361832]
(XEN) [ 3669.363827] *** Dumping CPU3 guest state (d1v0): ***
(XEN) [ 3669.369285] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
(XEN) [ 3669.375700] CPU:    3
(XEN) [ 3669.378471] RIP:    0033:[<00007ff91cd34d60>]
(XEN) [ 3669.383323] RFLAGS: 0000000000010247   CONTEXT: hvm guest (d1v0)
(XEN) [ 3669.389824] rax: 00007ff91cd34d60   rbx: 41c64e6da3bd2845
rcx: 41c64e6da3bd2845
(XEN) [ 3669.397799] rdx: 0000000000000077   rsi: 0000000000000001
rdi: 0000019285877150
(XEN) [ 3669.405768] rbp: 0000019285894438   rsp: 0000008d6aa7f608   r8:
 0000000000000000
(XEN) [ 3669.413743] r9:  00000192858a8eb0   r10: 00000fff239a69ac
r11: 0000104000000000
(XEN) [ 3669.421716] r12: 0000000000000000   r13: 00000192858a8eb0
r14: 0000019285c42a50
(XEN) [ 3669.429690] r15: 00007ff91cd34d60   cr0: 0000000080050031
cr4: 0000000000170678
(XEN) [ 3669.437662] cr3: 0000000113f00002   cr2: 0000000000000000
(XEN) [ 3669.443555] fsb: 0000000000000000   gsb: 0000008d6a7cf000
gss: 0000002562d20000
(XEN) [ 3669.451529] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
002b   cs: 0033
(XEN) [ 3669.458980]
(XEN) [ 3669.463584] APIC error on CPU0: 40(00)

Some scheduler magic appears to happen here: it is unclear why
is_running doesn't end up being 0 as expected in our case. We'll
keep digging.


Thanks,
Razvan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 18:55   ` Razvan Cojocaru
@ 2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:00       ` Jan Beulich
  2018-11-22 10:07       ` Roger Pau Monné
  2018-11-22 10:05     ` Roger Pau Monné
  1 sibling, 2 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-11-22  9:50 UTC (permalink / raw)
  To: Razvan Cojocaru, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel, Anshul Makkar


On 21.11.2018 20:55, Razvan Cojocaru wrote:
>> +            if ( a == v )
>> +                continue;
>> +
>> +            /* Pause, synced. */
>> +            while ( !a->arch.in_host )
> Why not use a->is_running as a way to know whether the vCPU is
> running?
> 
> I think the logic of using vcpu_pause and expecting the running vcpu
> to take a vmexit and thus set in_host is wrong because a vcpu that
> wasn't running when vcpu_pause_nosync is called won't get scheduled
> anymore, thus not taking a vmexit and this function will lockup.

We can resolve this by using while ( !vcpu_runnable(a) && 
!a->arch.in_host ), if this is suitable.
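I.e. the wait loop would become (sketch):

            /* Pause, synced. */
            while ( !vcpu_runnable(a) && !a->arch.in_host )
                cpu_relax();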

~Alex

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
@ 2018-11-22 10:00       ` Jan Beulich
  2018-11-22 10:07       ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-22 10:00 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 22.11.18 at 10:50, <aisaila@bitdefender.com> wrote:
> On 21.11.2018 20:55, Razvan Cojocaru wrote:
>>> +            if ( a == v )
>>> +                continue;
>>> +
>>> +            /* Pause, synced. */
>>> +            while ( !a->arch.in_host )
>> Why not use a->is_running as a way to know whether the vCPU is
>> running?
>> 
>> I think the logic of using vcpu_pause and expecting the running vcpu
>> to take a vmexit and thus set in_host is wrong because a vcpu that
>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>> anymore, thus not taking a vmexit and this function will lockup.
> 
> We can resolve this by using while ( !vcpu_runnable(a) && 
> !a->arch.in_host ), if this is suitable.

Only if you can fully explain why the current infrastructure doesn't
work and can't be made to work without the custom in_host flag.
From prior work I know things are subtle in some cases where one
would want to use ->is_running, but so far all issues could be
resolved without custom additions.

Jan




* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-21 18:55   ` Razvan Cojocaru
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
@ 2018-11-22 10:05     ` Roger Pau Monné
  2018-11-22 10:14       ` Razvan Cojocaru
  1 sibling, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:05 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >> +            if ( a == v )
> >> +                continue;
> >> +
> >> +            /* Pause, synced. */
> >> +            while ( !a->arch.in_host )
> > Why not use a->is_running as a way to know whether the vCPU is
> > running?
> > 
> > I think the logic of using vcpu_pause and expecting the running vcpu
> > to take a vmexit and thus set in_host is wrong because a vcpu that
> > wasn't running when vcpu_pause_nosync is called won't get scheduled
> > anymore, thus not taking a vmexit and this function will lockup.
> > 
> > I don't think you need the in_host boolean at all.
> > 
> >> +                cpu_relax();
> > Is this really better than using vcpu_pause?
> > 
> > I assume this is done to avoid waiting on each vcpu, and instead doing
> > it here likely means less wait time?
> 
> The problem with plain vcpu_pause() is that we weren't able to use it,
> for the same reason (which remains unclear as of yet) that we couldn't
> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> one that uses the same logic, but loops on a->is_running instead of
> !a->arch.in_host:
>
> 
> Some scheduler magic appears to happen here: it is unclear why
> is_running doesn't end up being 0 as expected in our case. We'll
> keep digging.

There seems to be some kind of deadlock between
vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
Are you holding a lock while trying to put the other vcpus to sleep?

Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22  9:50     ` Alexandru Stefan ISAILA
  2018-11-22 10:00       ` Jan Beulich
@ 2018-11-22 10:07       ` Roger Pau Monné
  1 sibling, 0 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:07 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jbeulich, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 09:50:28AM +0000, Alexandru Stefan ISAILA wrote:
> 
> On 21.11.2018 20:55, Razvan Cojocaru wrote:
> >> +            if ( a == v )
> >> +                continue;
> >> +
> >> +            /* Pause, synced. */
> >> +            while ( !a->arch.in_host )
> > Why not use a->is_running as a way to know whether the vCPU is
> > running?
> > 
> > I think the logic of using vcpu_pause and expecting the running vcpu
> > to take a vmexit and thus set in_host is wrong because a vcpu that
> > wasn't running when vcpu_pause_nosync is called won't get scheduled
> > anymore, thus not taking a vmexit and this function will lockup.
> 
> We can resolve this by using while ( !vcpu_runnable(a) && 
> !a->arch.in_host ), if this is suitable.

I'm afraid that, without a reason why the generic vcpu_pause is not
suitable here, adding more code to it is just papering over the real
issue.

Thanks, Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:05     ` Roger Pau Monné
@ 2018-11-22 10:14       ` Razvan Cojocaru
  2018-11-22 10:58         ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 10:14 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>> +            if ( a == v )
>>>> +                continue;
>>>> +
>>>> +            /* Pause, synced. */
>>>> +            while ( !a->arch.in_host )
>>> Why not use a->is_running as a way to know whether the vCPU is
>>> running?
>>>
>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>> anymore, thus not taking a vmexit and this function will lockup.
>>>
>>> I don't think you need the in_host boolean at all.
>>>
>>>> +                cpu_relax();
>>> Is this really better than using vcpu_pause?
>>>
>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>> it here likely means less wait time?
>>
>> The problem with plain vcpu_pause() is that we weren't able to use it,
>> for the same reason (which remains unclear as of yet) that we couldn't
>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>> one that uses the same logic, but loops on a->is_running instead of
>> !a->arch.in_host:
>>
>> (XEN) [ 3663.19(XEN) [ 3667.995061] Watchdog timer detects that CPU0 is
>> stuck!
>> (XEN) [ 3668.000694] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.007108] CPU:    0
>> (XEN) [ 3668.009882] RIP:    e008:[<ffff82d0801327d2>]
>> vcpu_sleep_sync+0x40/0x71
>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
>> (XEN) [ 3668.023575] rax: 0000000000000001   rbx: ffff83007ccfc000
>> rcx: ffff83007ccfc128
>> (XEN) [ 3668.031548] rdx: 0000000000000000   rsi: 0000000000000246
>> rdi: ffff830c52984148
>> (XEN) [ 3668.039522] rbp: ffff83007cf2fcd8   rsp: ffff83007cf2fcc8   r8:
>>  0000000000000003
>> (XEN) [ 3668.047495] r9:  0000000000000000   r10: ffff82d080348460
>> r11: 0000000000000000
>> (XEN) [ 3668.055465] r12: ffff82d080132792   r13: ffff830b172b4000
>> r14: ffff82c000225000
>> (XEN) [ 3668.063439] r15: 00000000000f0000   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3668.071415] cr3: 0000000b4ba94000   cr2: 00007f6161714f70
>> (XEN) [ 3668.077308] fsb: 00007f9164f088c0   gsb: ffff880276c00000
>> gss: 0000000000000000
>> (XEN) [ 3668.085280] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> e010   cs: e008
>> (XEN) [ 3668.092731] Xen code around <ffff82d0801327d2>
>> (vcpu_sleep_sync+0x40/0x71):
>> (XEN) [ 3668.100186]  01 00 00 00 74 24 f3 90 <8b> 11 48 8b 43 10 8b 80
>> dc 01 00 00 09 d0 48 98
>> (XEN) [ 3668.108593] Xen stack trace from rsp=ffff83007cf2fcc8:
>> (XEN) [ 3668.114223]    0000000000000240 ffff83007ccfc000
>> ffff83007cf2fd08 ffff82d08010735b
>> (XEN) [ 3668.122282]    ffff82d0801358ad ffff830b172b4000
>> 0000000000000240 0000000000000048
>> (XEN) [ 3668.130346]    ffff83007cf2fd18 ffff82d08010879a
>> ffff83007cf2fd88 ffff82d080245e69
>> (XEN) [ 3668.138402]    ffff83007d615000 ffff830b172b4658
>> ffff83007cf2fd48 00000000000f0000
>> (XEN) [ 3668.146464]    00007f9164fb8004 0000000000000048
>> ffff830c52974000 0000000000000006
>> (XEN) [ 3668.154523]    ffffffffffffffff ffffffffffffffea
>> 00007f9164fb1004 0000000000000000
>> (XEN) [ 3668.162584]    ffff83007cf2fe48 ffff82d0801dd8f5
>> ffff82d080374d58 ffff82d08024b308
>> (XEN) [ 3668.170643]    ffff83007cf2fdc8 ffff83007cf2ffff
>> ffff83007cf2fdc8 ffff830b172b4000
>> (XEN) [ 3668.178704]    0000024000000001 00000000000f0000
>> 00007f9164fb8004 fffffffffffffffc
>> (XEN) [ 3668.186763]    0000000000000293 00007f91631f85d3
>> ffff82d080250834 ffff82d080250828
>> (XEN) [ 3668.194820]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff83007cf2fef8
>> (XEN) [ 3668.202882]    0000000000000022 ffff82d0801dc037
>> deadbeefdeadf00d ffffffff8100144a
>> (XEN) [ 3668.210942]    ffff83007cf2fee8 ffff82d080172aca
>> 02ff82d080250834 0000000000000006
>> (XEN) [ 3668.219000]    00007f9164fb1004 deadbeefdeadf00d
>> deadbeefdeadf00d deadbeefdeadf00d
>> (XEN) [ 3668.227062]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff82d080250828
>> (XEN) [ 3668.235121]    ffff82d080250834 ffff82d080250828
>> ffff82d080250834 ffff83007d615000
>> (XEN) [ 3668.243180]    0000000000000000 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.251240]    00007cff830d00e7 ffff82d080250899
>> 00007ffef6baf1d0 0000000000305000
>> (XEN) [ 3668.259298]    ffff88022740b900 fffffffffffffff2
>> ffff88022b31fe98 ffff88026f3374d8
>> (XEN) [ 3668.267361]    0000000000000282 0000000000000000
>> ffff88007c995080 0000000000000000
>> (XEN) [ 3668.275417] Xen call trace:
>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
>> (XEN) [ 3668.284952]    [<ffff82d08010735b>]
>> domain.c#do_domain_pause+0x33/0x4f
>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
>> (XEN) [ 3668.297952]    [<ffff82d080245e69>]
>> hap_track_dirty_vram+0x2c1/0x4a7
>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
>> (XEN) [ 3668.323689]
>> (XEN) [ 3668.325685]
>> (XEN) [ 3668.327678] ****************************************
>> (XEN) [ 3668.333138] Panic on CPU 0:
>> (XEN) [ 3668.336428] FATAL TRAP: vector = 2 (nmi)
>> (XEN) [ 3668.340850] [error_code=0000]
>> (XEN) [ 3668.344404] ****************************************
>> (XEN) [ 3668.349863]
>> (XEN) [ 3668.351854] Reboot in five seconds...
>> (XEN) [ 3668.356017] Dumping other CPUs
>> (XEN) [ 3668.359567] *** Dumping CPU1 host state: ***
>> (XEN) [ 3668.364337] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.370750] CPU:    1
>> (XEN) [ 3668.373522] RIP:    e008:[<ffff82d08016b5a6>]
>> domain.c#default_idle+0xa2/0xb5
>> (XEN) [ 3668.381149] RFLAGS: 0000000000000202   CONTEXT: hypervisor
>> (XEN) [ 3668.387128] rax: 0000000000000000   rbx: ffff830c529b7fff
>> rcx: 0000000000000048
>> (XEN) [ 3668.395101] rdx: 0000000000000000   rsi: ffff830c529b7fff
>> rdi: ffff830c529b7ef8
>> (XEN) [ 3668.403076] rbp: ffff830c529b7ed0   rsp: ffff830c529b7ed0   r8:
>>  ffff830c529fe4a8
>> (XEN) [ 3668.411048] r9:  ffff830c529bac20   r10: ffff830c529fe490
>> r11: ffff830c529ba148
>> (XEN) [ 3668.419019] r12: ffff830c529ba140   r13: ffff83007cf75000
>> r14: 000003540fd7cd6b
>> (XEN) [ 3668.426994] r15: ffffffffffffffff   cr0: 000000008005003b
>> cr4: 00000000003526e0
>> (XEN) [ 3668.434964] cr3: 000000007cf1d000   cr2: 0000000000000000
>> (XEN) [ 3668.440861] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: 0000000000000000
>> (XEN) [ 3668.448832] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3668.456285] Xen code around <ffff82d08016b5a6>
>> (domain.c#default_idle+0xa2/0xb5):
>> (XEN) [ 3668.464260]  00 00 00 0f 30 90 fb f4 <0f> b6 46 f5 80 a7 fd 00
>> 00 00 fe 0f 30 90 eb 01
>> (XEN) [ 3668.472663] Xen stack trace from rsp=ffff830c529b7ed0:
>> (XEN) [ 3668.478297]    ffff830c529b7ef0 ffff82d08016b628
>> ffff82d080134ffe ffff83007cf75000
>> (XEN) [ 3668.486358]    ffff830c529b7df0 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.494417]    0000000000000000 00000000001c3a38
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.502478]    0000000000000000 0000000000000000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.510538]    00000000000002ff 00000000001c00e9
>> 0000000000000000 0000000000000000
>> (XEN) [ 3668.518595]    0000beef0000beef 0000000000103f15
>> 000000bf0000beef 0000000000000046
>> (XEN) [ 3668.526656]    00000000001c3a38 000000000000beef
>> ffffea000d5bbeef ffffea000d5bbeef
>> (XEN) [ 3668.534715]    000000000000beef 000000000000beef
>> 017fffc000000001 ffff83007cf75000
>> (XEN) [ 3668.542775]    0000003bd2646380 00000000003526e0
>> 0000000000000000 0000000c5299e000
>> (XEN) [ 3668.550837]    0000070100000000 0000000000000000
>> (XEN) [ 3668.555948] Xen call trace:
>> (XEN) [ 3668.559242]    [<ffff82d08016b5a6>] domain.c#default_idle+0xa2/0xb5
>> (XEN) [ 3668.566000]    [<ffff82d08016b628>] domain.c#idle_loop+0x57/0x6e
>> (XEN) [ 3668.572502]
>> (XEN) [ 3668.574494] *** Dumping CPU2 host state: ***
>> (XEN) [ 3668.579261] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.585675] CPU:    2
>> (XEN) [ 3668.588449] RIP:    e008:[<ffff82d080127880>]
>> queue_read_lock_slowpath+0x27/0x4d
>> (XEN) [ 3668.596332] RFLAGS: 0000000000000286   CONTEXT: hypervisor (d1v1)
>> (XEN) [ 3668.602919] rax: 00000000000000ff   rbx: ffff830b1b2b6980
>> rcx: 0000000000000000
>> (XEN) [ 3668.610893] rdx: ffff830c52997fff   rsi: 0000000000000009
>> rdi: ffff830b1b2b698a
>> (XEN) [ 3668.618865] rbp: ffff830c52997a68   rsp: ffff830c52997a58   r8:
>>  0000000000000000
>> (XEN) [ 3668.626837] r9:  0000000000000003   r10: 0000000000000000
>> r11: 0000000000000000
>> (XEN) [ 3668.634812] r12: ffff830b1b2b6984   r13: ffff830c52997aa4
>> r14: ffff830c52997c34
>> (XEN) [ 3668.642786] r15: 00000000000001aa   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3668.650759] cr3: 0000000b105ef000   cr2: 00000190068c3000
>> (XEN) [ 3668.656650] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: 0000004f58bd3000
>> (XEN) [ 3668.664624] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3668.672077] Xen code around <ffff82d080127880>
>> (queue_read_lock_slowpath+0x27/0x4d):
>> (XEN) [ 3668.680309]  84 c0 74 08 f3 90 8b 03 <84> c0 75 f8 b8 00 01 00
>> 00 f0 0f c1 03 3c ff 75
>> (XEN) [ 3668.688717] Xen stack trace from rsp=ffff830c52997a58:
>> (XEN) [ 3668.694351]    ffff830b1b2b6980 ffff830c52997b54
>> ffff830c52997ad8 ffff82d08020c1df
>> (XEN) [ 3668.702411]    ffff830c52997b08 ffff82d080217db4
>> ffff830b172b4000 0000000352997c44
>> (XEN) [ 3668.710468]    000000000db12f43 0000000000000000
>> ffff830c00000000 00000000000001aa
>> (XEN) [ 3668.718529]    ffff830b1b2b6980 fffff801a1e18d03
>> ffff830c52997c34 ffff830078ba7000
>> (XEN) [ 3668.726591]    ffff830c52997b88 ffff82d080247208
>> ffff830b1b2b6980 ffff830c52997c44
>> (XEN) [ 3668.734648]    0000000000000000 fffff801a1e18d03
>> ffff830c52997b68 ffff82d08020bf20
>> (XEN) [ 3668.742707]    0000000000000000 0000000208a008e3
>> ffff830c52997b58 0000000400000000
>> (XEN) [ 3668.750768]    0000000000008000 0000000000000000
>> ffff830c52997be0 0000000000000000
>> (XEN) [ 3668.758826]    0000000000000000 ffff830078ba7000
>> ffff830c52997c34 fffff801a1e18d03
>> (XEN) [ 3668.766888]    ffff830b1b2b6980 ffff82d080311520
>> ffff830c52997b98 ffff82d080247475
>> (XEN) [ 3668.774945]    ffff830c52997be8 ffff82d080212751
>> 0000000000008000 ffffef07c38b76b0
>> (XEN) [ 3668.783006]    0000000000000010 fffff801a1e18d03
>> fffff801a1e18d03 0000000000000d03
>> (XEN) [ 3668.791067]    000fffff801a1e18 ffff830c52997ef8
>> ffff830c52997c78 ffff82d0801d66a0
>> (XEN) [ 3668.799128]    ffffef07c38b7708 ffff830c52997c44
>> ffff830c52997c34 0000000000000004
>> (XEN) [ 3668.807188]    ffff830c52997d38 0000001000000004
>> ffff830078ba7000 0000001100000010
>> (XEN) [ 3668.815244]    ffffea000d59beef ffffea000d59beef
>> 000000000000beef ffff830c52997d10
>> (XEN) [ 3668.823304]    ffff830078ba7000 0000000000000001
>> 0000000000000000 ffff830c52997ef8
>> (XEN) [ 3668.831363]    ffff830c52997c88 ffff82d0801d844d
>> ffff830c52997ce8 ffff82d0801d13da
>> (XEN) [ 3668.839423]    ffff830c52997d38 ffff82d0803107e0
>> 0000000000000000 fffff801a1e18d03
>> (XEN) [ 3668.847484]    ffff830c52997cd8 ffff830078ba7000
>> ffff830c52997d10 000000000000002c
>> (XEN) [ 3668.855544] Xen call trace:
>> (XEN) [ 3668.858838]    [<ffff82d080127880>]
>> queue_read_lock_slowpath+0x27/0x4d
>> (XEN) [ 3668.865857]    [<ffff82d08020c1df>]
>> get_page_from_gfn_p2m+0x14e/0x3b0
>> (XEN) [ 3668.872792]    [<ffff82d080247208>]
>> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
>> (XEN) [ 3668.880071]    [<ffff82d080247475>]
>> hap_gva_to_gfn_4_levels+0x1c/0x1e
>> (XEN) [ 3668.887004]    [<ffff82d080212751>] paging_gva_to_gfn+0x10e/0x11d
>> (XEN) [ 3668.893590]    [<ffff82d0801d66a0>] hvm.c#__hvm_copy+0x98/0x37f
>> (XEN) [ 3668.900003]    [<ffff82d0801d844d>]
>> hvm_fetch_from_guest_virt_nofault+0x14/0x16
>> (XEN) [ 3668.907801]    [<ffff82d0801d13da>]
>> emulate.c#_hvm_emulate_one+0x118/0x2bc
>> (XEN) [ 3668.915168]    [<ffff82d0801d1674>] hvm_emulate_one+0x10/0x12
>> (XEN) [ 3668.921409]    [<ffff82d0801e08c2>] handle_mmio+0x52/0xc9
>> (XEN) [ 3668.927303]    [<ffff82d0802034a2>]
>> vmx_vmexit_handler+0x1e0e/0x1e45
>> (XEN) [ 3668.934149]    [<ffff82d08020820c>]
>> vmx_asm_vmexit_handler+0xec/0x250
>> (XEN) [ 3668.941079]
>> (XEN) [ 3668.943072] *** Dumping CPU2 guest state (d1v1): ***
>> (XEN) [ 3668.948533] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3668.954948] CPU:    2
>> (XEN) [ 3668.957719] RIP:    0010:[<fffff801a1e18d03>]
>> (XEN) [ 3668.962572] RFLAGS: 0000000000010046   CONTEXT: hvm guest (d1v1)
>> (XEN) [ 3668.969075] rax: fffff78880009000   rbx: 000000000004002f
>> rcx: fffff801a1e19300
>> (XEN) [ 3668.977045] rdx: ffffef07c38b76b8   rsi: ffffef07c38b7708
>> rdi: 0000000000000000
>> (XEN) [ 3668.985018] rbp: ffffef07c38b76b0   rsp: ffffef07c38b75f0   r8:
>>  ffffef07c38b7708
>> (XEN) [ 3668.992991] r9:  000000000000002f   r10: 0000000000000001
>> r11: 0000000000000001
>> (XEN) [ 3669.000966] r12: 0000000000000001   r13: 0000000000000000
>> r14: 0000000000000001
>> (XEN) [ 3669.008938] r15: 000000000000002f   cr0: 0000000080050031
>> cr4: 0000000000170678
>> (XEN) [ 3669.016913] cr3: 00000000001aa002   cr2: 00000190068c3000
>> (XEN) [ 3669.022806] fsb: 0000000000000000   gsb: ffffc9814c820000
>> gss: 0000000473bfe000
>> (XEN) [ 3669.030776] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
>> 0000   cs: 0010
>> (XEN) [ 3669.038229]
>> (XEN) [ 3669.040223] *** Dumping CPU3 host state: ***
>> (XEN) [ 3669.044988] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3669.051403] CPU:    3
>> (XEN) [ 3669.054177] RIP:    e008:[<ffff82d08021006a>]
>> vmx_start_reexecute_instruction+0x107/0x68a
>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
>> (XEN) [ 3669.069431] rax: ffff830078ba7000   rbx: ffff83007ccfc000
>> rcx: 0000000000000002
>> (XEN) [ 3669.077404] rdx: ffff830c5297ffff   rsi: 0000000000000246
>> rdi: ffff830c52998148
>> (XEN) [ 3669.085377] rbp: ffff830c5297fd18   rsp: ffff830c5297fcb8   r8:
>>  0000000000000002
>> (XEN) [ 3669.093349] r9:  0000000000000006   r10: 000000000003d976
>> r11: 0000000000000006
>> (XEN) [ 3669.101320] r12: 0000000000000000   r13: ffff82d08028a3e4
>> r14: 0000000000000000
>> (XEN) [ 3669.109296] r15: 0000000113f007f8   cr0: 0000000080050033
>> cr4: 00000000003526e0
>> (XEN) [ 3669.117269] cr3: 0000000b10380000   cr2: 0000000000000000
>> (XEN) [ 3669.123163] fsb: 0000000000000000   gsb: 0000000000000000
>> gss: fffff801a129e000
>> (XEN) [ 3669.131132] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss:
>> 0000   cs: e008
>> (XEN) [ 3669.138586] Xen code around <ffff82d08021006a>
>> (vmx_start_reexecute_instruction+0x107/0x68a):
>> (XEN) [ 3669.147598]  90 80 b8 0b 01 00 00 00 <75> f5 48 8b 40 18 48 85
>> c0 75 de e9 19 05 00 00
>> (XEN) [ 3669.156005] Xen stack trace from rsp=ffff830c5297fcb8:
>> (XEN) [ 3669.161640]    ffff830b17342800 0000000300000009
>> 0000000000000100 ffff830b17342000
>> (XEN) [ 3669.169697]    ffff830c00000000 ffff830b1b2b6980
>> ffff830b172b4000 ffff830b1b2b6980
>> (XEN) [ 3669.177761]    000000001b2b6801 0000000000000002
>> ffff83007ccfc000 000000000000003b
>> (XEN) [ 3669.185818]    ffff830c5297fda8 ffff82d080210b3e
>> 0000000000113f00 0000000000000000
>> (XEN) [ 3669.193877]    00007ff91cd34d60 0000000113f007f8
>> 0000000000000000 ffff830c5297fdf0
>> (XEN) [ 3669.201937]    0000000000113f00 0000000000000000
>> ffff83007ccfc000 0000000000000005
>> (XEN) [ 3669.209997]    ffff83007ccfc000 ffff830b172b4000
>> ffff83007ccfc000 ffff83007ccfc000
>> (XEN) [ 3669.218056]    0000000000113f00 0000000000000000
>> ffff830c5297fe38 ffff82d0801dee9e
>> (XEN) [ 3669.226116]    0000000000913f00 0000000000000000
>> 00007ff91cd34d60 ffff830b1b2b6980
>> (XEN) [ 3669.234177]    0000003b5297fe38 0000000113f007f8
>> 0000000000000296 0000000000000000
>> (XEN) [ 3669.242236]    ffff830b1b2b6980 0000000000000005
>> ffff82d0802081d1 ffff830c5297fef8
>> (XEN) [ 3669.250295]    ffff83007ccfc000 00000000000006ab
>> 000000000000001b 0000000113f007f8
>> (XEN) [ 3669.258354]    ffff830c5297fee8 ffff82d080202c00
>> ffff82d0802081d1 0000000000000080
>> (XEN) [ 3669.266417]    0000000000000000 0000000000000002
>> ffff830b172b4000 0000000000113f00
>> (XEN) [ 3669.274474]    00007ff91cd34d60 000000000000003b
>> ffff82d0802081d1 ffff82d0802081c5
>> (XEN) [ 3669.282537]    ffff82d0802081d1 ffff82d0802081c5
>> ffff82d0802081d1 ffff82d0802081c5
>> (XEN) [ 3669.290596]    ffff82d0802081d1 ffff83007ccfc000
>> 0000000000000000 0000000000000000
>> (XEN) [ 3669.298655]    0000000000000000 0000000000000000
>> 00007cf3ad6800e7 ffff82d08020820c
>> (XEN) [ 3669.306712]    00007ff91cd34d60 0000019285c42a50
>> 00000192858a8eb0 0000000000000000
>> (XEN) [ 3669.314772]    0000019285894438 41c64e6da3bd2845
>> 0000104000000000 00000fff239a69ac
>> (XEN) [ 3669.322832] Xen call trace:
>> (XEN) [ 3669.326128]    [<ffff82d08021006a>]
>> vmx_start_reexecute_instruction+0x107/0x68a
>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>]
>> p2m_mem_access_check+0x551/0x64d
>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>]
>> hvm_hap_nested_page_fault+0x2f2/0x631
>> (XEN) [ 3669.348051]    [<ffff82d080202c00>]
>> vmx_vmexit_handler+0x156c/0x1e45
>> (XEN) [ 3669.354899]    [<ffff82d08020820c>]
>> vmx_asm_vmexit_handler+0xec/0x250
>> (XEN) [ 3669.361832]
>> (XEN) [ 3669.363827] *** Dumping CPU3 guest state (d1v0): ***
>> (XEN) [ 3669.369285] ----[ Xen-4.7.5  x86_64  debug=y  Not tainted ]----
>> (XEN) [ 3669.375700] CPU:    3
>> (XEN) [ 3669.378471] RIP:    0033:[<00007ff91cd34d60>]
>> (XEN) [ 3669.383323] RFLAGS: 0000000000010247   CONTEXT: hvm guest (d1v0)
>> (XEN) [ 3669.389824] rax: 00007ff91cd34d60   rbx: 41c64e6da3bd2845
>> rcx: 41c64e6da3bd2845
>> (XEN) [ 3669.397799] rdx: 0000000000000077   rsi: 0000000000000001
>> rdi: 0000019285877150
>> (XEN) [ 3669.405768] rbp: 0000019285894438   rsp: 0000008d6aa7f608   r8:
>>  0000000000000000
>> (XEN) [ 3669.413743] r9:  00000192858a8eb0   r10: 00000fff239a69ac
>> r11: 0000104000000000
>> (XEN) [ 3669.421716] r12: 0000000000000000   r13: 00000192858a8eb0
>> r14: 0000019285c42a50
>> (XEN) [ 3669.429690] r15: 00007ff91cd34d60   cr0: 0000000080050031
>> cr4: 0000000000170678
>> (XEN) [ 3669.437662] cr3: 0000000113f00002   cr2: 0000000000000000
>> (XEN) [ 3669.443555] fsb: 0000000000000000   gsb: 0000008d6a7cf000
>> gss: 0000002562d20000
>> (XEN) [ 3669.451529] ds: 002b   es: 002b   fs: 0053   gs: 002b   ss:
>> 002b   cs: 0033
>> (XEN) [ 3669.458980]
>> (XEN) [ 3669.463584] APIC error on CPU0: 40(00)
>>
>> Some scheduler magic appears to happen here where it is unclear why
>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>> keep digging.
> 
> There seems to be some kind of deadlock between
> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> Are you holding a lock while trying to put the other vcpus to sleep?

d->arch.rexec_lock, but I don't see how that would matter in this case.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:14       ` Razvan Cojocaru
@ 2018-11-22 10:58         ` Roger Pau Monné
  2018-11-22 12:48           ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 10:58 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> > On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>> +            if ( a == v )
> >>>> +                continue;
> >>>> +
> >>>> +            /* Pause, synced. */
> >>>> +            while ( !a->arch.in_host )
> >>> Why not use a->is_running as a way to know whether the vCPU is
> >>> running?
> >>>
> >>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>> anymore, thus not taking a vmexit and this function will lockup.
> >>>
> >>> I don't think you need the in_host boolean at all.
> >>>
> >>>> +                cpu_relax();
> >>> Is this really better than using vcpu_pause?
> >>>
> >>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>> it here likely means less wait time?
> >>
> >> The problem with plain vcpu_pause() is that we weren't able to use it,
> >> for the same reason (which remains unclear as of yet) that we couldn't
> >> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >> one that uses the same logic, but loops on a->is_running instead of
> >> !a->arch.in_host:

[...]

> >> Some scheduler magic appears to happen here where it is unclear why
> >> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >> keep digging.
> > 
> > There seems to be some kind of deadlock between
> > vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> > Are you holding a lock while trying to put the other vcpus to sleep?
> 
> d->arch.rexec_lock, but I don't see how that would matter in this case.

The trace from pCPU#0:

(XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
[...]
(XEN) [ 3668.275417] Xen call trace:
(XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
(XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
(XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
(XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
(XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
(XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
(XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d

This shows there's a hypercall executed from Dom0 that's trying to
pause the domain, thus pausing all of its vCPUs.

Then pCPU#3:

(XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
[...]
(XEN) [ 3669.322832] Xen call trace:
(XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
(XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
(XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
(XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
(XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250

d1v0 seems to be blocked in vmx_start_reexecute_instruction, and is
thus never getting paused, which triggers the watchdog on pCPU#0?

You should check which vCPU the trace from pCPU#0 is waiting on; if
that's the vCPU running on pCPU#3 (d1v0), you will have to check what's
taking so long in vmx_start_reexecute_instruction.
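
If it helps, a throwaway printk in vcpu_sleep_sync (purely a debugging
aid, not something to keep) would tell you that directly, e.g.:

    printk("%pv waiting for %pv (is_running=%d)\n",
           current, v, v->is_running);

placed just before the wait loop.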

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 10:58         ` Roger Pau Monné
@ 2018-11-22 12:48           ` Razvan Cojocaru
  2018-11-22 14:49             ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 12:48 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>>>> +            if ( a == v )
>>>>>> +                continue;
>>>>>> +
>>>>>> +            /* Pause, synced. */
>>>>>> +            while ( !a->arch.in_host )
>>>>> Why not use a->is_running as a way to know whether the vCPU is
>>>>> running?
>>>>>
>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>>>> anymore, thus not taking a vmexit and this function will lockup.
>>>>>
>>>>> I don't think you need the in_host boolean at all.
>>>>>
>>>>>> +                cpu_relax();
>>>>> Is this really better than using vcpu_pause?
>>>>>
>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>>>> it here likely means less wait time?
>>>>
>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
>>>> for the same reason (which remains unclear as of yet) that we couldn't
>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>>>> one that uses the same logic, but loops on a->is_running instead of
>>>> !a->arch.in_host:
> 
> [...]
> 
>>>> Some scheduler magic appears to happen here where it is unclear why
>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>>>> keep digging.
>>>
>>> There seems to be some kind of deadlock between
>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
>>> Are you holding a lock while trying to put the other vcpus to sleep?
>>
>> d->arch.rexec_lock, but I don't see how that would matter in this case.
> 
> The trace from pCPU#0:
> 
> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> [...]
> (XEN) [ 3668.275417] Xen call trace:
> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> 
> Shows there's an hypercall executed from Dom0 that's trying to pause
> the domain, thus pausing all the vCPUs.
> 
> Then pCPU#3:
> 
> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> [...]
> (XEN) [ 3669.322832] Xen call trace:
> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> 
> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> getting paused and triggering the watchdog on pCPU#0?
> 
> You should check on which vCPU is the trace from pCPU#0 waiting, if
> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> taking such a long time in vmx_start_reexecute_instruction.

Right, so this is what appears to be happening, if the output of my test
is to be trusted: https://pastebin.com/YEDqNuwh

1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
appears to be VCPU 1):

(XEN) [  195.427141] 0 pause_count 0
(XEN) [  195.427142] 2 pause_count 0
(XEN) [  195.427143] 3 pause_count 0
(XEN) [  195.427144] 4 pause_count 0
(XEN) [  195.427146] 5 pause_count 0
(XEN) [  195.427147] 6 pause_count 0
(XEN) [  195.427148] 7 pause_count 0

2. The hypercall happens, which calls domain_pause(), which I've
modified thus:

@@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
     atomic_inc(&d->pause_count);

     for_each_vcpu( d, v )
+    {
+        printk("domain_pause %d\n", v->vcpu_id);
         sleep_fn(v);
+    }

     arch_domain_pause(d);
 }

and which says:

(XEN) [  195.492064] domain_pause 0

3. At this point, according to addr2line,
vmx_start_reexecute_instruction() does "while ( a->is_running )
cpu_relax();" for all VCPUs but itself.

Now, d1v0, which, if I'm reading this correctly, is the VCPU that
domain_pause() is stuck waiting for, does:

(XEN) [  200.829874] Xen call trace:
(XEN) [  200.833166]    [<ffff82d0801278c6>]
queue_read_lock_slowpath+0x25/0x4d
(XEN) [  200.840186]    [<ffff82d08020c1f6>]
get_page_from_gfn_p2m+0x14e/0x3b0
(XEN) [  200.847121]    [<ffff82d080247213>]
hap_p2m_ga_to_gfn_4_levels+0x48/0x299
(XEN) [  200.854400]    [<ffff82d080247480>]
hap_gva_to_gfn_4_levels+0x1c/0x1e
(XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
(XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
(XEN) [  200.874329]    [<ffff82d0801d848d>]
hvm_fetch_from_guest_virt_nofault+0x14/0x16
(XEN) [  200.882130]    [<ffff82d0801d141a>]
emulate.c#_hvm_emulate_one+0x118/0x2bc
(XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
(XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
(XEN) [  200.901626]    [<ffff82d0801e09ba>]
handle_mmio_with_translation+0x41/0x43
(XEN) [  200.908994]    [<ffff82d0801ded1f>]
hvm_hap_nested_page_fault+0x133/0x631
(XEN) [  200.916271]    [<ffff82d080202c40>]
vmx_vmexit_handler+0x156c/0x1e45
(XEN) [  200.923117]    [<ffff82d08020824c>]
vmx_asm_vmexit_handler+0xec/0x250

I hope I'm not reading this wrong.


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 12:48           ` Razvan Cojocaru
@ 2018-11-22 14:49             ` Roger Pau Monné
  2018-11-22 15:25               ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 14:49 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> >>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>>>> +            if ( a == v )
> >>>>>> +                continue;
> >>>>>> +
> >>>>>> +            /* Pause, synced. */
> >>>>>> +            while ( !a->arch.in_host )
> >>>>> Why not use a->is_running as a way to know whether the vCPU is
> >>>>> running?
> >>>>>
> >>>>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>>>> anymore, thus not taking a vmexit and this function will lockup.
> >>>>>
> >>>>> I don't think you need the in_host boolean at all.
> >>>>>
> >>>>>> +                cpu_relax();
> >>>>> Is this really better than using vcpu_pause?
> >>>>>
> >>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>>>> it here likely means less wait time?
> >>>>
> >>>> The problem with plain vcpu_pause() is that we weren't able to use it,
> >>>> for the same reason (which remains unclear as of yet) that we couldn't
> >>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >>>> one that uses the same logic, but loops on a->is_running instead of
> >>>> !a->arch.in_host:
> > 
> > [...]
> > 
> >>>> Some scheduler magic appears to happen here where it is unclear why
> >>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >>>> keep digging.
> >>>
> >>> There seems to be some kind of deadlock between
> >>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> >>> Are you holding a lock while trying to put the other vcpus to sleep?
> >>
> >> d->arch.rexec_lock, but I don't see how that would matter in this case.
> > 
> > The trace from pCPU#0:
> > 
> > (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> > [...]
> > (XEN) [ 3668.275417] Xen call trace:
> > (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> > (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> > (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> > (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> > (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> > (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> > (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> > 
> > Shows there's an hypercall executed from Dom0 that's trying to pause
> > the domain, thus pausing all the vCPUs.
> > 
> > Then pCPU#3:
> > 
> > (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> > [...]
> > (XEN) [ 3669.322832] Xen call trace:
> > (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> > (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> > (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> > (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> > (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> > 
> > Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> > getting paused and triggering the watchdog on pCPU#0?
> > 
> > You should check on which vCPU is the trace from pCPU#0 waiting, if
> > that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> > taking such a long time in vmx_start_reexecute_instruction.
> 
> Right, so this is what appears to be happening, if the output of my test
> is to be trusted: https://pastebin.com/YEDqNuwh
> 
> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
> appears to be VCPU 1):
> 
> (XEN) [  195.427141] 0 pause_count 0
> (XEN) [  195.427142] 2 pause_count 0
> (XEN) [  195.427143] 3 pause_count 0
> (XEN) [  195.427144] 4 pause_count 0
> (XEN) [  195.427146] 5 pause_count 0
> (XEN) [  195.427147] 6 pause_count 0
> (XEN) [  195.427148] 7 pause_count 0

The diff below doesn't show where you add this message, nor what it's
actually printing. I guess the first number is the vCPU ID, and the
second is the value of pause_count at some point?

> 
> 2. The hypercall happens, which calls domain_pause(), which I've
> modified thus:
> 
> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
>      atomic_inc(&d->pause_count);
> 
>      for_each_vcpu( d, v )
> +    {
> +        printk("domain_pause %d\n", v->vcpu_id);

Could you print both the domain and the vcpu ids?

>          sleep_fn(v);
> +    }
> 
>      arch_domain_pause(d);
>  }
> 
> and which says:
> 
> (XEN) [  195.492064] domain_pause 0

This is the hypercall code waiting for domain 1 vCPU 0 to pause?

> 
> 3. At this point, according to addr2line,
> vmx_start_reexecute_instruction() does "while ( a->is_running )
> cpu_relax();" for all VCPUs but itself.

Why don't you just start by using:

for_each_vcpu( d, v )
    if ( v != current )
        vcpu_pause(v);

Instead of open-coding it in vmx_start_reexecute_instruction.
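
(Together, I assume, with the matching unpause once the re-execution
has finished:

for_each_vcpu( d, v )
    if ( v != current )
        vcpu_unpause(v);

so the pause counts stay balanced.)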

> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
> domain_pause() is stuck waiting for, does:
> 
> (XEN) [  200.829874] Xen call trace:
> (XEN) [  200.833166]    [<ffff82d0801278c6>]
> queue_read_lock_slowpath+0x25/0x4d
> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
> get_page_from_gfn_p2m+0x14e/0x3b0
> (XEN) [  200.847121]    [<ffff82d080247213>]
> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
> (XEN) [  200.854400]    [<ffff82d080247480>]
> hap_gva_to_gfn_4_levels+0x1c/0x1e
> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
> (XEN) [  200.874329]    [<ffff82d0801d848d>]
> hvm_fetch_from_guest_virt_nofault+0x14/0x16
> (XEN) [  200.882130]    [<ffff82d0801d141a>]
> emulate.c#_hvm_emulate_one+0x118/0x2bc
> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
> handle_mmio_with_translation+0x41/0x43
> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
> hvm_hap_nested_page_fault+0x133/0x631
> (XEN) [  200.916271]    [<ffff82d080202c40>]
> vmx_vmexit_handler+0x156c/0x1e45
> (XEN) [  200.923117]    [<ffff82d08020824c>]
> vmx_asm_vmexit_handler+0xec/0x250

What lock is it waiting on? Is this the paging lock? If so, you will
have to figure out who is holding this lock.

Is this on top of plain staging, or do you have other changes applied
to Xen?

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 14:49             ` Roger Pau Monné
@ 2018-11-22 15:25               ` Razvan Cojocaru
  2018-11-22 15:37                 ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 15:25 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 4:49 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
>>> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
>>>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
>>>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
>>>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
>>>>>>>> +            if ( a == v )
>>>>>>>> +                continue;
>>>>>>>> +
>>>>>>>> +            /* Pause, synced. */
>>>>>>>> +            while ( !a->arch.in_host )
>>>>>>> Why not use a->is_running as a way to know whether the vCPU is
>>>>>>> running?
>>>>>>>
>>>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
>>>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
>>>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
>>>>>>> anymore, thus not taking a vmexit and this function will lockup.
>>>>>>>
>>>>>>> I don't think you need the in_host boolean at all.
>>>>>>>
>>>>>>>> +                cpu_relax();
>>>>>>> Is this really better than using vcpu_pause?
>>>>>>>
>>>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
>>>>>>> it here likely means less wait time?
>>>>>>
>>>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
>>>>>> for the same reason (which remains unclear as of yet) that we couldn't
>>>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
>>>>>> one that uses the same logic, but loops on a->is_running instead of
>>>>>> !a->arch.in_host:
>>>
>>> [...]
>>>
>>>>>> Some scheduler magic appears to happen here where it is unclear why
>>>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
>>>>>> keep digging.
>>>>>
>>>>> There seems to be some kind of deadlock between
>>>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
>>>>> Are you holding a lock while trying to put the other vcpus to sleep?
>>>>
>>>> d->arch.rexec_lock, but I don't see how that would matter in this case.
>>>
>>> The trace from pCPU#0:
>>>
>>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
>>> [...]
>>> (XEN) [ 3668.275417] Xen call trace:
>>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
>>> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
>>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
>>> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
>>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
>>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
>>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
>>>
>>> Shows there's an hypercall executed from Dom0 that's trying to pause
>>> the domain, thus pausing all the vCPUs.
>>>
>>> Then pCPU#3:
>>>
>>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
>>> [...]
>>> (XEN) [ 3669.322832] Xen call trace:
>>> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
>>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
>>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
>>> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
>>> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
>>>
>>> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
>>> getting paused and triggering the watchdog on pCPU#0?
>>>
>>> You should check on which vCPU is the trace from pCPU#0 waiting, if
>>> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
>>> taking such a long time in vmx_start_reexecute_instruction.
>>
>> Right, so this is what appears to be happening, if the output of my test
>> is to be trusted: https://pastebin.com/YEDqNuwh
>>
>> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
>> appears to be VCPU 1):
>>
>> (XEN) [  195.427141] 0 pause_count 0
>> (XEN) [  195.427142] 2 pause_count 0
>> (XEN) [  195.427143] 3 pause_count 0
>> (XEN) [  195.427144] 4 pause_count 0
>> (XEN) [  195.427146] 5 pause_count 0
>> (XEN) [  195.427147] 6 pause_count 0
>> (XEN) [  195.427148] 7 pause_count 0
> 
> The diff below doesn't show where you add this message, neither
> what's actually printing. I guess the first number is the vCPU ID, and
> the second the value of pause_count at some point?

Yes, exactly. So the above tells us that VCPUs 0 and 2-7 have been
paused (nosync) by vmx_start_reexecute_instruction(), which is now doing
a while ( a->is_running ) cpu_relax().

>> 2. The hypercall happens, which calls domain_pause(), which I've
>> modified thus:
>>
>> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
>>      atomic_inc(&d->pause_count);
>>
>>      for_each_vcpu( d, v )
>> +    {
>> +        printk("domain_pause %d\n", v->vcpu_id);
> 
> Could you print both the domain and the vcpu ids?

Of course, but I think I've found the issue (please see below).

>>          sleep_fn(v);
>> +    }
>>
>>      arch_domain_pause(d);
>>  }
>>
>> and which says:
>>
>> (XEN) [  195.492064] domain_pause 0
> 
> This is the hypercall code waiting for domain 1 vCPU 0 to pause?

Yes.

>> 3. At this point, according to addr2line,
>> vmx_start_reexecute_instruction() does "while ( a->is_running )
>> cpu_relax();" for all VCPUs but itself.
> 
> Why don't you just start by using:
> 
> for_each_vcpu( d, v )
>     if ( v != current )
>         vcpu_pause(v);
> 
> Instead of open-coding it in vmx_start_reexecute_instruction.

That's the intention if we can get it to work.

>> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
>> domain_pause() is stuck waiting for, does:
>>
>> (XEN) [  200.829874] Xen call trace:
>> (XEN) [  200.833166]    [<ffff82d0801278c6>]
>> queue_read_lock_slowpath+0x25/0x4d
>> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
>> get_page_from_gfn_p2m+0x14e/0x3b0
>> (XEN) [  200.847121]    [<ffff82d080247213>]
>> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
>> (XEN) [  200.854400]    [<ffff82d080247480>]
>> hap_gva_to_gfn_4_levels+0x1c/0x1e
>> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
>> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
>> (XEN) [  200.874329]    [<ffff82d0801d848d>]
>> hvm_fetch_from_guest_virt_nofault+0x14/0x16
>> (XEN) [  200.882130]    [<ffff82d0801d141a>]
>> emulate.c#_hvm_emulate_one+0x118/0x2bc
>> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
>> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
>> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
>> handle_mmio_with_translation+0x41/0x43
>> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
>> hvm_hap_nested_page_fault+0x133/0x631
>> (XEN) [  200.916271]    [<ffff82d080202c40>]
>> vmx_vmexit_handler+0x156c/0x1e45
>> (XEN) [  200.923117]    [<ffff82d08020824c>]
>> vmx_asm_vmexit_handler+0xec/0x250
> 
> What lock is it waiting on? Is this the paging lock? If so you will
> have to figure out who is holding this lock.

It turns out that it's the p2m lock. I've looked at the code more
closely, and hvm_hap_nested_page_fault() takes a p2m lock:

1923     /*
1924      * Take a lock on the host p2m speculatively, to avoid potential
1925      * locking order problems later and to handle unshare etc.
1926      */
1927     hostp2m = p2m_get_hostp2m(currd);

then ends up calling p2m_mem_access_check() with said lock taken.

Then p2m_mem_access_check() also takes gfn_lock(p2m, gfn, 0) in a few
places (although it releases those again by the time
vmx_start_reexecute_instruction() gets called).

And then, of course, vmx_start_reexecute_instruction() runs and gets
stuck in that loop (it doesn't matter whether we manually look at
is_running or call vcpu_pause(), the effect is the same), while the
other VCPU ends up running get_page_from_gfn_p2m(), which also tries
to lock the p2m and deadlocks.
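
So, as far as I can tell, the interleaving is roughly:

  d1v1: hvm_hap_nested_page_fault() takes the host p2m lock
  d1v1: p2m_mem_access_check() -> vmx_start_reexecute_instruction()
  d1v1: spins, waiting for d1v0 to stop running
  d1v0: hvm_hap_nested_page_fault() -> handle_mmio() -> __hvm_copy()
  d1v0: get_page_from_gfn_p2m() spins, waiting for the p2m lock

i.e. neither side can make progress.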

I've placed printk()s before and after p2m_read_lock(p2m); in the if (
likely(!p2m_locked_by_me(p2m)) ) conditional in get_page_from_gfn_p2m(),
and it seems to confirm this theory - the last thing that gets printed
before the crash is the line before p2m_read_lock(p2m).

This seems to imply that is_running gets cleared much later than
in_host gets set, so with the in_host code
vmx_start_reexecute_instruction() (and thus p2m_mem_access_check() and
hvm_hap_nested_page_fault()) is able to exit in a timely manner,
allowing get_page_from_gfn_p2m() to take the p2m lock afterwards, and
the show goes on.

> Is this on top of plain staging, or do you have other changes applied
> to Xen?

My tests are done on an older, XenServer-based Xen 4.7.5. I also have
a patch applied, kindly provided by Andrew, that dumps all the CPUs on
crash (otherwise I'd only see one of them, which is not very helpful
in situations like these).


Thanks,
Razvan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 15:25               ` Razvan Cojocaru
@ 2018-11-22 15:37                 ` Roger Pau Monné
  2018-11-22 16:52                   ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 15:37 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 05:25:02PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 4:49 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 02:48:07PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 12:58 PM, Roger Pau Monné wrote:
> >>> On Thu, Nov 22, 2018 at 12:14:59PM +0200, Razvan Cojocaru wrote:
> >>>> On 11/22/18 12:05 PM, Roger Pau Monné wrote:
> >>>>> On Wed, Nov 21, 2018 at 08:55:48PM +0200, Razvan Cojocaru wrote:
> >>>>>> On 11/16/18 7:04 PM, Roger Pau Monné wrote:
> >>>>>>>> +            if ( a == v )
> >>>>>>>> +                continue;
> >>>>>>>> +
> >>>>>>>> +            /* Pause, synced. */
> >>>>>>>> +            while ( !a->arch.in_host )
> >>>>>>> Why not use a->is_running as a way to know whether the vCPU is
> >>>>>>> running?
> >>>>>>>
> >>>>>>> I think the logic of using vcpu_pause and expecting the running vcpu
> >>>>>>> to take a vmexit and thus set in_host is wrong because a vcpu that
> >>>>>>> wasn't running when vcpu_pause_nosync is called won't get scheduled
> >>>>>>> anymore, thus not taking a vmexit and this function will lockup.
> >>>>>>>
> >>>>>>> I don't think you need the in_host boolean at all.
> >>>>>>>
> >>>>>>>> +                cpu_relax();
> >>>>>>> Is this really better than using vcpu_pause?
> >>>>>>>
> >>>>>>> I assume this is done to avoid waiting on each vcpu, and instead doing
> >>>>>>> it here likely means less wait time?
> >>>>>>
> >>>>>> The problem with plain vcpu_pause() is that we weren't able to use it,
> >>>>>> for the same reason (which remains unclear as of yet) that we couldn't
> >>>>>> use a->is_running: we get CPU stuck hypervisor crashes that way. Here's
> >>>>>> one that uses the same logic, but loops on a->is_running instead of
> >>>>>> !a->arch.in_host:
> >>>
> >>> [...]
> >>>
> >>>>>> Some scheduler magic appears to happen here where it is unclear why
> >>>>>> is_running doesn't seem to end up being 0 as expected in our case. We'll
> >>>>>> keep digging.
> >>>>>
> >>>>> There seems to be some kind of deadlock between
> >>>>> vmx_start_reexecute_instruction and hap_track_dirty_vram/handle_mmio.
> >>>>> Are you holding a lock while trying to put the other vcpus to sleep?
> >>>>
> >>>> d->arch.rexec_lock, but I don't see how that would matter in this case.
> >>>
> >>> The trace from pCPU#0:
> >>>
> >>> (XEN) [ 3668.016989] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d0v0)
> >>> [...]
> >>> (XEN) [ 3668.275417] Xen call trace:
> >>> (XEN) [ 3668.278714]    [<ffff82d0801327d2>] vcpu_sleep_sync+0x40/0x71
> >>> (XEN) [ 3668.284952]    [<ffff82d08010735b>] domain.c#do_domain_pause+0x33/0x4f
> >>> (XEN) [ 3668.291973]    [<ffff82d08010879a>] domain_pause+0x25/0x27
> >>> (XEN) [ 3668.297952]    [<ffff82d080245e69>] hap_track_dirty_vram+0x2c1/0x4a7
> >>> (XEN) [ 3668.304797]    [<ffff82d0801dd8f5>] do_hvm_op+0x18be/0x2b58
> >>> (XEN) [ 3668.310864]    [<ffff82d080172aca>] pv_hypercall+0x1e5/0x402
> >>> (XEN) [ 3668.317017]    [<ffff82d080250899>] entry.o#test_all_events+0/0x3d
> >>>
> >>> Shows there's an hypercall executed from Dom0 that's trying to pause
> >>> the domain, thus pausing all the vCPUs.
> >>>
> >>> Then pCPU#3:
> >>>
> >>> (XEN) [ 3669.062841] RFLAGS: 0000000000000202   CONTEXT: hypervisor (d1v0)
> >>> [...]
> >>> (XEN) [ 3669.322832] Xen call trace:
> >>> (XEN) [ 3669.326128]    [<ffff82d08021006a>] vmx_start_reexecute_instruction+0x107/0x68a
> >>> (XEN) [ 3669.333925]    [<ffff82d080210b3e>] p2m_mem_access_check+0x551/0x64d
> >>> (XEN) [ 3669.340774]    [<ffff82d0801dee9e>] hvm_hap_nested_page_fault+0x2f2/0x631
> >>> (XEN) [ 3669.348051]    [<ffff82d080202c00>] vmx_vmexit_handler+0x156c/0x1e45
> >>> (XEN) [ 3669.354899]    [<ffff82d08020820c>] vmx_asm_vmexit_handler+0xec/0x250
> >>>
> >>> Seems to be blocked in vmx_start_reexecute_instruction, and thus not
> >>> getting paused and triggering the watchdog on pCPU#0?
> >>>
> >>> You should check on which vCPU is the trace from pCPU#0 waiting, if
> >>> that's the vCPU running on pCPU#3 (d1v0) you will have to check what's
> >>> taking such a long time in vmx_start_reexecute_instruction.
> >>
> >> Right, so this is what appears to be happening, if the output of my test
> >> is to be trusted: https://pastebin.com/YEDqNuwh
> >>
> >> 1. vmx_start_reexecute_instruction() pauses all VCPUs but self (which
> >> appears to be VCPU 1):
> >>
> >> (XEN) [  195.427141] 0 pause_count 0
> >> (XEN) [  195.427142] 2 pause_count 0
> >> (XEN) [  195.427143] 3 pause_count 0
> >> (XEN) [  195.427144] 4 pause_count 0
> >> (XEN) [  195.427146] 5 pause_count 0
> >> (XEN) [  195.427147] 6 pause_count 0
> >> (XEN) [  195.427148] 7 pause_count 0
> > 
> > The diff below doesn't show where you add this message, neither
> > what's actually printing. I guess the first number is the vCPU ID, and
> > the second the value of pause_count at some point?
> 
> Yes, exactly. So the above tells us that VCPUs 0 and 2-7 have been
> paused (nosync) by vmx_start_reexecute_instruction(), which is now doing
> a while ( a->is_running ) cpu_relax().
> 
> >> 2. The hypercall happens, which calls domain_pause(), which I've
> >> modified thus:
> >>
> >> @@ -959,7 +961,10 @@ static void do_domain_pause(struct domain *d,
> >>      atomic_inc(&d->pause_count);
> >>
> >>      for_each_vcpu( d, v )
> >> +    {
> >> +        printk("domain_pause %d\n", v->vcpu_id);
> > 
> > Could you print both the domain and the vcpu ids?
> 
> Of course, but I think I've found the issue (please see below).
> 
> >>          sleep_fn(v);
> >> +    }
> >>
> >>      arch_domain_pause(d);
> >>  }
> >>
> >> and which says:
> >>
> >> (XEN) [  195.492064] domain_pause 0
> > 
> > This is the hypercall code waiting for domain 1 vCPU 0 to pause?
> 
> Yes.
> 
> >> 3. At this point, according to addr2line,
> >> vmx_start_reexecute_instruction() does "while ( a->is_running )
> >> cpu_relax();" for all VCPUs but itself.
> > 
> > Why don't you just start by using:
> > 
> > for_each_vcpu( d, v )
> >     if ( v != current )
> >         vcpu_pause(v);
> > 
> > Instead of open-coding it in vmx_start_reexecute_instruction.
> 
> That's the intention if we can get it to work.
> 
> >> Now, d1v0, which, if I'm reading this correctly, is the VCPU that
> >> domain_pause() is stuck waiting for, does:
> >>
> >> (XEN) [  200.829874] Xen call trace:
> >> (XEN) [  200.833166]    [<ffff82d0801278c6>]
> >> queue_read_lock_slowpath+0x25/0x4d
> >> (XEN) [  200.840186]    [<ffff82d08020c1f6>]
> >> get_page_from_gfn_p2m+0x14e/0x3b0
> >> (XEN) [  200.847121]    [<ffff82d080247213>]
> >> hap_p2m_ga_to_gfn_4_levels+0x48/0x299
> >> (XEN) [  200.854400]    [<ffff82d080247480>]
> >> hap_gva_to_gfn_4_levels+0x1c/0x1e
> >> (XEN) [  200.861331]    [<ffff82d08021275c>] paging_gva_to_gfn+0x10e/0x11d
> >> (XEN) [  200.867918]    [<ffff82d0801d66e0>] hvm.c#__hvm_copy+0x98/0x37f
> >> (XEN) [  200.874329]    [<ffff82d0801d848d>]
> >> hvm_fetch_from_guest_virt_nofault+0x14/0x16
> >> (XEN) [  200.882130]    [<ffff82d0801d141a>]
> >> emulate.c#_hvm_emulate_one+0x118/0x2bc
> >> (XEN) [  200.889496]    [<ffff82d0801d16b4>] hvm_emulate_one+0x10/0x12
> >> (XEN) [  200.895735]    [<ffff82d0801e0902>] handle_mmio+0x52/0xc9
> >> (XEN) [  200.901626]    [<ffff82d0801e09ba>]
> >> handle_mmio_with_translation+0x41/0x43
> >> (XEN) [  200.908994]    [<ffff82d0801ded1f>]
> >> hvm_hap_nested_page_fault+0x133/0x631
> >> (XEN) [  200.916271]    [<ffff82d080202c40>]
> >> vmx_vmexit_handler+0x156c/0x1e45
> >> (XEN) [  200.923117]    [<ffff82d08020824c>]
> >> vmx_asm_vmexit_handler+0xec/0x250
> > 
> > What lock is it waiting on? Is this the paging lock? If so you will
> > have to figure out who is holding this lock.
> 
> It turns out that it's the p2m lock. I've looked at the code more
> closely, and hvm_hap_nested_page_fault() takes a p2m lock:
> 
> 1923     /*
> 1924      * Take a lock on the host p2m speculatively, to avoid potential
> 1925      * locking order problems later and to handle unshare etc.
> 1926      */
> 1927     hostp2m = p2m_get_hostp2m(currd);
> 
> then ends up calling p2m_mem_access_check() with said lock taken.
> 
> Then p2m_mem_access_check() also does a bit of gfn_lock(p2m, gfn, 0)
> (although it also unlocks those by the time
> vmx_start_reexecute_instruction() gets called).
> 
> And then, of course, vmx_start_reexecute() runs, gets stuck in that loop
> (it doesn't matter if we manually look at is_running or call
> vcpu_pause(), the effect is the same), while the other VCPU ends up
> running get_page_from_gfn_p2m(), which tries to also lock the p2m and
> deadlocks.

I don't think you are supposed to try to pause other vcpus while
holding a lock, as you can see it's quite likely that you will end up
deadlocking because the vCPU you are trying to pause is stuck waiting
on the lock that you are holding.

You should figure out whether you can get into vmx_start_reexecute
without holding any locks, or alternatively drop the lock, pause the
vCPUs and pick the lock again.

See for example how hap_track_dirty_vram releases the lock before
attempting to pause the domain for this same reason.
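
Very roughly, the ordering I mean is something like the following (just
an untested sketch; I'm reusing the gfn_lock/gfn_unlock helpers you
mentioned, your actual call site may hold a different lock):

    /* Drop the lock currently held before touching any other vCPU. */
    gfn_unlock(p2m, gfn, 0);

    /* Pause the other vCPUs without holding any lock. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_pause(v);

    /* Only now re-take the lock and do the p2m/EPT work. */
    gfn_lock(p2m, gfn, 0);
    /* ... adjust access rights, arm MTF, etc. ... */
    gfn_unlock(p2m, gfn, 0);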

Roger.


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 15:37                 ` Roger Pau Monné
@ 2018-11-22 16:52                   ` Razvan Cojocaru
  2018-11-22 17:08                     ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 16:52 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> I don't think you are supposed to try to pause other vcpus while
> holding a lock, as you can see it's quite likely that you will end up
> deadlocking because the vCPU you are trying to pause is stuck waiting
> on the lock that you are holding.
> 
> You should figure out whether you can get into vmx_start_reexecute
> without holding any locks, or alternatively drop the lock, pause the
> vCPUs and pick the lock again.
> 
> See for example how hap_track_dirty_vram releases the lock before
> attempting to pause the domain for this same reason.

Right, this will take more thinking.

I've unlocked the p2m for testing and the initial hang is gone, however
the same problem now applies to rexec_lock: nothing prevents two or more
VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
at which point one of them might take the lock and try to pause the
other, while the other is waiting to take the lock, with predictable
results.

On the other hand, releasing rexec_lock as well will allow two VCPUs to
end up trying to pause each other (especially unpleasant in a 2 VCPU
guest). At any given moment, there should be only one VCPU alive and
trying to reexecute an instruction - and at least one VCPU alive on the
guest.

We'll get more coffee, and of course suggestions are appreciated (as has
been all your help).


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 16:52                   ` Razvan Cojocaru
@ 2018-11-22 17:08                     ` Roger Pau Monné
  2018-11-22 18:24                       ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-22 17:08 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jun.nakajima, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS, jbeulich,
	Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> > I don't think you are supposed to try to pause other vcpus while
> > holding a lock, as you can see it's quite likely that you will end up
> > deadlocking because the vCPU you are trying to pause is stuck waiting
> > on the lock that you are holding.
> > 
> > You should figure out whether you can get into vmx_start_reexecute
> > without holding any locks, or alternatively drop the lock, pause the
> > vCPUs and pick the lock again.
> > 
> > See for example how hap_track_dirty_vram releases the lock before
> > attempting to pause the domain for this same reason.
> 
> Right, this will take more thinking.
> 
> I've unlocked the p2m for testing and the initial hang is gone, however
> the same problem now applies to rexec_lock: nothing prevents two or more
> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
> at which point one of them might take the lock and try to pause the
> other, while the other is waiting to take the lock, with predictable
> results.
> 
> On the other hand, releasing rexec_lock as well will allow two VCPUs to
> end up trying to pause each other (especially unpleasant in a 2 VCPU
> guest). At any given moment, there should be only one VCPU alive and
> trying to reexecute an instruction - and at least one VCPU alive on the
> guest.
> 
> We'll get more coffee, and of course suggestions are appreciated (as has
> been all your help).

Hm, I don't think it's generally safe to try to pause domain vCPUs
from the same domain context, as you say it's likely to deadlock since
two vCPUs from the same domain might try to pause one another.

My knowledge of all this introspection logic is very vague, do you
really need to stop the other vCPUs while performing this reexecution?

What are you trying to prevent by pausing other vCPUs?

Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 17:08                     ` Roger Pau Monné
@ 2018-11-22 18:24                       ` Razvan Cojocaru
  2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
  0 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-22 18:24 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel, Anshul Makkar

On 11/22/18 7:08 PM, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
>> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
>>> I don't think you are supposed to try to pause other vcpus while
>>> holding a lock, as you can see it's quite likely that you will end up
>>> deadlocking because the vCPU you are trying to pause is stuck waiting
>>> on the lock that you are holding.
>>>
>>> You should figure out whether you can get into vmx_start_reexecute
>>> without holding any locks, or alternatively drop the lock, pause the
>>> vCPUs and pick the lock again.
>>>
>>> See for example how hap_track_dirty_vram releases the lock before
>>> attempting to pause the domain for this same reason.
>>
>> Right, this will take more thinking.
>>
>> I've unlocked the p2m for testing and the initial hang is gone, however
>> the same problem now applies to rexec_lock: nothing prevents two or more
>> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
>> at which point one of them might take the lock and try to pause the
>> other, while the other is waiting to take the lock, with predictable
>> results.
>>
>> On the other hand, releasing rexec_lock as well will allow two VCPUs to
>> end up trying to pause each other (especially unpleasant in a 2 VCPU
>> guest). At any given moment, there should be only one VCPU alive and
>> trying to reexecute an instruction - and at least one VCPU alive on the
>> guest.
>>
>> We'll get more coffee, and of course suggestions are appreciated (as has
>> been all your help).
> 
> Hm, I don't think it's generally safe to try to pause domain vCPUs
> from the same domain context, as you say it's likely to deadlock since
> two vCPUs from the same domain might try to pause one another.
> 
> My knowledge of all this introspection logic is very vague, do you
> really need to stop the other vCPUs while performing this reexecution?
> 
> What are you trying to prevent by pausing other vCPUs?

Yes, that's unfortunately very necessary.

The scenario is this: for introspection purposes, a bunch of pages are
marked read-only in the EPT (or no-execute, but for the purposes of this
example let's stick to read-only).

Now, we'll get vm_events whenever an instruction will try to write into
one of those. Vm_events are expensive, so we _really_ want to get as few
of those as possible while still keeping the guest protected. So we want
to filter out irrelevant ones.

The main category of irrelevant ones are faults caused by walking the
guest's page table. We only want events caused by an actual write into a
protected page by an actual instruction running at RIP in the guest.

So, we don't want to get those vm_events where npfec.kind !=
npfec_kind_with_gla in p2m_mem_access_check(), hence this patch:

https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=c5387c4d75602dbb2f0d3d961a5c4b8faf3873db

_However_, please picture an instruction that both writes into a page P1
we're interested in, _and_ causes a write into a read-only page-walk
related page P2. Emulating the current instruction, as the upstream
patch does, does eliminate the vm_event caused by writing into P2, but
with the unfortunate side-effect of losing a potentially critical event
for the write into P1.

What this patch attempts to do is to mark P1 rwx (so allow the write),
then put the faulting VCPU into singlestep mode, then restore the
restrictions after it has finished single stepping. By now it's obvious
why all the other VCPUs need to be paused: one of them might do a
malicious write into P1 that silently succeeds (since the EPT is shared
among all VCPUs - putting altp2m aside for a moment). We don't want that.
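
In heavily simplified pseudo-C, the intended sequence is (sketch only -
set_page_access(), gfn_of_P1 and old_access below are placeholders, not
the actual helpers the patch uses):

    /* 1. Pause every other vCPU so nobody can write P1 in the meantime. */
    for_each_vcpu ( d, a )
        if ( a != current )
            vcpu_pause_nosync(a);

    /* 2. Temporarily grant rwx on P1 (placeholder helper). */
    set_page_access(d, gfn_of_P1, XENMEM_access_rwx);

    /* 3. Put the faulting vCPU in single-step mode (monitor trap flag)
     *    and let it retire the instruction. */

    /* 4. On the MTF vmexit, restore the old restrictions on P1 ... */
    set_page_access(d, gfn_of_P1, old_access);

    /* 5. ... and unpause the other vCPUs. */
    for_each_vcpu ( d, a )
        if ( a != current )
            vcpu_unpause(a);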

Alternatively, we'd be happy with simply being able to set the relevant
A/D bits in the pages touched by the page walk, but after lengthy
negotiations that can be found in the xen-devel archives we were unable
to find a safe, architecturally correct way of doing that.

I hope this sheds some light on it.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-22 18:24                       ` Razvan Cojocaru
@ 2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
  2018-11-27 10:31                           ` Razvan Cojocaru
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
  1 sibling, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-23  8:54 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On Thu, Nov 22, 2018 at 08:24:52PM +0200, Razvan Cojocaru wrote:
> On 11/22/18 7:08 PM, Roger Pau Monné wrote:
> > On Thu, Nov 22, 2018 at 06:52:07PM +0200, Razvan Cojocaru wrote:
> >> On 11/22/18 5:37 PM, Roger Pau Monné wrote:
> >>> I don't think you are supposed to try to pause other vcpus while
> >>> holding a lock, as you can see it's quite likely that you will end up
> >>> deadlocking because the vCPU you are trying to pause is stuck waiting
> >>> on the lock that you are holding.
> >>>
> >>> You should figure out whether you can get into vmx_start_reexecute
> >>> without holding any locks, or alternatively drop the lock, pause the
> >>> vCPUs and pick the lock again.
> >>>
> >>> See for example how hap_track_dirty_vram releases the lock before
> >>> attempting to pause the domain for this same reason.
> >>
> >> Right, this will take more thinking.
> >>
> >> I've unlocked the p2m for testing and the initial hang is gone, however
> >> the same problem now applies to rexec_lock: nothing prevents two or more
> >> VCPUs from arriving in vmx_start_reexecute_instruction() simultaneously,
> >> at which point one of them might take the lock and try to pause the
> >> other, while the other is waiting to take the lock, with predictable
> >> results.
> >>
> >> On the other hand, releasing rexec_lock as well will allow two VCPUs to
> >> end up trying to pause each other (especially unpleasant in a 2 VCPU
> >> guest). At any given moment, there should be only one VCPU alive and
> >> trying to reexecute an instruction - and at least one VCPU alive on the
> >> guest.
> >>
> >> We'll get more coffee, and of course suggestions are appreciated (as has
> >> been all your help).
> > 
> > Hm, I don't think it's generally safe to try to pause domain vCPUs
> > from the same domain context, as you say it's likely to deadlock since
> > two vCPUs from the same domain might try to pause one another.
> > 
> > My knowledge of all this introspection logic is very vague, do you
> > really need to stop the other vCPUs while performing this reexecution?
> > 
> > What are you trying to prevent by pausing other vCPUs?
> 
> Yes, that's unfortunately very necessary.
> 
> The scenario is this: for introspection purposes, a bunch of pages are
> marked read-only in the EPT (or no-execute, but for the purposes of this
> example let's stick to read-only).
> 
> Now, we'll get vm_events whenever an instruction will try to write into
> one of those. Vm_events are expensive, so we _really_ want to get as few
> of those as possible while still keeping the guest protected. So we want
> to filter out irrelevant ones.
> 
> The main category of irrelevant ones are faults caused by walking the
> guest's page table. We only want events caused by an actual write into a
> protected page by an actual instruction running at RIP in the guest.
> 
> So, we don't want to get those vm_events where npfec.kind !=
> npfec_kind_with_gla in p2m_mem_access_check(), hence this patch:
> 
> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=c5387c4d75602dbb2f0d3d961a5c4b8faf3873db
> 
> _However_, please picture an instruction that both writes into a page P1
> we're interested in, _and_ causes a write into a read-only page-walk
> related page P2. Emulating the current instruction, as the upstream
> patch does, does eliminate the vm_event caused by writing into P2, but
> with the unfortunate side-effect of losing a potentially critical event
> for the write into P1.

How could the event for P1 be lost? If the instruction writes to both
P1 and P2, you already got some kind of event since writing to P1
would trigger a fault. Then you can just discard the P2 part, forward
the P1 access and just emulate the instruction?

(I guess I'm missing something on the above)

> What this patch attempts to do is to mark P1 rwx (so allow the write),
> then put the faulting VCPU into singlestep mode, then restore the
> restrictions after it has finished single stepping. By now it's obvious
> why all the other VCPUs need to be paused: one of them might do a
> malicious write into P1 that silently succeeds (since the EPT is shared
> among all VCPUs - putting altp2m aside for a moment). We don't want that.

Can't you just change the p2m of a single vCPU? Either using altp2m or
some other mechanism.

Also keep in mind that this pause approach might work for guests with
a relatively small number of vCPUs, but I'm unsure it is going to
work for guests with a high number of vCPUs: pausing all vCPUs for each
trapped instruction is likely going to stall the guest.

Thanks, Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
@ 2018-11-23  9:07                           ` Jan Beulich
  2018-11-27 10:49                             ` Razvan Cojocaru
       [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
  1 sibling, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2018-11-23  9:07 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
> _However_, please picture an instruction that both writes into a page P1
> we're interested in, _and_ causes a write into a read-only page-walk
> related page P2. Emulating the current instruction, as the upstream
> patch does, does eliminate the vm_event caused by writing into P2, but
> with the unfortunate side-effect of losing a potentially critical event
> for the write into P1.
> 
> What this patch attempts to do is to mark P1 rwx (so allow the write),
> then put the faulting VCPU into singlestep mode, then restore the
> restrictions after it has finished single stepping. By now it's obvious
> why all the other VCPUs need to be paused: one of them might do a
> malicious write into P1 that silently succeeds (since the EPT is shared
> among all VCPUs - putting altp2m aside for a moment). We don't want that.

I think this all goes into the fundamentally wrong direction. If lost
events during emulation are your issue, then let's make sure
emulation paths trigger the same events hardware would.

With a sufficiently complete insn emulator, single-stepping should
not be needed at all imo. Granted we're not quite there yet with
the emulator, but we've made quite a bit of progress. As before,
if there are particular instructions you know of that the emulator
doesn't handle yet, please keep pointing these out. Last I know
were some AVX move instructions, which have long been
implemented.

> Alternatively, we'd be happy with simply being able to set the relevant
> A/D bits in the pages touched by the page walk, but after lengthy
> negotiations that can be found in the xen-devel archives we were unable
> to find a safe, architecturally correct way of doing that.

Hmm, I don't recall that we had settled that this would be entirely
impossible, but then again - as per above - this as well was only
curing symptoms rather than the cause.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
@ 2018-11-23  9:10                             ` Jan Beulich
       [not found]                             ` <9B05ED9E020000C434861ACF@prv1-mh.provo.novell.com>
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-11-23  9:10 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, aisaila, xen-devel

>>> On 23.11.18 at 09:54, <roger.pau@citrix.com> wrote:
> On Thu, Nov 22, 2018 at 08:24:52PM +0200, Razvan Cojocaru wrote:
>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> Can't you just change the p2m of a single vCPU? Either using altp2m or
> some other mechanism.

I guess as a very basic limitation there are not enough distinct
altp2m-s available to use one per vCPU.

> Also keep in mind that this pause approach might work for guests with
> a relatively small number of vCPUs, but I'm unsure it is going to
> work for guests with a high number of vCPUs: pausing all vCPUs for each
> trapped instruction is likely going to stall the guest.

Indeed. Yet for smaller guests a per-vCPU-altp2m approach would
seem to be feasible at least from an abstract pov.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-23  8:54                         ` Roger Pau Monné
       [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
@ 2018-11-27 10:31                           ` Razvan Cojocaru
  2018-11-27 11:32                             ` Roger Pau Monné
  1 sibling, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 10:31 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

>> _However_, please picture an instruction that both writes into a page P1
>> we're interested in, _and_ causes a write into a read-only page-walk
>> related page P2. Emulating the current instruction, as the upstream
>> patch does, does eliminate the vm_event caused by writing into P2, but
>> with the unfortunate side-effect of losing a potentially critical event
>> for the write into P1.
> 
> How could the event for P1 be lost? If the instruction writes to both
> P1 and P2, you already got some kind of event since writing to P1
> would trigger a fault. Then you can just discard the P2 part, forward
> the P1 access and just emulate the instruction?

Sorry for the late reply, I'm not in the office and have spotty access
to a real computer.

The instruction will write to P1, and running it will trigger a page
walk that writes into P2 (where both P1 and P2 are write-protected).

The Xen emulator currently _completely_ ignores EPT restrictions, which
is both the reason why we're able to use it for introspection purposes
(so we can run instructions that write to protected pages that we've
deemed to be safe, without lifting said restrictions), and the problem
in this case.

So emulating the instruction we're talking about will silently write
both P1 and P2, even though we'd like the write to P2 (the page walk
part) to succeed, but still have the vm_event for P1.

>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> Can't you just change the p2m of a single vCPU? Either using altp2m or
> some other mechanism.

As Jan has pointed out, we'd need too many altp2ms (there's currently a
hardcoded limit of 10 in Xen). But even more importantly, perhaps, is
that altp2m is not usable at all at the moment (at least until the
series I've been working on with George's kind help goes in) - because
the guests' displays freeze when switching to a new altp2m early on
boot, or after a screen resize.

Also, not all Intel hardware supports altp2m, and while Xen does emulate
altp2m support for hardware that does not, it's not ideal to use that
performance-wise.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-23  9:07                           ` Jan Beulich
@ 2018-11-27 10:49                             ` Razvan Cojocaru
  2018-11-27 11:28                               ` Jan Beulich
  2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
  0 siblings, 2 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 10:49 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

On 11/23/18 11:07 AM, Jan Beulich wrote:
>>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
>> _However_, please picture an instruction that both writes into a page P1
>> we're interested in, _and_ causes a write into a read-only page-walk
>> related page P2. Emulating the current instruction, as the upstream
>> patch does, does eliminate the vm_event caused by writing into P2, but
>> with the unfortunate side-effect of losing a potentially critical event
>> for the write into P1.
>>
>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>> then put the faulting VCPU into singlestep mode, then restore the
>> restrictions after it has finished single stepping. By now it's obvious
>> why all the other VCPUs need to be paused: one of them might do a
>> malicious write into P1 that silently succeeds (since the EPT is shared
>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> 
> I think this all goes into the fundamentally wrong direction. If lost
> events during emulation are your issue, then let's make sure
> emulation paths trigger the same events hardware would.

It's complicated: we very much like that the emulator is ignoring page
restrictions - this allows us to proceed with instructions writing into
protected pages without lifting said restrictions (when those
instructions are deemed to be safe by the introspection engine). That is
the most efficient mechanism we have, since we can just reply "emulate"
to a vm_event and that's it.

The alternative is to use altp2m, have an unrestricted view (view 0 fits
the bill nicely since restrictions on it propagate to all other active
altp2ms), get an EPT fault vm_event, reply with "switch to view 0 and
put VCPU in single-step mode", then wait for the single step event, and
then reply "switch back to restricted altp2m view and get the VCPU out
of single-step mode". Clearly the altp2m option is at least twice as
slow, so we prefer to emulate the instruction with a single vm_event
reply and move on.
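
From the introspection agent's point of view the two kinds of replies
look roughly like this (a sketch against the public vm_event interface;
instruction_is_safe is a placeholder for the engine's decision, and the
exact flag usage may differ a bit between Xen versions):

    /* req is the vm_event request just read from the ring. */
    vm_event_response_t rsp = {
        .version = VM_EVENT_INTERFACE_VERSION,
        .vcpu_id = req.vcpu_id,
        .reason  = req.reason,
    };

    if ( instruction_is_safe )
        /* Fast path: have Xen emulate it, ignoring the EPT restrictions. */
        rsp.flags = VM_EVENT_FLAG_VCPU_PAUSED | VM_EVENT_FLAG_EMULATE;
    else
    {
        /* Slow path: run it natively from the unrestricted view 0 while
         * single-stepping; a second reply later switches back to the
         * restricted view and leaves single-step mode. */
        rsp.flags = VM_EVENT_FLAG_VCPU_PAUSED | VM_EVENT_FLAG_ALTERNATE_P2M |
                    VM_EVENT_FLAG_TOGGLE_SINGLESTEP;
        rsp.altp2m_idx = 0;
    }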

A special case is when the emulator doesn't support an instruction, in
which case we can get an UNIMPLEMENTED vm_event, and switch to altp2m
just for that. Between these, everything should be covered and the
guests should run without problems.

However, we need to get altp2m up to speed, fully working and reliable
to be able to do all that.

About the emulator and events: if we could have a toggle for the
emulator to tell it "emulate the current instruction and send out a
vm_event only if it touches a protected page that's NOT part of the page
walk", that would also work - though I can't at this point tell how
feasible those modifications are.

> With a sufficiently complete insn emulator, single-stepping should
> not be needed at all imo. Granted we're not quite there yet with
> the emulator, but we've made quite a bit of progress. As before,
> if there are particular instructions you know of that the emulator
> doesn't handle yet, please keep pointing these out. Last I know
> were some AVX move instructions, which have long been
> implemented.

True, I haven't seen emulator issues in that respect with staging - the
emulator appears lately to be sufficiently complete. Thank you very much
for your help and support - we'll definitely point out unsupported
instructions if we spot some again.

The bigger practical problem is having something that works with older
Xen versions - for example current XenServer releases are still on Xen
4.7, and backporting the whole emulator machinery there is not
reasonable, with the large number of changes that have occurred in the
meantime.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 10:49                             ` Razvan Cojocaru
@ 2018-11-27 11:28                               ` Jan Beulich
  2018-11-27 11:44                                 ` Razvan Cojocaru
  2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
  1 sibling, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2018-11-27 11:28 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>>> On 27.11.18 at 11:49, <rcojocaru@bitdefender.com> wrote:
> On 11/23/18 11:07 AM, Jan Beulich wrote:
>>>>> On 22.11.18 at 19:24, <rcojocaru@bitdefender.com> wrote:
>>> _However_, please picture an instruction that both writes into a page P1
>>> we're interested in, _and_ causes a write into a read-only page-walk
>>> related page P2. Emulating the current instruction, as the upstream
>>> patch does, does eliminate the vm_event caused by writing into P2, but
>>> with the unfortunate side-effect of losing a potentially critical event
>>> for the write into P1.
>>>
>>> What this patch attempts to do is to mark P1 rwx (so allow the write),
>>> then put the faulting VCPU into singlestep mode, then restore the
>>> restrictions after it has finished single stepping. By now it's obvious
>>> why all the other VCPUs need to be paused: one of them might do a
>>> malicious write into P1 that silently succeeds (since the EPT is shared
>>> among all VCPUs - putting altp2m aside for a moment). We don't want that.
>> 
>> I think this all goes into the fundamentally wrong direction. If lost
>> events during emulation are your issue, then let's make sure
>> emulation paths trigger the same events hardware would.
> 
> It's complicated: we very much like that the emulator is ignoring page
> restrictions - this allows us to proceed with instructions writing into
> protected pages without lifting said restrictions (when those
> instructions are deemed to be safe by the introspection engine). That is
> the most efficient mechanism we have, since we can just reply "emulate"
> to a vm_event and that's it.
> 
> The alternative is to use altp2m, have an unrestricted view (view 0 fits
> the bill nicely since restrictions on it propagate to all other active
> altp2ms), get an EPT fault vm_event, reply with "switch to view 0 and
> put VCPU in single-step mode", then wait for the single step event, and
> then reply "switch back to restricted altp2m view and get the VCPU out
> of single-step mode". Clearly the altp2m option is at least twice as
> slow, so we prefer to emulate the instruction with a single vm_event
> reply and move on.
> 
> A special case is when the emulator doesn't support an instruction, in
> which case we can get an UNIMPLEMENTED vm_event, and switch to altp2m
> just for that. Between these, everything should be covered and the
> guests should run without problems.
> 
> However, we need to get altp2m up to speed, fully working and reliable
> to be able to do all that.
> 
> About the emulator and events: if we could have a toggle for the
> emulator to tell it "emulate the current instruction and send out a
> vm_event only if it touches a protected page that's NOT part of the page
> walk", that would also work - though I can't at this point tell how
> feasible those modifications are.

For the emulation paths it is certainly possible to have controls for
(almost) everything, if needed. So going that route continues to
look more desirable to me than going the route you've chosen.

>> With a sufficiently complete insn emulator, single-stepping should
>> not be needed at all imo. Granted we're not quite there yet with
>> the emulator, but we've made quite a bit of progress. As before,
>> if there are particular instructions you know of that the emulator
>> doesn't handle yet, please keep pointing these out. Last I know
>> were some AVX move instructions, which have long been
>> implemented.
> 
> True, I haven't seen emulator issues in that respect with staging - the
> emulator appears lately to be sufficiently complete. Thank you very much
> for your help and support - we'll definitely point out unsupported
> instructions if we spot some again.
> 
> The bigger practical problem is having something that works with older
> Xen versions - for example current XenServer releases are still on Xen
> 4.7, and backporting the whole emulator machinery there is not
> reasonable, with the large number of changes that have occurred in the
> meantime.

Well - wouldn't this apply to extensive altp2m changes as well?

Jan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 10:31                           ` Razvan Cojocaru
@ 2018-11-27 11:32                             ` Roger Pau Monné
  2018-11-27 11:45                               ` Razvan Cojocaru
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  0 siblings, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2018-11-27 11:32 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On Tue, Nov 27, 2018 at 12:31:35PM +0200, Razvan Cojocaru wrote:
> >> _However_, please picture an instruction that both writes into a page P1
> >> we're interested in, _and_ causes a write into a read-only page-walk
> >> related page P2. Emulating the current instruction, as the upstream
> >> patch does, does eliminate the vm_event caused by writing into P2, but
> >> with the unfortunate side-effect of losing a potentially critical event
> >> for the write into P1.
> > 
> > How could the event for P1 be lost? If the instruction writes to both
> > P1 and P2, you already got some kind of event since writing to P1
> > would trigger a fault. Then you can just discard the P2 part, forward
> > the P1 access and just emulate the instruction?
> 
> Sorry for the late reply, I'm not in the office and have spotty access
> to a real computer.
> 
> The instruction will write to P1, and running it will trigger a page
> walk that writes into P2 (where both P1 and P2 are write-protected).
> 
> The Xen emulator currently _completely_ ignores EPT restrictions, which
> is both the reason why we're able to use it for introspection purposes
> (so we can run instructions that write to protected pages that we've
> deemed to be safe, without lifting said restrictions), and the problem
> in this case.
> 
> So emulating the instruction we're talking about will silently write
> both P1 and P2, even though we'd like the write to P2 (the page walk
> part) to succeed, but still have the vm_event for P1.

Would it be possible to add some kind of flag to the emulator to
signal whether p2m restrictions should be enforced/ignored?
hvmemul_acquire_page seems like a suitable place, but I'm not that
familiar with the emulator.

Then you could generate vm events from the emulator itself, which
AFAICT is the only way to handle this instruction execution issue.

> >> What this patch attempts to do is to mark P1 rwx (so allow the write),
> >> then put the faulting VCPU into singlestep mode, then restore the
> >> restrictions after it has finished single stepping. By now it's obvious
> >> why all the other VCPUs need to be paused: one of them might do a
> >> malicious write into P1 that silently succeeds (since the EPT is shared
> >> among all VCPUs - putting altp2m aside for a moment). We don't want that.
> > 
> > Can't you just change the p2m of a single vCPU? Either using altp2m or
> > some other mechanism.
> 
> As Jan has pointed out, we'd need too many altp2ms (there's currently a
> hardcoded limit of 10 in Xen). But even more importantly, perhaps, is
> that altp2m is not usable at all at the moment (at least until the
> series I've been working on with George's kind help goes in) - because
> the guests' displays freeze when switching to a new altp2m early on
> boot, or after a screen resize.
> 
> Also, not all Intel hardware supports altp2m, and while Xen does emulate
> altp2m support for hardware that does not, it's not ideal to use that
> performance-wise.

IMO, the best way is move forward with this issue is to enhance the
emulator to be able to generate vm events.

Thanks, Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:28                               ` Jan Beulich
@ 2018-11-27 11:44                                 ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 11:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, George Dunlap,
	Andrew Cooper, Mihai Dontu, Kevin Tian, Jun Nakajima, aisaila,
	xen-devel, Anshul Makkar, Roger Pau Monne

>> About the emulator and events: if we could have a toggle for the
>> emulator to tell it "emulate the current instruction and send out a
>> vm_event only if it touches a protected page that's NOT part of the page
>> walk", that would also work - though I can't at this point tell how
>> feasible those modifications are.
> 
> For the emulation paths it is certainly possible to have controls for
> (almost) everything, if needed. So going that route continues to
> look more desirable to me than going the route you've chosen.

It does sound very reasonable, we'll look into that.

>> The bigger practical problem is having something that works with older
>> Xen versions - for example current XenServer releases are still on Xen
>> 4.7, and backporting the whole emulator machinery there is not
>> reasonable, with the large number of changes that have occured in the
>> meantime.
>
> Well - wouldn't this apply to extensive altp2m changes as well?

It does; however, for one, the altp2m patches have for some reason so far
proven (for us at least) much easier to backport (and there are fewer of them).

However, you do make a good point, and, assuming this works, emulator
changes do seem able to solve the problem we're having in a way
that's acceptable to everyone and is architecturally correct.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:32                             ` Roger Pau Monné
@ 2018-11-27 11:45                               ` Razvan Cojocaru
  2018-11-27 11:59                                 ` Andrew Cooper
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  1 sibling, 1 reply; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 11:45 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, Alexandru Stefan ISAILA, xen-devel

On 11/27/18 1:32 PM, Roger Pau Monné wrote:
> Would it be possible to add some kind of flag to the emulator to
> signal whether p2m restrictions should be enforced/ignored?
> hvmemul_acquire_page seems like a suitable place, but I'm not that
> familiar with the emulator.
> 
> Then you could generate vm events from the emulator itself, which
> AFAICT is the only way to handle this instruction execution issue.

I hope so, we'll definitely look into that.

> IMO, the best way is move forward with this issue is to enhance the
> emulator to be able to generate vm events.

Right, it does look that way to me too. Hopefully we can get something
working that way.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:45                               ` Razvan Cojocaru
@ 2018-11-27 11:59                                 ` Andrew Cooper
  2018-11-27 12:12                                   ` Razvan Cojocaru
  0 siblings, 1 reply; 52+ messages in thread
From: Andrew Cooper @ 2018-11-27 11:59 UTC (permalink / raw)
  To: Razvan Cojocaru, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	Mihai Donțu, Andrei Vlad LUTAS, jun.nakajima,
	Alexandru Stefan ISAILA, xen-devel

On 27/11/2018 11:45, Razvan Cojocaru wrote:
> On 11/27/18 1:32 PM, Roger Pau Monné wrote:
>> Would it be possible to add some kind of flag to the emulator to
>> signal whether p2m restrictions should be enforced/ignored?
>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>> familiar with the emulator.
>>
>> Then you could generate vm events from the emulator itself, which
>> AFAICT is the only way to handle this instruction execution issue.
> I hope so, we'll definitely look into that.

FWIW, There is already a plan(tm).  It was discussed at least in part in
Budapest.

The emulator needs to start honouring P2M permissions and generating
vm_events.

Then, a vm_event response can reply saying "please emulate the
instruction with this temporary change to the permissions", so
write-ability to a read-only page can be granted at the discretion of
the introspection agent.

That said, there is a huge amount of work required to make this happen,
and I haven't had time to do a clear design yet.

~Andrew

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:59                                 ` Andrew Cooper
@ 2018-11-27 12:12                                   ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2018-11-27 12:12 UTC (permalink / raw)
  To: Andrew Cooper, Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	Mihai Donțu, Andrei Vlad LUTAS, jun.nakajima,
	Alexandru Stefan ISAILA, xen-devel

On 11/27/18 1:59 PM, Andrew Cooper wrote:
> On 27/11/2018 11:45, Razvan Cojocaru wrote:
>> On 11/27/18 1:32 PM, Roger Pau Monné wrote:
>>> Would it be possible to add some kind of flag to the emulator to
>>> signal whether p2m restrictions should be enforced/ignored?
>>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>>> familiar with the emulator.
>>>
>>> Then you could generate vm events from the emulator itself, which
>>> AFAICT is the only way to handle this instruction execution issue.
>> I hope so, we'll definitely look into that.
> 
> FWIW, There is already a plan(tm).  It was discussed at least in part in
> Budapest.
> 
> The emulator needs to start honouring P2M permissions and generating
> vm_events.
> 
> Then, a vm_event response can reply saying "please emulate the
> instruction with this temporary change to the permissions", so
> write-ability to a read-only page can be granted at the discretion of
> the introspection agent.
> 
> That said, there is a huge amount of work required to make this happen,
> and I haven't had time to do a clear design yet.

Right, but for starters all we need is the ability to say
"hvm_emulate_one_vm_event(bool honour_page_walk_faults, bool
honor_gla_faults)".

Then we just replace all callsites of hvm_emulate_one_vm_event() with
hvm_emulate_one_vm_event(false, false), and the one in
p2m_mem_access_check() that we currently have with
hvm_emulate_one_vm_event(false, true).
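
To make that concrete, the interface I have in mind is roughly this
(just the proposal, not existing code):

    /*
     * Proposed: emulate the current instruction, but send a vm_event for
     * a protected page only when the corresponding toggle asks for it.
     */
    void hvm_emulate_one_vm_event(bool honour_page_walk_faults,
                                  bool honor_gla_faults);

    /* Everywhere else: keep today's behaviour, ignore all restrictions. */
    hvm_emulate_one_vm_event(false, false);

    /* In p2m_mem_access_check(): keep ignoring page-walk writes, but do
     * report a write done by the emulated instruction itself. */
    hvm_emulate_one_vm_event(false, true);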

Hopefully that makes sense. :)

Finer grained vm_event-based control is probably useful, but to the best
of my knowledge not currently (or in the near-medium future) necessary.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-11-27 11:32                             ` Roger Pau Monné
  2018-11-27 11:45                               ` Razvan Cojocaru
@ 2018-12-19 16:49                               ` Alexandru Stefan ISAILA
  2018-12-19 17:40                                 ` Roger Pau Monné
  1 sibling, 1 reply; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-12-19 16:49 UTC (permalink / raw)
  To: Roger Pau Monné, Razvan Cojocaru
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, george.dunlap,
	andrew.cooper3, Mihai Donțu, Andrei Vlad LUTAS,
	jun.nakajima, xen-devel

On 27.11.2018 13:32, Roger Pau Monné wrote:
> Would it be possible to add some kind of flag to the emulator to
> signal whether p2m restrictions should be enforced/ignored?
> hvmemul_acquire_page seems like a suitable place, but I'm not that
> familiar with the emulator.
> 
> Then you could generate vm events from the emulator itself, which
> AFAICT is the only way to handle this instruction execution issue.

I've been testing which place would be best to have the emulator send
an event, and it turns out that hvmemul_acquire_page() is not OK. What
worked for me was having the emulator send write access violations from
hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
the event.

For the exec access violation I've tried to send events from
hvmemul_insn_fetch(), but there is a problem getting the same variables
for the event. Is there a way to work around this and get those params
for the event? Any thoughts are appreciated.
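
For reference, what I'm testing in hvmemul_map_linear_addr() looks
roughly like this (sketch only; it skips the monitor ring checks, gfn,
addr and curr stand for the local variables at that point in the
function, and the field/flag choice may still need adjusting):

    /* After the linear address has been translated to a gfn for the
     * write, build a mem_access request and send it synchronously. */
    vm_event_request_t req = {
        .reason = VM_EVENT_REASON_MEM_ACCESS,
        .u.mem_access.gfn = gfn_x(gfn),
        .u.mem_access.offset = addr & ~PAGE_MASK,
        .u.mem_access.gla = addr,
        .u.mem_access.flags = MEM_ACCESS_W | MEM_ACCESS_GLA_VALID,
    };

    monitor_traps(curr, true /* sync */, &req);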

Regards,
Alex
* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-12-19 16:49                               ` Alexandru Stefan ISAILA
@ 2018-12-19 17:40                                 ` Roger Pau Monné
  2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2018-12-19 17:40 UTC (permalink / raw)
  To: Alexandru Stefan ISAILA
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jun.nakajima, xen-devel

On Wed, Dec 19, 2018 at 04:49:43PM +0000, Alexandru Stefan ISAILA wrote:
> On 27.11.2018 13:32, Roger Pau Monné wrote:
> > Would it be possible to add some kind of flag to the emulator to
> > signal whether p2m restrictions should be enforced/ignored?
> > hvmemul_acquire_page seems like a suitable place, but I'm not that
> > familiar with the emulator.
> > 
> > Then you could generate vm events from the emulator itself, which
> > AFAICT is the only way to handle this instruction execution issue.
> 
> I've been testing which place would be best to have the emulator send
> an event, and it turns out that hvmemul_acquire_page() is not OK. What
> worked for me was having the emulator send write access violations from
> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
> the event.
> 
> For the exec access violation I've tried to send events from
> hvmemul_insn_fetch(), but there is a problem getting the same variables
> for the event. Is there a way to work around this and get those params
> for the event? Any thoughts are appreciated.

You have the IP in hvmemul_insn_fetch, can't you get the gfn from
there? Either directly or by translating the gla to a gfn if the guest
is running with paging enabled?

Roger.

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
       [not found]                                 ` <0D3C56BA0200004834861ACF@prv1-mh.provo.novell.com>
@ 2018-12-20  9:07                                   ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2018-12-20  9:07 UTC (permalink / raw)
  To: aisaila
  Cc: Andrei LUTAS, Tamas K Lengyel, Wei Liu, Razvan Cojocaru,
	George Dunlap, Andrew Cooper, Mihai Dontu, Kevin Tian,
	Jun Nakajima, xen-devel, Roger Pau Monne

>>> On 19.12.18 at 17:49, <aisaila@bitdefender.com> wrote:
> On 27.11.2018 13:32, Roger Pau Monné wrote:
>> Would it be possible to add some kind of flag to the emulator to
>> signal whether p2m restrictions should be enforced/ignored?
>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>> familiar with the emulator.
>> 
>> Then you could generate vm events from the emulator itself, which
>> AFAICT is the only way to handle this instruction execution issue.
> 
> I've been testing which place would be best to have the emulator send
> an event, and it turns out that hvmemul_acquire_page() is not OK. What
> worked for me was having the emulator send write access violations from
> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
> the event.
> 
> For the exec access violation I've tried to send events from
> hvmemul_insn_fetch(), but there is a problem getting the same variables
> for the event. Is there a way to work around this and get those params
> for the event? Any thoughts are appreciated.

One question is whether __hvmemul_read() couldn't / shouldn't
also be switched to use hvmemul_map_linear_addr(), which would
allow your insertion to live in one central place.

Jan


* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
  2018-12-19 17:40                                 ` Roger Pau Monné
@ 2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
  0 siblings, 0 replies; 52+ messages in thread
From: Alexandru Stefan ISAILA @ 2018-12-20 14:37 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: kevin.tian, tamas, wei.liu2, jbeulich, Razvan Cojocaru,
	george.dunlap, andrew.cooper3, Mihai Donțu,
	Andrei Vlad LUTAS, jun.nakajima, xen-devel



On 19.12.2018 19:40, Roger Pau Monné wrote:
> On Wed, Dec 19, 2018 at 04:49:43PM +0000, Alexandru Stefan ISAILA wrote:
>> On 27.11.2018 13:32, Roger Pau Monné wrote:
>>> Would it be possible to add some kind of flag to the emulator to
>>> signal whether p2m restrictions should be enforced/ignored?
>>> hvmemul_acquire_page seems like a suitable place, but I'm not that
>>> familiar with the emulator.
>>>
>>> Then you could generate vm events from the emulator itself, which
>>> AFAICT is the only way to handle this instruction execution issue.
>>
>> I've been testing what place would be the best to have the emulator send
>> a event and it turns out the hvmemul_acquire_page is not ok. What worked
>> form me was having the emulator send write access violations from
>> hvmemul_map_linear_addr(). Here I can get the gfn, gla and offset for
>> the event.
>>
>> For the exec access violation I've tried to send events from
>> hvmemul_insn_fetch() but there is a problem to get the same variables
>> for the event. Is there a way to go around and get those params for the
>> event? Any thoughts are appreciated.
> 
> You have the IP in hvmemul_insn_fetch, can't you get the gfn from
> there? Either directly or by translating the gla to a gfn if the guest
> is running with paging enabled?
> 
I've managed to solve the issue with hvmemul_linear_to_phys() for the gpa
and hvmemul_virtual_to_linear() for the gla. I will post the patch as an
RFC once I've tested that everything is OK.
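
In case it helps the discussion, the shape of it is roughly this (sketch
only - I'm writing the prototypes from memory, so please treat the exact
arguments as approximate):

    unsigned long gla, reps = 1;
    paddr_t gpa;
    int rc;

    /* Translate the fetch address (CS:IP) to a linear address, then to a
     * guest physical one, so the exec violation event can carry the same
     * fields as the write one. */
    rc = hvmemul_virtual_to_linear(x86_seg_cs, offset, bytes, &reps,
                                   hvm_access_insn_fetch, hvmemul_ctxt, &gla);
    if ( rc == X86EMUL_OKAY )
        rc = hvmemul_linear_to_phys(gla, &gpa, bytes, &reps,
                                    PFEC_page_present | PFEC_insn_fetch,
                                    hvmemul_ctxt);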

Thanks,
Alex
* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 13:58                                 ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-13 13:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>> With a sufficiently complete insn emulator, single-stepping should
>> not be needed at all imo. Granted we're not quite there yet with
>> the emulator, but we've made quite a bit of progress. As before,
>> if there are particular instructions you know of that the emulator
>> doesn't handle yet, please keep pointing these out. Last I know
>> were some AVX move instructions, which have long been
>> implemented.
> True, I haven't seen emulator issues in that respect with staging - the
> emulator appears lately to be sufficiently complete. Thank you very much
> for your help and support - we'll definitely point out unsupported
> instructions if we spot some again.

We've come across a new instruction that the emulator can't handle in 
Xen 4.13-unstable today:

vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20

Perhaps there are plans for this to go into the emulator as well?


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 14:06                                   ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2019-05-13 14:06 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>> With a sufficiently complete insn emulator, single-stepping should
>>> not be needed at all imo. Granted we're not quite there yet with
>>> the emulator, but we've made quite a bit of progress. As before,
>>> if there are particular instructions you know of that the emulator
>>> doesn't handle yet, please keep pointing these out. Last I know
>>> were some AVX move instructions, which have long been
>>> implemented.
>> True, I haven't seen emulator issues in that respect with staging - the
>> emulator appears lately to be sufficiently complete. Thank you very much
>> for your help and support - we'll definitely point out unsupported
>> instructions if we spot some again.
> 
> We've come across a new instruction that the emulator can't handle in
> Xen 4.13-unstable today:
> 
> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
> 
> Perhaps there are plans for this to go into the emulator as well?

You're kidding? This is already in 4.12.0, and if it weren't I'm sure
you're aware there are about 40 more AVX512 patches pending
review.

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-13 14:15                                     ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-13 14:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 5/13/19 5:06 PM, Jan Beulich wrote:
>>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
>> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>>> With a sufficiently complete insn emulator, single-stepping should
>>>> not be needed at all imo. Granted we're not quite there yet with
>>>> the emulator, but we've made quite a bit of progress. As before,
>>>> if there are particular instructions you know of that the emulator
>>>> doesn't handle yet, please keep pointing these out. Last I know
>>>> were some AVX move instructions, which have long been
>>>> implemented.
>>> True, I haven't seen emulator issues in that respect with staging - the
>>> emulator appears lately to be sufficiently complete. Thank you very much
>>> for your help and support - we'll definitely point out unsupported
>>> instructions if we spot some again.
>>
>> We've come across a new instruction that the emulator can't handle in
>> Xen 4.13-unstable today:
>>
>> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
>>
>> Perhaps there are plans for this to go into the emulator as well?
> 
> You're kidding? This is already in 4.12.0, and if it weren't I'm sure
> you're aware there are about 40 more AVX512 patches pending
> review.

Right, I did indeed forget about the pending review part; for some
reason I was sure they had made it in. I've double-checked and we really
are using 4.13-unstable, but we've also made changes to the emulator
while working on the send-vm-events-from-the-emulator patch, so we'll
revert to a pristine staging tree and retry; there's a chance this
happens because of our changes.

We'll find out what's going on exactly.


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 13:47                                       ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-14 13:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

On 5/13/19 5:15 PM, Razvan Cojocaru wrote:
> On 5/13/19 5:06 PM, Jan Beulich wrote:
>>>>> On 13.05.19 at 15:58, <rcojocaru@bitdefender.com> wrote:
>>> On 11/27/18 12:49 PM, Razvan Cojocaru wrote:
>>>>> With a sufficiently complete insn emulator, single-stepping should
>>>>> not be needed at all imo. Granted we're not quite there yet with
>>>>> the emulator, but we've made quite a bit of progress. As before,
>>>>> if there are particular instructions you know of that the emulator
>>>>> doesn't handle yet, please keep pointing these out. Last I know
>>>>> were some AVX move instructions, which have long been
>>>>> implemented.
>>>> True, I haven't seen emulator issues in that respect with staging - the
>>>> emulator appears lately to be sufficiently complete. Thank you very 
>>>> much
>>>> for your help and support - we'll definitely point out unsupported
>>>> instructions if we spot some again.
>>>
>>> We've come across a new instruction that the emulator can't handle in
>>> Xen 4.13-unstable today:
>>>
>>> vpmaddwd xmm4,xmm4,XMMWORD PTR ds:0x513fbb20
>>>
>>> Perhaps there are plans for this to go into the emulator as well?
>>
>> You're kidding? This is already in 4.12.0, and if it weren't I'm sure
>> you're aware there are about 40 more AVX512 patches pending
>> review.
> 
> Right, I did indeed forget about the pending review part, for some 
> reason I was sure they made it in. I've double-checked and we really are 
> using 4.13-unstable - but we've also made changes to the emulator, 
> working on the send-vm-events-from-the-emulator patch, so we'll revert 
> to a pristine staging and retry, there's a chance this happens because 
> of our changes.
> 
> We'll find out what's going on exactly.

I promised I'd return with more details. After some debugging, it 
certainly looks like the emulator returns UNIMPLEMENTED (5):

Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5 
05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d

Looking at the source code, the emulator does appear to support
vpmaddwd, but only for the EVEX encoding:

http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging#l6696

whereas our failing case uses the VEX encoding.

This may be in the works in the aforementioned series, but is 
legitimately unsupported in 4.13 staging.
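
For illustration, a small standalone sketch (not Xen code) that decodes
the leading bytes from the log above and shows why this is the VEX and
not the EVEX form:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: 0xc5 is the 2-byte VEX prefix (0xc4 = 3-byte VEX,
 * 0x62 = EVEX), and opcode 0xf5 in map 0F is (V)PMADDWD.
 */
int main(void)
{
    const uint8_t insn[] = { 0xc5, 0xf9, 0xf5, 0x05 }; /* from the log */

    if ( insn[0] == 0xc5 )
    {
        uint8_t b = insn[1];
        unsigned int vvvv = (~b >> 3) & 0xf; /* second source reg (bit-inverted) */
        unsigned int L = (b >> 2) & 1;       /* 0 = 128-bit, 1 = 256-bit */
        unsigned int pp = b & 3;             /* 01 = implied 0x66 prefix */

        printf("VEX.%s.%s 0F opcode %#x, vvvv=xmm%u\n",
               L ? "256" : "128", pp == 1 ? "66" : "?", insn[2], vvvv);
    }
    return 0;
}

So the trapped bytes really are the VEX form of the 0F F5 encoding.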


Thanks,
Razvan

* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 14:16                                         ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2019-05-14 14:16 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel

>>> On 14.05.19 at 15:47, <rcojocaru@bitdefender.com> wrote:
> Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5 
> 05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d
> 
> Looking at the source code, the emulator does appear to support 
> vpmaddwd, however only for EVEX:
> 
> http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x 
> 86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging
> #l6696
> 
> whereas our fail case uses VEX.
> 
> This may be in the works in the aforementioned series, but is 
> legitimately unsupported in 4.13 staging.

Hmm, interesting. The encoding dates back to MMX times, which means
it's more than just VPMADDWD that's missing; it was an omission in the
original MMX/SSE2 emulation series. That's a genuine oversight, and in
light of this I'd like to apologize for my unfriendly initial reaction.
I'll see about getting this fixed. (It would have helped if you had
shared the encoding right away, since the mnemonic and operands alone
are often insufficient.)

Jan



* Re: [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
@ 2019-05-14 14:20                                           ` Razvan Cojocaru
  0 siblings, 0 replies; 52+ messages in thread
From: Razvan Cojocaru @ 2019-05-14 14:20 UTC (permalink / raw)
  To: Jan Beulich; +Cc: aisaila, Andrew Cooper, Andrei LUTAS, xen-devel



On 5/14/19 5:16 PM, Jan Beulich wrote:
>>>> On 14.05.19 at 15:47, <rcojocaru@bitdefender.com> wrote:
>> Mem event emulation failed (5): d5v0 32bit @ 001b:6d96efff -> c5 f9 f5
>> 05 c0 be ad 6d c5 e1 fe 1d a0 20 af 6d
>>
>> Looking at the source code, the emulator does appear to support
>> vpmaddwd, however only for EVEX:
>>
>> http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/arch/x86/x86_emulate/x
>> 86_emulate.c;h=032995ea586aa7dd90a1953b6ded656436652049;hb=refs/heads/staging
>> #l6696
>>
>> whereas our fail case uses VEX.
>>
>> This may be in the works in the aforementioned series, but is
>> legitimately unsupported in 4.13 staging.
> 
> Hmm, interesting. The origin of the encoding is at MMX times,
> which means it's more than just VPMADDWD that's missing, and
> it's been an omission back in the MMX/SSE2 series then. That's
> a genuine oversight, and in the light of this I'd like to apologize
> for my unfriendly initial reaction. I'll see about getting this fixed.
> (It would have helped if you had shared the encoding right away,
> since the mnemonic and operands are now often insufficient.)

No problem at all. Indeed, sharing the encoding would have cleared 
things up faster.


Thanks,
Razvan

end of thread, other threads:[~2019-05-14 14:21 UTC | newest]

Thread overview: 52+ messages
2018-11-16 10:06 [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults Alexandru Stefan ISAILA
2018-11-16 17:04 ` Roger Pau Monné
2018-11-19 13:30   ` Alexandru Stefan ISAILA
2018-11-19 14:26     ` Jan Beulich
2018-11-19 15:08     ` Roger Pau Monné
2018-11-19 15:56       ` Alexandru Stefan ISAILA
2018-11-21  9:56         ` Roger Pau Monné
2018-11-21 10:28           ` Alexandru Stefan ISAILA
2018-11-21 11:41             ` Roger Pau Monné
2018-11-21 12:00               ` Alexandru Stefan ISAILA
2018-11-19 13:33   ` Jan Beulich
2018-11-21 18:55   ` Razvan Cojocaru
2018-11-22  9:50     ` Alexandru Stefan ISAILA
2018-11-22 10:00       ` Jan Beulich
2018-11-22 10:07       ` Roger Pau Monné
2018-11-22 10:05     ` Roger Pau Monné
2018-11-22 10:14       ` Razvan Cojocaru
2018-11-22 10:58         ` Roger Pau Monné
2018-11-22 12:48           ` Razvan Cojocaru
2018-11-22 14:49             ` Roger Pau Monné
2018-11-22 15:25               ` Razvan Cojocaru
2018-11-22 15:37                 ` Roger Pau Monné
2018-11-22 16:52                   ` Razvan Cojocaru
2018-11-22 17:08                     ` Roger Pau Monné
2018-11-22 18:24                       ` Razvan Cojocaru
2018-11-23  8:54                         ` Roger Pau Monné
     [not found]                           ` <59739FBC020000C234861ACF@prv1-mh.provo.novell.com>
     [not found]                             ` <F553A58C020000AB0063616D@prv1-mh.provo.novell.com>
     [not found]                               ` <4D445A680200003E34861ACF@prv1-mh.provo.novell.com>
     [not found]                                 ` <DAD49D5A020000780063616D@prv1-mh.provo.novell.com>
     [not found]                                   ` <5400A6CB0200003634861ACF@prv1-mh.provo.novell.com>
     [not found]                                     ` <203C1A92020000400063616D@prv1-mh.provo.novell.com>
     [not found]                                       ` <0DF3BC62020000E934861ACF@prv1-mh.provo.novell.com>
     [not found]                                         ` <C6A2E442020000640063616D@prv1-mh.provo.novell.com>
     [not found]                                           ` <6EEA58AB020000EA34861ACF@prv1-mh.provo.novell.com>
2018-11-27 10:31                           ` Razvan Cojocaru
2018-11-27 11:32                             ` Roger Pau Monné
2018-11-27 11:45                               ` Razvan Cojocaru
2018-11-27 11:59                                 ` Andrew Cooper
2018-11-27 12:12                                   ` Razvan Cojocaru
2018-12-19 16:49                               ` Alexandru Stefan ISAILA
2018-12-19 17:40                                 ` Roger Pau Monné
2018-12-20 14:37                                   ` Alexandru Stefan ISAILA
     [not found]                         ` <838191050200006B34861ACF@prv1-mh.provo.novell.com>
2018-11-23  9:07                           ` Jan Beulich
2018-11-27 10:49                             ` Razvan Cojocaru
2018-11-27 11:28                               ` Jan Beulich
2018-11-27 11:44                                 ` Razvan Cojocaru
2019-05-13 13:58                               ` Razvan Cojocaru
2019-05-13 13:58                                 ` [Xen-devel] " Razvan Cojocaru
2019-05-13 14:06                                 ` Jan Beulich
2019-05-13 14:06                                   ` [Xen-devel] " Jan Beulich
2019-05-13 14:15                                   ` Razvan Cojocaru
2019-05-13 14:15                                     ` [Xen-devel] " Razvan Cojocaru
2019-05-14 13:47                                     ` Razvan Cojocaru
2019-05-14 13:47                                       ` [Xen-devel] " Razvan Cojocaru
2019-05-14 14:16                                       ` Jan Beulich
2019-05-14 14:16                                         ` [Xen-devel] " Jan Beulich
2019-05-14 14:20                                         ` Razvan Cojocaru
2019-05-14 14:20                                           ` [Xen-devel] " Razvan Cojocaru
     [not found]                           ` <A31948D30200007D0063616D@prv1-mh.provo.novell.com>
2018-11-23  9:10                             ` Jan Beulich
     [not found]                             ` <9B05ED9E020000C434861ACF@prv1-mh.provo.novell.com>
     [not found]                               ` <626A217B020000C50063616D@prv1-mh.provo.novell.com>
     [not found]                                 ` <0D3C56BA0200004834861ACF@prv1-mh.provo.novell.com>
2018-12-20  9:07                                   ` Jan Beulich
