All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC V3 1/5] xen: Emulate with no writes
@ 2014-07-23 12:34 Razvan Cojocaru
  2014-07-23 12:34 ` [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
                   ` (5 more replies)
  0 siblings, 6 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 12:34 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, Ian.Campbell, Razvan Cojocaru, stefano.stabellini,
	Ian.Jackson, eddie.dong, JBeulich, andres, jun.nakajima,
	andrew.cooper3

Added support for emulating an instruction with no memory writes.
Additionally, introduced hvm_emulate_one_full(), which inspects
possible return values from the hvm_emulate_one() functions
(EXCEPTION, UNHANDLEABLE) and acts on them.

Changes since V1:
 - Removed the Linux code that computes the length of an instruction.
 - Unused function parameters are no longer marked.
 - Refactored the code to eliminate redundancy.
 - Made the exception passed on to the guest by hvm_emulate_one_full()
   configurable.

Changes since V2:
 - Renamed hvmemul_write_dummy() to hvmemul_write_discard().
 - Fixed a comment (Xen coding style rules).
 - Renamed hvm_emulate_one_with_ops() to _hvm_emulate_one().
 - Changed stack variable hvm_emulate_ops_no_write into a static const.
 - Modified struct hvm_emulate_ctxt ctx initialization syntax.
 - Renamed unhandleable_trapnr to trapnr and unhandleable_errcode
   to errcode.
 - Changed errcode's type to unsigned int.
 - Removed hvm_emulate_one() loop that went on until it returned
   something other than X86EMUL_RETRY (to prevent potential
   blocking against the time calibration rendezvous).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/emulate.c        |   91 +++++++++++++++++++++++++++++++++++--
 xen/include/asm-x86/hvm/emulate.h |    5 ++
 2 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index eac159f..f4be6cd 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -688,6 +688,17 @@ static int hvmemul_write(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_write_discard(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    /* Discarding the write. */
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1138,8 +1149,33 @@ static const struct x86_emulate_ops hvm_emulate_ops = {
     .invlpg        = hvmemul_invlpg
 };
 
-int hvm_emulate_one(
-    struct hvm_emulate_ctxt *hvmemul_ctxt)
+static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
+    .read          = hvmemul_read,
+    .insn_fetch    = hvmemul_insn_fetch,
+    .write         = hvmemul_write_discard,
+    .cmpxchg       = hvmemul_cmpxchg,
+    .rep_ins       = hvmemul_rep_ins,
+    .rep_outs      = hvmemul_rep_outs,
+    .rep_movs      = hvmemul_rep_movs,
+    .read_segment  = hvmemul_read_segment,
+    .write_segment = hvmemul_write_segment,
+    .read_io       = hvmemul_read_io,
+    .write_io      = hvmemul_write_io,
+    .read_cr       = hvmemul_read_cr,
+    .write_cr      = hvmemul_write_cr,
+    .read_msr      = hvmemul_read_msr,
+    .write_msr     = hvmemul_write_msr,
+    .wbinvd        = hvmemul_wbinvd,
+    .cpuid         = hvmemul_cpuid,
+    .inject_hw_exception = hvmemul_inject_hw_exception,
+    .inject_sw_interrupt = hvmemul_inject_sw_interrupt,
+    .get_fpu       = hvmemul_get_fpu,
+    .put_fpu       = hvmemul_put_fpu,
+    .invlpg        = hvmemul_invlpg
+};
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
 {
     struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
@@ -1191,7 +1227,7 @@ int hvm_emulate_one(
     vio->mmio_retrying = vio->mmio_retry;
     vio->mmio_retry = 0;
 
-    rc = x86_emulate(&hvmemul_ctxt->ctxt, &hvm_emulate_ops);
+    rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
 
     if ( rc == X86EMUL_OKAY && vio->mmio_retry )
         rc = X86EMUL_RETRY;
@@ -1239,6 +1275,55 @@ int hvm_emulate_one(
     return X86EMUL_OKAY;
 }
 
+int hvm_emulate_one(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops);
+}
+
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops_no_write);
+}
+
+void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
+    unsigned int errcode)
+{
+    struct hvm_emulate_ctxt ctx = {{ 0 }};
+    int rc;
+
+    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
+
+    if ( nowrite )
+        rc = hvm_emulate_one_no_write(&ctx);
+    else
+        rc = hvm_emulate_one(&ctx);
+
+    switch ( rc )
+    {
+    case X86EMUL_UNHANDLEABLE:
+        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
+               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
+               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
+               ctx.insn_buf_eip,
+               ctx.insn_buf[0], ctx.insn_buf[1],
+               ctx.insn_buf[2], ctx.insn_buf[3],
+               ctx.insn_buf[4], ctx.insn_buf[5],
+               ctx.insn_buf[6], ctx.insn_buf[7],
+               ctx.insn_buf[8], ctx.insn_buf[9]);
+        hvm_inject_hw_exception(trapnr, errcode);
+        break;
+    case X86EMUL_EXCEPTION:
+        if ( ctx.exn_pending )
+            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
+        /* fall through */
+    default:
+        hvm_emulate_writeback(&ctx);
+        break;
+    }
+}
+
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs)
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 00a06cc..a74f310 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -37,6 +37,11 @@ struct hvm_emulate_ctxt {
 
 int hvm_emulate_one(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt);
+void hvm_emulate_one_full(bool_t nowrite,
+    unsigned int trapnr,
+    unsigned int errcode);
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
@ 2014-07-23 12:34 ` Razvan Cojocaru
  2014-07-24 15:07   ` Jan Beulich
  2014-07-23 12:34 ` [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 12:34 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, Ian.Campbell, Razvan Cojocaru, stefano.stabellini,
	Ian.Jackson, eddie.dong, JBeulich, andres, jun.nakajima,
	andrew.cooper3

Speed optimization for introspection purposes: a handful of registers
are sent along with each mem_event. This requires enlargement of the
mem_event_request / mem_event_response structures, and additional code
to fill in relevant values. Since the EPT event processing code needs
more data than CR3 or MSR event processors, hvm_mem_event_fill_regs()
fills in less data than p2m_mem_event_fill_regs(), in order to avoid
overhead. Struct hvm_hw_cpu has been considered instead of the custom
struct mem_event_regs_st, but its size would cause quick filling up
of the mem_event ring buffer.

Changes since V1:
 - Replaced guest_x86_mode with cs_arbytes in the mem_event_regs_st
   structure.
 - Removed superfluous preprocessor check for __x86_64__ in
   p2m_mem_event_fill_regs().

Changes since V2:
 - Removed inline function compiler suggestions.
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Padded struct mem_event_regs_st.
 - Renamed mem_event_regs_t to mem_event_regs.
 - Readability changes.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/hvm.c         |   33 +++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c          |   57 ++++++++++++++++++++++++++++++++++++++++
 xen/include/public/mem_event.h |   39 +++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index ef2411c..a84f6cd 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -6085,6 +6085,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
     return rc;
 }
 
+static void hvm_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct vcpu *v = current;
+
+    req->regs.rax = regs->eax;
+    req->regs.rcx = regs->ecx;
+    req->regs.rdx = regs->edx;
+    req->regs.rbx = regs->ebx;
+    req->regs.rsp = regs->esp;
+    req->regs.rbp = regs->ebp;
+    req->regs.rsi = regs->esi;
+    req->regs.rdi = regs->edi;
+
+    req->regs.r8  = regs->r8;
+    req->regs.r9  = regs->r9;
+    req->regs.r10 = regs->r10;
+    req->regs.r11 = regs->r11;
+    req->regs.r12 = regs->r12;
+    req->regs.r13 = regs->r13;
+    req->regs.r14 = regs->r14;
+    req->regs.r15 = regs->r15;
+
+    req->regs.rflags = regs->eflags;
+    req->regs.rip    = regs->eip;
+
+    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
+    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
+    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
+}
+
 static int hvm_memory_event_traps(long p, uint32_t reason,
                                   unsigned long value, unsigned long old, 
                                   bool_t gla_valid, unsigned long gla) 
@@ -6129,6 +6161,7 @@ static int hvm_memory_event_traps(long p, uint32_t reason,
         req.gla = old;
     }
     
+    hvm_mem_event_fill_regs(&req);
     mem_event_put_request(d, &d->mem_event->access, &req);
     
     return 1;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 642ec28..6b52ab8 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1314,6 +1314,61 @@ void p2m_mem_paging_resume(struct domain *d)
     }
 }
 
+static void p2m_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register seg;
+    struct hvm_hw_cpu ctxt;
+    struct vcpu *v = current;
+
+    /* Architecture-specific vmcs/vmcb bits */
+    hvm_funcs.save_cpu_ctxt(v, &ctxt);
+
+    req->regs.rax = regs->eax;
+    req->regs.rcx = regs->ecx;
+    req->regs.rdx = regs->edx;
+    req->regs.rbx = regs->ebx;
+    req->regs.rsp = regs->esp;
+    req->regs.rbp = regs->ebp;
+    req->regs.rsi = regs->esi;
+    req->regs.rdi = regs->edi;
+
+    req->regs.r8  = regs->r8;
+    req->regs.r9  = regs->r9;
+    req->regs.r10 = regs->r10;
+    req->regs.r11 = regs->r11;
+    req->regs.r12 = regs->r12;
+    req->regs.r13 = regs->r13;
+    req->regs.r14 = regs->r14;
+    req->regs.r15 = regs->r15;
+
+    req->regs.rflags = regs->eflags;
+    req->regs.rip    = regs->eip;
+
+    req->regs.dr7 = v->arch.debugreg[7];
+    req->regs.cr0 = ctxt.cr0;
+    req->regs.cr2 = ctxt.cr2;
+    req->regs.cr3 = ctxt.cr3;
+    req->regs.cr4 = ctxt.cr4;
+
+    req->regs.sysenter_cs = ctxt.sysenter_cs;
+    req->regs.sysenter_esp = ctxt.sysenter_esp;
+    req->regs.sysenter_eip = ctxt.sysenter_eip;
+
+    req->regs.msr_efer = ctxt.msr_efer;
+    req->regs.msr_star = ctxt.msr_star;
+    req->regs.msr_lstar = ctxt.msr_lstar;
+
+    hvm_get_segment_register(v, x86_seg_fs, &seg);
+    req->regs.fs_base = seg.base;
+
+    hvm_get_segment_register(v, x86_seg_gs, &seg);
+    req->regs.gs_base = seg.base;
+
+    hvm_get_segment_register(v, x86_seg_cs, &seg);
+    req->regs.cs_arbytes = seg.attr.bytes;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1401,6 +1456,8 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
         req->access_x = access_x;
     
         req->vcpu_id = v->vcpu_id;
+
+        p2m_mem_event_fill_regs(req);
     }
 
     /* Pause the current VCPU */
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 3831b41..fbf2f07 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -48,6 +48,44 @@
 #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
                                              does NOT honour HVMPME_onchangeonly */
 
+/* Using a custom struct (not hvm_hw_cpu) so as to not fill
+ * the mem_event ring buffer too quickly. */
+typedef struct mem_event_regs_st {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+    uint64_t rflags;
+    uint64_t dr7;
+    uint64_t rip;
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+    uint64_t msr_efer;
+    uint64_t msr_star;
+    uint64_t msr_lstar;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint32_t cs_arbytes;
+    uint32_t _pad;
+} mem_event_regs;
+
 typedef struct mem_event_st {
     uint32_t flags;
     uint32_t vcpu_id;
@@ -65,6 +103,7 @@ typedef struct mem_event_st {
     uint16_t available:12;
 
     uint16_t reason;
+    mem_event_regs regs;
 } mem_event_request_t, mem_event_response_t;
 
 DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-07-23 12:34 ` [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-07-23 12:34 ` Razvan Cojocaru
  2014-07-24 15:14   ` Jan Beulich
  2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 12:34 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, Ian.Campbell, Razvan Cojocaru, stefano.stabellini,
	Ian.Jackson, eddie.dong, JBeulich, andres, jun.nakajima,
	andrew.cooper3

Vmx_disable_intercept_for_msr() will now refuse to disable interception of
MSRs needed for memory introspection. It is not possible to gate this on
mem_access being active for the domain, since by the time mem_access does
become active the interception for the interesting MSRs has already been
disabled (vmx_disable_intercept_for_msr() runs very early on).

Changes since V1:
 - Replaced printk() with gdprintk(XENLOG_DEBUG, ...).

Changes since V2:
 - Split a log line differently to keep it grepable.
 - Interception for relevant MSRs will be disabled only if
   mem_access is not enabled.
 - Since they end up being disabled early on (when mem_access
   is not enabled yet), re-enable interception when mem_access
   becomes active.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/vmx/vmcs.c |   24 ++++++++++++++++++++++++
 xen/arch/x86/mm/mem_event.c |   17 +++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8ffc562..2de6f5a 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -39,6 +39,7 @@
 #include <xen/keyhandler.h>
 #include <asm/shadow.h>
 #include <asm/tboot.h>
+#include <asm/mem_event.h>
 
 static bool_t __read_mostly opt_vpid_enabled = 1;
 boolean_param("vpid", opt_vpid_enabled);
@@ -695,11 +696,34 @@ static void vmx_set_host_env(struct vcpu *v)
 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
 {
     unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
+    struct domain *d = v->domain;
 
     /* VMX MSR bitmap supported? */
     if ( msr_bitmap == NULL )
         return;
 
+    if ( mem_event_check_ring(&d->mem_event->access) )
+    {
+        /* Filter out MSR-s needed for memory introspection */
+        switch ( msr )
+        {
+        case MSR_IA32_SYSENTER_EIP:
+        case MSR_IA32_SYSENTER_ESP:
+        case MSR_IA32_SYSENTER_CS:
+        case MSR_IA32_MC0_CTL:
+        case MSR_STAR:
+        case MSR_LSTAR:
+
+            gdprintk(XENLOG_DEBUG, "MSR 0x%08x "
+                "needed for memory introspection, still intercepted\n",
+                msr);
+            return;
+
+        default:
+            break;
+        }
+    }
+
     /*
      * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
      * have the write-low and read-high bitmap offsets the wrong way round.
diff --git a/xen/arch/x86/mm/mem_event.c b/xen/arch/x86/mm/mem_event.c
index 40ae841..050a1fa 100644
--- a/xen/arch/x86/mm/mem_event.c
+++ b/xen/arch/x86/mm/mem_event.c
@@ -30,6 +30,7 @@
 #include <asm/mem_access.h>
 #include <asm/mem_sharing.h>
 #include <xsm/xsm.h>
+#include <asm/hvm/vmx/vmcs.h>
 
 /* for public/io/ring.h macros */
 #define xen_mb()   mb()
@@ -600,6 +601,22 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
             rc = mem_event_enable(d, mec, med, _VPF_mem_access, 
                                     HVM_PARAM_ACCESS_RING_PFN,
                                     mem_access_notification);
+            if ( rc == 0 )
+            {
+                struct vcpu *v;
+
+                /* Enable interception for MSRs needed for memory introspection. */
+                for_each_vcpu ( d, v )
+                {
+                    /* Safe, because of previous if ( !cpu_has_vmx ) check. */
+                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, MSR_TYPE_W);
+                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, MSR_TYPE_W);
+                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, MSR_TYPE_W);
+                    vmx_enable_intercept_for_msr(v, MSR_IA32_MC0_CTL, MSR_TYPE_W);
+                    vmx_enable_intercept_for_msr(v, MSR_STAR, MSR_TYPE_W);
+                    vmx_enable_intercept_for_msr(v, MSR_LSTAR, MSR_TYPE_W);
+                }
+            }
         }
         break;
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-07-23 12:34 ` [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
  2014-07-23 12:34 ` [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
@ 2014-07-23 12:34 ` Razvan Cojocaru
  2014-07-23 13:40   ` Tamas Lengyel
                     ` (2 more replies)
  2014-07-23 12:34 ` [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 12:34 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, Ian.Campbell, Razvan Cojocaru, stefano.stabellini,
	Ian.Jackson, eddie.dong, JBeulich, andres, jun.nakajima,
	andrew.cooper3

Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
new xc_domain_set_pagefault_info() function to set per-domain page
fault injection information. All a call does is set per-domain info,
and nothing actually happens until VMENTRY time, and then only if
all conditions are met (the guest is in user mode, the set value
matches CR3, and there are no other pending traps).
This mechanism allows bringing in swapped-out pages for inspection.

Changes since V2:
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Now checking SS.DPL instead of CS.DPL to see if the guest is in
   user mode.
 - Removed superfluous fault_info members initialization.
 - Now returning -EINVAL if !has_hvm_container_domain(d) on
   XEN_DOMCTL_set_pagefault_info.
 - Moved struct fault_info from struct domain to struct hvm_domain.
 - Collapsed explicit initialization of the fault_info members into a
   one-line assignment on the tools/libxc side.
 - Now using write_access as a proper bool instead of shifting and
   OR-ing it with PFEC_user_mode.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 tools/libxc/xc_domain.c          |   16 ++++++++++++++++
 tools/libxc/xenctrl.h            |    4 ++++
 xen/arch/x86/hvm/vmx/vmx.c       |   37 +++++++++++++++++++++++++++++++++++++
 xen/common/domctl.c              |   26 ++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/domain.h |    7 +++++++
 xen/include/public/domctl.h      |   14 ++++++++++++++
 6 files changed, 104 insertions(+)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 0230c6c..49f0c61 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -506,6 +506,22 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
     return ret;
 }
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info)
+{
+    DECLARE_DOMCTL;
+
+    if (info == NULL)
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
+    domctl.domain = (domid_t)domid;
+    domctl.u.set_pagefault_info = *info;
+
+    return do_domctl(xch, &domctl);
+}
+
 int xc_vcpu_getcontext(xc_interface *xch,
                        uint32_t domid,
                        uint32_t vcpu,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 5beb846..a913f10 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -803,6 +803,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
 const char *xc_domain_get_native_protocol(xc_interface *xch,
                                           uint32_t domid);
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info);
+
 /**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2caa04a..5ea3188 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3097,6 +3097,41 @@ out:
         nvmx_idtv_handling();
 }
 
+static void check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct hvm_hw_cpu ctxt;
+    struct segment_register seg;
+    int errcode = PFEC_user_mode;
+
+    if ( !is_hvm_domain(d)
+         || d->arch.hvm_domain.fault_info.virtual_address == 0 )
+        return;
+
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+    hvm_get_segment_register(curr, x86_seg_ss, &seg);
+
+    if ( seg.attr.fields.dpl == 3 /* Guest is in user mode */
+         && !ctxt.pending_event
+         && ctxt.cr3 == d->arch.hvm_domain.fault_info.address_space )
+    {
+        /* Cache */
+        uint64_t virtual_address = d->arch.hvm_domain.fault_info.virtual_address;
+        uint32_t write_access = d->arch.hvm_domain.fault_info.write_access;
+
+        /* Reset */
+        d->arch.hvm_domain.fault_info.address_space = 0;
+        d->arch.hvm_domain.fault_info.virtual_address = 0;
+        d->arch.hvm_domain.fault_info.write_access = 0;
+
+        if ( write_access )
+            errcode |= PFEC_write_access;
+
+        hvm_inject_page_fault(errcode, virtual_address);
+    }
+}
+
 void vmx_vmenter_helper(const struct cpu_user_regs *regs)
 {
     struct vcpu *curr = current;
@@ -3137,6 +3172,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(need_flush) )
         vpid_sync_all();
 
+    check_pf_injection();
+
  out:
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index c326aba..0d67601 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -967,6 +967,32 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_set_pagefault_info:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domain);
+        if ( d != NULL )
+        {
+            ret = -EINVAL;
+
+            if ( has_hvm_container_domain(d) )
+            {
+                d->arch.hvm_domain.fault_info.address_space =
+                    op->u.set_pagefault_info.address_space;
+                d->arch.hvm_domain.fault_info.virtual_address =
+                    op->u.set_pagefault_info.virtual_address;
+                d->arch.hvm_domain.fault_info.write_access =
+                    op->u.set_pagefault_info.write_access;
+                ret = 0;
+            }
+
+            rcu_unlock_domain(d);
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 291a2e0..f9bd098 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -141,6 +141,13 @@ struct hvm_domain {
      */
     uint64_t sync_tsc;
 
+    /* Memory introspection page fault injection data. */
+    struct {
+        uint64_t address_space;
+        uint64_t virtual_address;
+        uint32_t write_access;
+    } fault_info;
+
     union {
         struct vmx_domain vmx;
         struct svm_domain svm;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..c8bf3f8 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
 #endif
 
+/* XEN_DOMCTL_set_pagefault_info requests that a page fault occur at
+ * the next VMENTRY.
+ *  */
+struct xen_domctl_set_pagefault_info {
+    uint64_t address_space;
+    uint64_t virtual_address;
+    uint32_t write_access;
+};
+typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t);
+
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -1012,6 +1024,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
 #define XEN_DOMCTL_gdbsx_domstatus             1003
+#define XEN_DOMCTL_set_pagefault_info          1004
     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
     domid_t  domain;
     union {
@@ -1068,6 +1081,7 @@ struct xen_domctl {
         struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_set_pagefault_info set_pagefault_info;
         uint8_t                             pad[128];
     } u;
 };
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (2 preceding siblings ...)
  2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-07-23 12:34 ` Razvan Cojocaru
  2014-07-24 15:42   ` Jan Beulich
  2014-07-24 11:20 ` [PATCH RFC V3 1/5] xen: Emulate with no writes Tim Deegan
  2014-07-24 12:32 ` Ian Jackson
  5 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 12:34 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, Ian.Campbell, Razvan Cojocaru, stefano.stabellini,
	Ian.Jackson, eddie.dong, JBeulich, andres, jun.nakajima,
	andrew.cooper3

In a scenario where a page fault that triggered a mem_event occurred,
p2m_mem_access_check() will now be able to either 1) emulate the
current instruction, or 2) emulate it, but not allow it to perform
any writes.

Changes since V1:
 - Removed the 'skip' code which required computing the current
   instruction length.
 - Removed the set_ad_bits() code that attempted to modify the
   'accessed' and 'dirty' bits for instructions that the emulator
   can't handle at the moment.

Changes since V2:
 - Moved the __vmread(EXIT_QUALIFICATION, &exit_qualification); code
   in vmx.c, accessible via hvm_funcs.
 - Incorporated changes by Andrew Cooper ("[PATCH 1/2] Xen/mem_event:
   Validate the response vcpu_id before acting on it.").

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/domain.c          |    5 ++
 xen/arch/x86/hvm/vmx/vmx.c     |   13 +++++
 xen/arch/x86/mm/mem_sharing.c  |   11 +++-
 xen/arch/x86/mm/p2m.c          |  108 +++++++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/domain.h   |    9 ++++
 xen/include/asm-x86/hvm/hvm.h  |    2 +
 xen/include/public/mem_event.h |   12 +++--
 7 files changed, 152 insertions(+), 8 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..5cd283b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -407,6 +407,11 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
+    /* By default, do not emulate */
+    v->arch.mem_event.emulate_flags = 0;
+    v->arch.mem_event.gpa = 0;
+    v->arch.mem_event.eip = 0;
+
     rc = mapcache_vcpu_init(v);
     if ( rc )
         return rc;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 5ea3188..25d5663 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1682,6 +1682,18 @@ void vmx_hypervisor_cpuid_leaf(uint32_t sub_idx,
         *eax |= XEN_HVM_CPUID_X2APIC_VIRT;
 }
 
+static bool_t vmx_exited_by_pagefault(void)
+{
+    unsigned long exit_qualification;
+
+    __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+    if ( (exit_qualification & EPT_GLA_FAULT) == 0 )
+        return 0;
+
+    return 1;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -1740,6 +1752,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .handle_eoi           = vmx_handle_eoi,
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
     .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
+    .exited_by_pagefault  = vmx_exited_by_pagefault,
 };
 
 const struct hvm_function_table * __init start_vmx(void)
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7293f31..ec99266 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -596,11 +596,20 @@ int mem_sharing_sharing_resume(struct domain *d)
     /* Get all requests off the ring */
     while ( mem_event_get_response(d, &d->mem_event->share, &rsp) )
     {
+        struct vcpu *v;
+
         if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
             continue;
+
+        /* Validate the vcpu_id in the response. */
+        if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
+            continue;
+
+        v = d->vcpu[rsp.vcpu_id];
+
         /* Unpause domain/vcpu */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
-            vcpu_unpause(d->vcpu[rsp.vcpu_id]);
+            vcpu_unpause(v);
     }
 
     return 0;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 6b52ab8..c51d27a 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1290,8 +1290,17 @@ void p2m_mem_paging_resume(struct domain *d)
     /* Pull all responses off the ring */
     while( mem_event_get_response(d, &d->mem_event->paging, &rsp) )
     {
+        struct vcpu *v;
+
         if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
             continue;
+
+        /* Validate the vcpu_id in the response. */
+        if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
+            continue;
+
+        v = d->vcpu[rsp.vcpu_id];
+
         /* Fix p2m entry if the page was not dropped */
         if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) )
         {
@@ -1310,7 +1319,7 @@ void p2m_mem_paging_resume(struct domain *d)
         }
         /* Unpause domain */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
-            vcpu_unpause(d->vcpu[rsp.vcpu_id]);
+            vcpu_unpause(v);
     }
 }
 
@@ -1382,6 +1391,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     p2m_access_t p2ma;
     mem_event_request_t *req;
     int rc;
+    unsigned long eip = guest_cpu_user_regs()->eip;
 
     /* First, handle rx2rw conversion automatically.
      * These calls to p2m->set_entry() must succeed: we have the gfn
@@ -1434,6 +1444,37 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
             return 1;
         }
     }
+    else
+    {
+        /* There's a mem_event listener */
+        if ( hvm_funcs.exited_by_pagefault && !hvm_funcs.exited_by_pagefault() ) /* don't send a mem_event */
+        {
+            if ( v->arch.mem_event.emulate_flags == 0 )
+            {
+                v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
+                v->arch.mem_event.gpa = gpa;
+                v->arch.mem_event.eip = eip;
+            }
+        }
+    }
+
+    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
+    {
+        v->arch.mem_event.emulate_flags = 0;
+        v->arch.mem_event.gpa = gpa;
+        v->arch.mem_event.eip = eip;
+    }
+
+    if ( v->arch.mem_event.emulate_flags )
+    {
+        if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE )
+            hvm_emulate_one_full(1, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+        else
+            hvm_emulate_one_full(0, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+
+        v->arch.mem_event.emulate_flags = 0;
+        return 1;
+    }
 
     *req_ptr = NULL;
     req = xzalloc(mem_event_request_t);
@@ -1475,11 +1516,74 @@ void p2m_mem_access_resume(struct domain *d)
     /* Pull all responses off the ring */
     while( mem_event_get_response(d, &d->mem_event->access, &rsp) )
     {
+        struct vcpu *v;
+
         if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
             continue;
+
+        /* Validate the vcpu_id in the response. */
+        if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
+            continue;
+
+        v = d->vcpu[rsp.vcpu_id];
+
+        /* Mark vcpu for skipping one instruction upon rescheduling */
+        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
+        {
+            xenmem_access_t access;
+            int violation = 1;
+
+            v->arch.mem_event.emulate_flags = 0;
+
+            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
+            {
+                violation = 0;
+
+                switch (access)
+                {
+                case XENMEM_access_n:
+                case XENMEM_access_n2rwx:
+                default:
+                    violation = rsp.access_r || rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_r:
+                    violation = rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_w:
+                    violation = rsp.access_r || rsp.access_x;
+                    break;
+
+                case XENMEM_access_x:
+                    violation = rsp.access_r || rsp.access_w;
+                    break;
+
+                case XENMEM_access_rx:
+                case XENMEM_access_rx2rw:
+                    violation = rsp.access_w;
+                    break;
+
+                case XENMEM_access_wx:
+                    violation = rsp.access_r;
+                    break;
+
+                case XENMEM_access_rw:
+                    violation = rsp.access_x;
+                    break;
+
+                case XENMEM_access_rwx:
+                    break;
+                }
+            }
+
+            if ( violation )
+                v->arch.mem_event.emulate_flags = rsp.flags;
+        }
+
         /* Unpause domain */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
-            vcpu_unpause(d->vcpu[rsp.vcpu_id]);
+            vcpu_unpause(v);
     }
 }
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index abf55fb..0fa4d3d 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -446,6 +446,15 @@ struct arch_vcpu
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
+
+    /* Should we emulate the next matching instruction on VCPU resume
+     * after a mem_event? */
+    struct {
+        uint32_t emulate_flags;
+        unsigned long gpa;
+        unsigned long eip;
+    } mem_event;
+
 } __cacheline_aligned;
 
 /* Shorthands to improve code legibility. */
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 0ebd478..fecd3ef 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -205,6 +205,8 @@ struct hvm_function_table {
     void (*hypervisor_cpuid_leaf)(uint32_t sub_idx,
                                   uint32_t *eax, uint32_t *ebx,
                                   uint32_t *ecx, uint32_t *edx);
+
+    bool_t (*exited_by_pagefault)(void);
 };
 
 extern struct hvm_function_table hvm_funcs;
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index fbf2f07..582b427 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -31,11 +31,13 @@
 #include "io/ring.h"
 
 /* Memory event flags */
-#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
-#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
-#define MEM_EVENT_FLAG_EVICT_FAIL   (1 << 2)
-#define MEM_EVENT_FLAG_FOREIGN      (1 << 3)
-#define MEM_EVENT_FLAG_DUMMY        (1 << 4)
+#define MEM_EVENT_FLAG_VCPU_PAUSED     (1 << 0)
+#define MEM_EVENT_FLAG_DROP_PAGE       (1 << 1)
+#define MEM_EVENT_FLAG_EVICT_FAIL      (1 << 2)
+#define MEM_EVENT_FLAG_FOREIGN         (1 << 3)
+#define MEM_EVENT_FLAG_DUMMY           (1 << 4)
+#define MEM_EVENT_FLAG_EMULATE         (1 << 5)
+#define MEM_EVENT_FLAG_EMULATE_NOWRITE (1 << 6)
 
 /* Reasons for the memory event request */
 #define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-07-23 13:40   ` Tamas Lengyel
  2014-07-23 14:08     ` Razvan Cojocaru
  2014-07-24 12:33   ` Ian Jackson
  2014-07-24 15:27   ` Jan Beulich
  2 siblings, 1 reply; 31+ messages in thread
From: Tamas Lengyel @ 2014-07-23 13:40 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, jun.nakajima,
	Ian Jackson, eddie.dong, xen-devel, Andres Lagar-Cavilla,
	Jan Beulich, Andrew Cooper


[-- Attachment #1.1: Type: text/plain, Size: 1363 bytes --]

> @@ -3137,6 +3172,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs
> *regs)
>      if ( unlikely(need_flush) )
>          vpid_sync_all();
>
> +    check_pf_injection();
>

The function check_pf_injection isn't just 'checking', it does issue the
page fault injection as well under the right circumstances, and as such the
naming of the function is a bit misleading. Breaking it into two functions
might be an improvement where the actual check itself could be wrapped into
an unlikely(). But isn't this a bit of an overkill at every VMENTER for
every domain? Surely there are less invasive mechanisms to trigger a VMEXIT
when you know your VM will be in a state where you can inject your page
fault, without incurring an overhead for every domain.

Another question is, how do you know when the page fault had been handled
and the page can be inspected? You would need some other trigger just at
the right point in the execution of the VM or the page could be swapped out
again before you had a chance to inspect it.

Also, it might make sense to perform some sanity checks on the vaddr and
address space before injection (ie. is the page really swapped out). There
is no guarantee that the page is still swapped out, even if you checked
before issuing xc_domain_set_pagefault_info, unless the domain had been
paused for both checking and setting.

Tamas

[-- Attachment #1.2: Type: text/html, Size: 1724 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 13:40   ` Tamas Lengyel
@ 2014-07-23 14:08     ` Razvan Cojocaru
  2014-07-23 14:27       ` Tamas Lengyel
  0 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 14:08 UTC (permalink / raw)
  To: Tamas Lengyel
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, Ian Jackson,
	eddie.dong, xen-devel, Jan Beulich, Andres Lagar-Cavilla,
	jun.nakajima, Andrew Cooper

On 07/23/2014 04:40 PM, Tamas Lengyel wrote:
> 
>     @@ -3137,6 +3172,8 @@ void vmx_vmenter_helper(const struct
>     cpu_user_regs *regs)
>          if ( unlikely(need_flush) )
>              vpid_sync_all();
> 
>     +    check_pf_injection();
> 
> 
> The function check_pf_injection isn't just 'checking', it does issue the
> page fault injection as well under the right circumstances, and as such
> the naming of the function is a bit misleading. Breaking it into two
> functions might be an improvement where the actual check itself could be

That's a good point. I'll split it into two functions (and add a vmx_
prefix to their names while I'm at it).

> wrapped into an unlikely(). But isn't this a bit of an overkill at every
> VMENTER for every domain? Surely there are less invasive mechanisms to
> trigger a VMEXIT when you know your VM will be in a state where you can
> inject your page fault, without incurring an overhead for every domain.

It's not much of an overhead, basically if
d->arch.hvm_domain.fault_info.virtual_address == 0 (which is almost
always the case), nothing happens.

> Another question is, how do you know when the page fault had been
> handled and the page can be inspected? You would need some other trigger
> just at the right point in the execution of the VM or the page could be
> swapped out again before you had a chance to inspect it.

In our case, this always happens while the VCPU is paused waiting for a
mem_event reply, so it fits quite well.

> Also, it might make sense to perform some sanity checks on the vaddr and
> address space before injection (ie. is the page really swapped out).
> There is no guarantee that the page is still swapped out, even if you
> checked before issuing xc_domain_set_pagefault_info, unless the domain
> had been paused for both checking and setting.

As said above, the particular VCPU is in our case paused and waiting for
a mem_event reply. The assumption is that other clients will work under
similar circumstances, however it's always a good idea to check
everything that can be checked.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 14:08     ` Razvan Cojocaru
@ 2014-07-23 14:27       ` Tamas Lengyel
  2014-07-23 15:13         ` Razvan Cojocaru
  0 siblings, 1 reply; 31+ messages in thread
From: Tamas Lengyel @ 2014-07-23 14:27 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, Ian Jackson,
	eddie.dong, xen-devel, Jan Beulich, Andres Lagar-Cavilla,
	jun.nakajima, Andrew Cooper


[-- Attachment #1.1: Type: text/plain, Size: 1821 bytes --]

> > wrapped into an unlikely(). But isn't this a bit of an overkill at every
> > VMENTER for every domain? Surely there are less invasive mechanisms to
> > trigger a VMEXIT when you know your VM will be in a state where you can
> > inject your page fault, without incurring an overhead for every domain.
>
> It's not much of an overhead, basically if
> d->arch.hvm_domain.fault_info.virtual_address == 0 (which is almost
> always the case), nothing happens.
>

Since the majority of the domains will never use it, even a tiny overhead
adds up, especially over time. It would be a lot cleaner to trap the
execution of the VM at the moment when it is safe to inject the page fault
instead. For example you could just mark the process' code pages
non-executable in the EPT, catch the violations, and if the conditions are
met inject your pagefault.


> > Also, it might make sense to perform some sanity checks on the vaddr and
> > address space before injection (ie. is the page really swapped out).
> > There is no guarantee that the page is still swapped out, even if you
> > checked before issuing xc_domain_set_pagefault_info, unless the domain
> > had been paused for both checking and setting.
>
> As said above, the particular VCPU is in our case paused and waiting for
> a mem_event reply. The assumption is that other clients will work under
> similar circumstances, however it's always a good idea to check
> everything that can be checked.
>

I don't think having just the VCPU paused is enough, another still active
VCPU might still swap the page back, so you would really need to have the
entire VM paused for this to be safe. Furthermore, if there are any
limitations/assumptions like this about the intended use of the function,
describing them in a comment in xenctrl.h would be appropriate.

Cheers!
Tamas

[-- Attachment #1.2: Type: text/html, Size: 2384 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 14:27       ` Tamas Lengyel
@ 2014-07-23 15:13         ` Razvan Cojocaru
  2014-07-23 20:17           ` Andrei LUTAS
  0 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-23 15:13 UTC (permalink / raw)
  To: Tamas Lengyel
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, jun.nakajima,
	Ian Jackson, eddie.dong, xen-devel, Andres Lagar-Cavilla,
	Jan Beulich, Andrew Cooper

On 07/23/2014 05:27 PM, Tamas Lengyel wrote:
>     > wrapped into an unlikely(). But isn't this a bit of an overkill at
>     every
>     > VMENTER for every domain? Surely there are less invasive mechanisms to
>     > trigger a VMEXIT when you know your VM will be in a state where
>     you can
>     > inject your page fault, without incurring an overhead for every
>     domain.
> 
>     It's not much of an overhead, basically if
>     d->arch.hvm_domain.fault_info.virtual_address == 0 (which is almost
>     always the case), nothing happens.
> 
> 
> Since the majority of the domains will never use it, even a tiny
> overhead adds up, especially over time. It would be a lot cleaner to
> trap the execution of the VM at the moment when it is safe to inject the
> page fault instead. For example you could just mark the process' code
> pages non-executable in the EPT, catch the violations, and if the
> conditions are met inject your pagefault.

Unfortunately it's more complicated than that. Our application wants to
be able to inject page faults at process initialization time, for PEB,
etc., where we're not necessarily talking about a process executing code.

>     > Also, it might make sense to perform some sanity checks on the
>     vaddr and
>     > address space before injection (ie. is the page really swapped out).
>     > There is no guarantee that the page is still swapped out, even if you
>     > checked before issuing xc_domain_set_pagefault_info, unless the domain
>     > had been paused for both checking and setting.
> 
>     As said above, the particular VCPU is in our case paused and waiting for
>     a mem_event reply. The assumption is that other clients will work under
>     similar circumstances, however it's always a good idea to check
>     everything that can be checked.
> 
> 
> I don't think having just the VCPU paused is enough, another still
> active VCPU might still swap the page back, so you would really need to
> have the entire VM paused for this to be safe. Furthermore, if there are
> any limitations/assumptions like this about the intended use of the
> function, describing them in a comment in xenctrl.h would be appropriate.

A typical use case for this is:

1. the application figures out that it needs a swapped out page;
2. the application tries to bring it in (via the code in this patch);
3. the application maps the page.

Now, if the application fails step 3, it might go back to step 2 until
it succeeds, or it might give up after some retries. If, however, it
succeeds, the mapped page should be safe to use until unmapped via libxc.

I would add that in practice this problem never occurred with any of the
HVM Windows guests we've used for testing. The OS doesn't seem likely to
immediately swap out a page that's just been brought in.

If this is a concern, maybe I could simply add a comment in xenctrl.h
that would say that the new function is only to be used while the domain
is paused. I seem to recall having seen such comments there for other
libxc functions.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 15:13         ` Razvan Cojocaru
@ 2014-07-23 20:17           ` Andrei LUTAS
  2014-07-24 12:38             ` Ian Jackson
  0 siblings, 1 reply; 31+ messages in thread
From: Andrei LUTAS @ 2014-07-23 20:17 UTC (permalink / raw)
  To: Razvan Cojocaru, Tamas Lengyel
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, Ian Jackson,
	eddie.dong, xen-devel, Jan Beulich, Andres Lagar-Cavilla,
	jun.nakajima, Andrew Cooper

Hello, everybody!

The logic with the #PF injection goes like this:
- The memory introspection engine wants to inspect (for whatever reason) a
   virtual page X inside virtual address space Y (inside user-mode, although
   with proper knowledge about the guest kernel, one can also inject #PF
   inside kernel-mode on Windows - proper IRQL, working set trim inactive,
   etc.)
- The memory introspection engine will write-protect (via EPT) the Page
   Table Entries responsible for the translation of X to the Guest Physical
   Address Z
- The memory introspection engine requests the #PF injection
- The memory introspection engine patiently waits until the PTE
   corresponding to the virtual page X is written - this means that the page
   has been swapped in, either due to our synthetic #PF, or because the
   guest OS accessed it
- We now need to analyse the faulting instruction in order to extract the
   written value, which contains the address of the guest-physical page
   containing the data we want to analyse - W - what X will translate to
   after the swap-in operation will be complete; we basically need to map W,
   without doing a new translation for X. No pause of the entire domain is
   normally needed for this.

Hopefully this covers in more detail the actual use-case of the #PF
injection and how it interacts with the introspection engine.

Thanks!
Andrei

On 7/23/2014 6:13 PM, Razvan Cojocaru wrote:
> On 07/23/2014 05:27 PM, Tamas Lengyel wrote:
>>      > wrapped into an unlikely(). But isn't this a bit of an overkill at
>>      every
>>      > VMENTER for every domain? Surely there are less invasive mechanisms to
>>      > trigger a VMEXIT when you know your VM will be in a state where
>>      you can
>>      > inject your page fault, without incurring an overhead for every
>>      domain.
>>
>>      It's not much of an overhead, basically if
>>      d->arch.hvm_domain.fault_info.virtual_address == 0 (which is almost
>>      always the case), nothing happens.
>>
>>
>> Since the majority of the domains will never use it, even a tiny
>> overhead adds up, especially over time. It would be a lot cleaner to
>> trap the execution of the VM at the moment when it is safe to inject the
>> page fault instead. For example you could just mark the process' code
>> pages non-executable in the EPT, catch the violations, and if the
>> conditions are met inject your pagefault.
> Ufortunately it's more complicated than that. Our application wants to
> be able to inject page faults at process initialization time, for PEB,
> etc., where we're not necessarily talking about a process executing code.
>
>>      > Also, it might make sense to perform some sanity checks on the
>>      vaddr and
>>      > address space before injection (ie. is the page really swapped out).
>>      > There is no guarantee that the page is still swapped out, even if you
>>      > checked before issuing xc_domain_set_pagefault_info, unless the domain
>>      > had been paused for both checking and setting.
>>
>>      As said above, the particular VCPU is in our case paused and waiting for
>>      a mem_event reply. The assumption is that other clients will work under
>>      similar circumstances, however it's always a good idea to check
>>      everything that can be checked.
>>
>>
>> I don't think having just the VCPU paused is enough, another still
>> active VCPU might still swap the page back, so you would really need to
>> have the entire VM paused for this to be safe. Furthermore, if there are
>> any limitations/assumptions like this about the intended use of the
>> function, describing them in a comment in xenctrl.h would be appropriate.
> A typical use case for this is:
>
> 1. the application figures out that it needs a swapped out page;
> 2. the application tries to bring it in (via the code in this patch);
> 3. the application maps the page.
>
> Now, if the application fails step 3, it might go back to step 2 until
> it succeeds, or it might give up after some retries. If, however, it
> succeeds, the mapped page should be safe to use until unmapped via libxc.
>
> I would add that in practice this problem never occured with any of the
> HVM Windows guests we've used for testing. The OS doesn't seem likely to
> immediately swap out a page that's just been brought in.
>
> If this is a concern, maybe I could simply add a comment in xenctrl.h
> that would say that the new function is only to be used while the domain
> is paused. I seem to recall having seen such comments there for other
> libxc functions.
>
>
> Thanks,
> Razvan Cojocaru
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (3 preceding siblings ...)
  2014-07-23 12:34 ` [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-07-24 11:20 ` Tim Deegan
  2014-07-24 11:40   ` Razvan Cojocaru
  2014-07-25 13:52   ` Razvan Cojocaru
  2014-07-24 12:32 ` Ian Jackson
  5 siblings, 2 replies; 31+ messages in thread
From: Tim Deegan @ 2014-07-24 11:20 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, jun.nakajima,
	Ian.Jackson, eddie.dong, xen-devel, andres, JBeulich,
	andrew.cooper3

Hi,

At 15:34 +0300 on 23 Jul (1406126080), Razvan Cojocaru wrote:
> +static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
> +    .read          = hvmemul_read,
> +    .insn_fetch    = hvmemul_insn_fetch,
> +    .write         = hvmemul_write_discard,
> +    .cmpxchg       = hvmemul_cmpxchg,
> +    .rep_ins       = hvmemul_rep_ins,
> +    .rep_outs      = hvmemul_rep_outs,
> +    .rep_movs      = hvmemul_rep_movs,

If you want to discard all writes from this guest instruction, you'll
also need modified versions of the rep_ins, rep_movs and cmpxchg
handlers.

> +    .read_segment  = hvmemul_read_segment,
> +    .write_segment = hvmemul_write_segment,
> +    .read_io       = hvmemul_read_io,
> +    .write_io      = hvmemul_write_io,
> +    .read_cr       = hvmemul_read_cr,
> +    .write_cr      = hvmemul_write_cr,
> +    .read_msr      = hvmemul_read_msr,
> +    .write_msr     = hvmemul_write_msr,
> +    .wbinvd        = hvmemul_wbinvd,
> +    .cpuid         = hvmemul_cpuid,
> +    .inject_hw_exception = hvmemul_inject_hw_exception,
> +    .inject_sw_interrupt = hvmemul_inject_sw_interrupt,

Do you care about injecting exceptions (stack updates &c)?

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-24 11:20 ` [PATCH RFC V3 1/5] xen: Emulate with no writes Tim Deegan
@ 2014-07-24 11:40   ` Razvan Cojocaru
  2014-07-25 13:52   ` Razvan Cojocaru
  1 sibling, 0 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 11:40 UTC (permalink / raw)
  To: Tim Deegan
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, Ian.Jackson,
	eddie.dong, xen-devel, JBeulich, andres, jun.nakajima,
	andrew.cooper3

On 07/24/2014 02:20 PM, Tim Deegan wrote:
> Hi,
> 
> At 15:34 +0300 on 23 Jul (1406126080), Razvan Cojocaru wrote:
>> +static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
>> +    .read          = hvmemul_read,
>> +    .insn_fetch    = hvmemul_insn_fetch,
>> +    .write         = hvmemul_write_discard,
>> +    .cmpxchg       = hvmemul_cmpxchg,
>> +    .rep_ins       = hvmemul_rep_ins,
>> +    .rep_outs      = hvmemul_rep_outs,
>> +    .rep_movs      = hvmemul_rep_movs,
> 
> If you want to discard all writes from this guest instruction, you'll
> also need modified versions of the rep_ins, rep_movs and cmpxchg
> handlers.

Noted, thanks.

>> +    .read_segment  = hvmemul_read_segment,
>> +    .write_segment = hvmemul_write_segment,
>> +    .read_io       = hvmemul_read_io,
>> +    .write_io      = hvmemul_write_io,
>> +    .read_cr       = hvmemul_read_cr,
>> +    .write_cr      = hvmemul_write_cr,
>> +    .read_msr      = hvmemul_read_msr,
>> +    .write_msr     = hvmemul_write_msr,
>> +    .wbinvd        = hvmemul_wbinvd,
>> +    .cpuid         = hvmemul_cpuid,
>> +    .inject_hw_exception = hvmemul_inject_hw_exception,
>> +    .inject_sw_interrupt = hvmemul_inject_sw_interrupt,
> 
> Do you care about injecting exceptions (stack updates &c)?

Yes, those should work.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (4 preceding siblings ...)
  2014-07-24 11:20 ` [PATCH RFC V3 1/5] xen: Emulate with no writes Tim Deegan
@ 2014-07-24 12:32 ` Ian Jackson
  2014-07-24 12:36   ` Razvan Cojocaru
  5 siblings, 1 reply; 31+ messages in thread
From: Ian Jackson @ 2014-07-24 12:32 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, JBeulich, andres, jun.nakajima

Razvan Cojocaru writes ("[PATCH RFC V3 1/5] xen: Emulate with no writes"):
> Added support for emulating an instruction with no memory writes.
> Additionally, introduced hvm_emulate_one_full(), which inspects
> possible return values from the hvm_emulate_one() functions
> (EXCEPTION, UNHANDLEABLE) and acts on them.

Can you explain the motivation for this series ?  (You've CC'd me on
1-5 but if there is a 0/5 I haven't seen it.)

Thanks,
Ian.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
  2014-07-23 13:40   ` Tamas Lengyel
@ 2014-07-24 12:33   ` Ian Jackson
  2014-07-24 15:27   ` Jan Beulich
  2 siblings, 0 replies; 31+ messages in thread
From: Ian Jackson @ 2014-07-24 12:33 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, JBeulich, andres, jun.nakajima

Razvan Cojocaru writes ("[PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc"):
> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
> new xc_domain_set_pagefault_info() function to set per-domain page
> fault injection information. All a call does is set per-domain info,
> and nothing actually happens until VMENTRY time, and then only if
> all conditions are met (the guest is in user mode, the set value
> matches CR3, and there are no other pending traps).
> This mechanism allows bringing in swapped-out pages for inspection.

Can this objective not be achieved without a hypervisor change, by a
suitable control interface to the paging daemon ?

Ian.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-24 12:32 ` Ian Jackson
@ 2014-07-24 12:36   ` Razvan Cojocaru
  2014-07-24 12:37     ` Razvan Cojocaru
  2014-07-24 17:25     ` Tian, Kevin
  0 siblings, 2 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 12:36 UTC (permalink / raw)
  To: Ian Jackson
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, jun.nakajima,
	andrew.cooper3, eddie.dong, xen-devel, andres, JBeulich

On 07/24/2014 03:32 PM, Ian Jackson wrote:
> Razvan Cojocaru writes ("[PATCH RFC V3 1/5] xen: Emulate with no writes"):
>> Added support for emulating an instruction with no memory writes.
>> Additionally, introduced hvm_emulate_one_full(), which inspects
>> possible return values from the hvm_emulate_one() functions
>> (EXCEPTION, UNHANDLEABLE) and acts on them.
> 
> Can you explain the motivation for this series ?  (You've CC'd me on
> 1-5 but if there is a 0/5 I haven't seen it.)

0/5 is here:
http://lists.xen.org/archives/html/xen-devel/2014-07/msg00304.html

But I think the initial description explains things better:

http://lists.xen.org/archives/html/xen-devel/2014-07/msg00304.html


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-24 12:36   ` Razvan Cojocaru
@ 2014-07-24 12:37     ` Razvan Cojocaru
  2014-07-24 17:25     ` Tian, Kevin
  1 sibling, 0 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 12:37 UTC (permalink / raw)
  To: Ian Jackson
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, JBeulich, andres, jun.nakajima

On 07/24/2014 03:36 PM, Razvan Cojocaru wrote:
> On 07/24/2014 03:32 PM, Ian Jackson wrote:
>> Razvan Cojocaru writes ("[PATCH RFC V3 1/5] xen: Emulate with no writes"):
>>> Added support for emulating an instruction with no memory writes.
>>> Additionally, introduced hvm_emulate_one_full(), which inspects
>>> possible return values from the hvm_emulate_one() functions
>>> (EXCEPTION, UNHANDLEABLE) and acts on them.
>>
>> Can you explain the motivation for this series ?  (You've CC'd me on
>> 1-5 but if there is a 0/5 I haven't seen it.)
> 
> 0/5 is here:
> http://lists.xen.org/archives/html/xen-devel/2014-07/msg00304.html

Sorry, correct link for 0/5:
http://lists.xen.org/archives/html/xen-devel/2014-07/msg02923.html


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 20:17           ` Andrei LUTAS
@ 2014-07-24 12:38             ` Ian Jackson
  2014-07-24 12:41               ` Andrew Cooper
  0 siblings, 1 reply; 31+ messages in thread
From: Ian Jackson @ 2014-07-24 12:38 UTC (permalink / raw)
  To: Andrei LUTAS
  Cc: kevin.tian, Ian Campbell, Razvan Cojocaru, Stefano Stabellini,
	Tamas Lengyel, eddie.dong, xen-devel, Jan Beulich,
	Andres Lagar-Cavilla, jun.nakajima, Andrew Cooper

Andrei LUTAS writes ("Re: [Xen-devel] [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc"):
> The logic with the #PF injection goes like this:

Sorry, I didn't see this message when I sent my last one asking about
the motivation.

> - The memory introspection engine wants to inspect (for whatever reason) a
>    virtual page X inside virtual address space Y (inside user-mode, although
>    with proper knowledge about the guest kernel, one can also inject #PF
>    inside kernel-mode on Windows - proper IRQL, working set trim inactive,
>    etc.)

I don't understand why this needs additional hypervisor code.  Can
this not be done by negotiating with the paging daemon ?

Ian.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-24 12:38             ` Ian Jackson
@ 2014-07-24 12:41               ` Andrew Cooper
  2014-07-24 12:44                 ` Razvan Cojocaru
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Cooper @ 2014-07-24 12:41 UTC (permalink / raw)
  To: Ian Jackson, Andrei LUTAS
  Cc: kevin.tian, Ian Campbell, Razvan Cojocaru, Stefano Stabellini,
	Tamas Lengyel, eddie.dong, xen-devel, Jan Beulich,
	Andres Lagar-Cavilla, jun.nakajima

On 24/07/14 13:38, Ian Jackson wrote:
> Andrei LUTAS writes ("Re: [Xen-devel] [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc"):
>> The logic with the #PF injection goes like this:
> Sorry, I didn't see this message when I sent my last one asking about
> the motivation.
>
>> - The memory introspection engine wants to inspect (for whatever reason) a
>>    virtual page X inside virtual address space Y (inside user-mode, although
>>    with proper knowledge about the guest kernel, one can also inject #PF
>>    inside kernel-mode on Windows - proper IRQL, working set trim inactive,
>>    etc.)
> I don't understand why this needs additional hypervisor code.  Can
> this not be done by negotiating with the paging daemon ?
>
> Ian.

The purpose of this is to play with the paging algorithm inside the
guest OS, which can only be done from outside by providing pagefaults
when considered safe to do so.

This has nothing to do with xen-paging transparently playing with the
guest physical address space behind the guests back.

~Andrew

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-24 12:41               ` Andrew Cooper
@ 2014-07-24 12:44                 ` Razvan Cojocaru
  0 siblings, 0 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 12:44 UTC (permalink / raw)
  To: Andrew Cooper, Ian Jackson, Andrei LUTAS
  Cc: kevin.tian, Ian Campbell, Stefano Stabellini, jun.nakajima,
	Tamas Lengyel, eddie.dong, xen-devel, Andres Lagar-Cavilla,
	Jan Beulich

On 07/24/2014 03:41 PM, Andrew Cooper wrote:
> On 24/07/14 13:38, Ian Jackson wrote:
>> Andrei LUTAS writes ("Re: [Xen-devel] [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc"):
>>> The logic with the #PF injection goes like this:
>> Sorry, I didn't see this message when I sent my last one asking about
>> the motivation.
>>
>>> - The memory introspection engine wants to inspect (for whatever reason) a
>>>    virtual page X inside virtual address space Y (inside user-mode, although
>>>    with proper knowledge about the guest kernel, one can also inject #PF
>>>    inside kernel-mode on Windows - proper IRQL, working set trim inactive,
>>>    etc.)
>> I don't understand why this needs additional hypervisor code.  Can
>> this not be done by negotiating with the paging daemon ?
>>
>> Ian.
> 
> The purpose of this is to play with the paging algorithm inside the
> guest OS, which can only be done from outside by providing pagefaults
> when considered safe to do so.
> 
> This has nothing to do with xen-paging transparently playing with the
> guest physical address space behind the guests back.

Thanks, you beat me to it :)


Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state
  2014-07-23 12:34 ` [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-07-24 15:07   ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2014-07-24 15:07 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
> +static void hvm_mem_event_fill_regs(mem_event_request_t *req)
> +{
> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    struct vcpu *v = current;

This should be "curr" instead of "v" (also further down).

> --- a/xen/include/public/mem_event.h
> +++ b/xen/include/public/mem_event.h
> @@ -48,6 +48,44 @@
>  #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
>                                               does NOT honour HVMPME_onchangeonly */
>  
> +/* Using a custom struct (not hvm_hw_cpu) so as to not fill
> + * the mem_event ring buffer too quickly. */
> +typedef struct mem_event_regs_st {
> +    uint64_t rax;
> +    uint64_t rcx;
> +    uint64_t rdx;
> +    uint64_t rbx;
> +    uint64_t rsp;
> +    uint64_t rbp;
> +    uint64_t rsi;
> +    uint64_t rdi;
> +    uint64_t r8;
> +    uint64_t r9;
> +    uint64_t r10;
> +    uint64_t r11;
> +    uint64_t r12;
> +    uint64_t r13;
> +    uint64_t r14;
> +    uint64_t r15;
> +    uint64_t rflags;
> +    uint64_t dr7;
> +    uint64_t rip;
> +    uint64_t cr0;
> +    uint64_t cr2;
> +    uint64_t cr3;
> +    uint64_t cr4;
> +    uint64_t sysenter_cs;
> +    uint64_t sysenter_esp;
> +    uint64_t sysenter_eip;
> +    uint64_t msr_efer;
> +    uint64_t msr_star;
> +    uint64_t msr_lstar;
> +    uint64_t fs_base;
> +    uint64_t gs_base;
> +    uint32_t cs_arbytes;

I realize that the header is already pretty x86-centric despite not sitting
under arch-x86/, but this clearly gets it over the boundary: These should
live in an x86-only header, or at least in an x86-only section.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-23 12:34 ` [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
@ 2014-07-24 15:14   ` Jan Beulich
  2014-07-24 15:56     ` Razvan Cojocaru
  0 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2014-07-24 15:14 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
> @@ -695,11 +696,34 @@ static void vmx_set_host_env(struct vcpu *v)
>  void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>  {
>      unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
> +    struct domain *d = v->domain;
>  
>      /* VMX MSR bitmap supported? */
>      if ( msr_bitmap == NULL )
>          return;
>  
> +    if ( mem_event_check_ring(&d->mem_event->access) )
> +    {
> +        /* Filter out MSR-s needed for memory introspection */
> +        switch ( msr )
> +        {
> +        case MSR_IA32_SYSENTER_EIP:
> +        case MSR_IA32_SYSENTER_ESP:
> +        case MSR_IA32_SYSENTER_CS:
> +        case MSR_IA32_MC0_CTL:
> +        case MSR_STAR:
> +        case MSR_LSTAR:
> +
> +            gdprintk(XENLOG_DEBUG, "MSR 0x%08x "

Is the current domain/vCPU really useful in this message? And
do you really need the file name to be printed here? And does the
MSR number really need to always be 8 characters wide? Or
perhaps - is this message useful at all?

> --- a/xen/arch/x86/mm/mem_event.c
> +++ b/xen/arch/x86/mm/mem_event.c
> @@ -30,6 +30,7 @@
>  #include <asm/mem_access.h>
>  #include <asm/mem_sharing.h>
>  #include <xsm/xsm.h>
> +#include <asm/hvm/vmx/vmcs.h>

Please don't.

> @@ -600,6 +601,22 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
>              rc = mem_event_enable(d, mec, med, _VPF_mem_access, 
>                                      HVM_PARAM_ACCESS_RING_PFN,
>                                      mem_access_notification);
> +            if ( rc == 0 )
> +            {
> +                struct vcpu *v;
> +
> +                /* Enable interception for MSRs needed for memory introspection. */
> +                for_each_vcpu ( d, v )
> +                {
> +                    /* Safe, because of previous if ( !cpu_has_vmx ) check. */

Safe or not, VMX-specific code doesn't belong here.

> +                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, MSR_TYPE_W);
> +                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, MSR_TYPE_W);
> +                    vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, MSR_TYPE_W);
> +                    vmx_enable_intercept_for_msr(v, MSR_IA32_MC0_CTL, MSR_TYPE_W);
> +                    vmx_enable_intercept_for_msr(v, MSR_STAR, MSR_TYPE_W);
> +                    vmx_enable_intercept_for_msr(v, MSR_LSTAR, MSR_TYPE_W);
> +                }
> +            }
>          }
>          break;

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
  2014-07-23 13:40   ` Tamas Lengyel
  2014-07-24 12:33   ` Ian Jackson
@ 2014-07-24 15:27   ` Jan Beulich
  2014-07-24 16:18     ` Razvan Cojocaru
  2 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2014-07-24 15:27 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
> +static void check_pf_injection(void)
> +{
> +    struct vcpu *curr = current;
> +    struct domain *d = curr->domain;
> +    struct hvm_hw_cpu ctxt;
> +    struct segment_register seg;
> +    int errcode = PFEC_user_mode;
> +
> +    if ( !is_hvm_domain(d)
> +         || d->arch.hvm_domain.fault_info.virtual_address == 0 )
> +        return;
> +
> +    hvm_funcs.save_cpu_ctxt(curr, &ctxt);

Isn't this a little heavy handed?

> +    hvm_get_segment_register(curr, x86_seg_ss, &seg);
> +
> +    if ( seg.attr.fields.dpl == 3 /* Guest is in user mode */

Did you verify that this covers VM86 mode too?

> +         && !ctxt.pending_event
> +         && ctxt.cr3 == d->arch.hvm_domain.fault_info.address_space )
> +    {
> +        /* Cache */

Cache? Did you mean "Latch" or some such perhaps?

> +        uint64_t virtual_address = d->arch.hvm_domain.fault_info.virtual_address;
> +        uint32_t write_access = d->arch.hvm_domain.fault_info.write_access;
> +
> +        /* Reset */
> +        d->arch.hvm_domain.fault_info.address_space = 0;
> +        d->arch.hvm_domain.fault_info.virtual_address = 0;
> +        d->arch.hvm_domain.fault_info.write_access = 0;
> +
> +        if ( write_access )
> +            errcode |= PFEC_write_access;
> +
> +        hvm_inject_page_fault(errcode, virtual_address);
> +    }
> +}

Even together with its call site it's remaining unclear without knowing
almost all of the rest of your code why this function would inject
only two types of page faults (and I'm still not finally sure this is
intended/correct). A comment would certainly help, short of a
more descriptive name for the function (which I suppose would get
unreasonably long).

> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -967,6 +967,32 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_set_pagefault_info:
> +    {
> +        struct domain *d;
> +
> +        ret = -ESRCH;
> +        d = rcu_lock_domain_by_id(op->domain);
> +        if ( d != NULL )
> +        {
> +            ret = -EINVAL;
> +
> +            if ( has_hvm_container_domain(d) )

This is inconsistent with the earlier use of is_hvm_domain(). You
ought to decide whether you want PVH to be supported.

> @@ -1012,6 +1024,7 @@ struct xen_domctl {
>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>  #define XEN_DOMCTL_gdbsx_domstatus             1003
> +#define XEN_DOMCTL_set_pagefault_info          1004

That doesn't look like the range you want to extend.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-23 12:34 ` [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-07-24 15:42   ` Jan Beulich
  2014-07-24 16:29     ` Razvan Cojocaru
  0 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2014-07-24 15:42 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
> @@ -1434,6 +1444,37 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
>              return 1;
>          }
>      }
> +    else
> +    {
> +        /* There's a mem_event listener */
> +        if ( hvm_funcs.exited_by_pagefault && !hvm_funcs.exited_by_pagefault() ) /* don't send a mem_event */
> +        {
> +            if ( v->arch.mem_event.emulate_flags == 0 )

The five lines above (leaving aside the comment) should all become
a simple "else if()".

> +            {
> +                v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
> +                v->arch.mem_event.gpa = gpa;
> +                v->arch.mem_event.eip = eip;
> +            }
> +        }
> +    }
> +
> +    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
> +    {
> +        v->arch.mem_event.emulate_flags = 0;
> +        v->arch.mem_event.gpa = gpa;
> +        v->arch.mem_event.eip = eip;
> +    }

So the default value for gpa and eip is zero - how do you deal with
guests having code/data at virtual/physical address zero? It would
seem to me that you need a "valid" qualification for these fields, but
since the purpose of this last block isn't really clear to me maybe I'm
simply missing something and a comment might help.

> +
> +    if ( v->arch.mem_event.emulate_flags )
> +    {
> +        if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE )
> +            hvm_emulate_one_full(1, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +        else
> +            hvm_emulate_one_full(0, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);

I'd prefer such to be coded as a single call with the first argument
being suitable calculated (you wouldn't even need a conditional
operator here).

> @@ -1475,11 +1516,74 @@ void p2m_mem_access_resume(struct domain *d)
>      /* Pull all responses off the ring */
>      while( mem_event_get_response(d, &d->mem_event->access, &rsp) )
>      {
> +        struct vcpu *v;
> +
>          if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
>              continue;
> +
> +        /* Validate the vcpu_id in the response. */
> +        if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
> +            continue;
> +
> +        v = d->vcpu[rsp.vcpu_id];
> +
> +        /* Mark vcpu for skipping one instruction upon rescheduling */
> +        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
> +        {
> +            xenmem_access_t access;
> +            int violation = 1;

bool_t?

> +
> +            v->arch.mem_event.emulate_flags = 0;
> +
> +            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
> +            {
> +                violation = 0;
> +
> +                switch (access)

Coding style.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-24 15:14   ` Jan Beulich
@ 2014-07-24 15:56     ` Razvan Cojocaru
  0 siblings, 0 replies; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 15:56 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

On 07/24/2014 06:14 PM, Jan Beulich wrote:
>>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
>> @@ -695,11 +696,34 @@ static void vmx_set_host_env(struct vcpu *v)
>>  void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>>  {
>>      unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
>> +    struct domain *d = v->domain;
>>  
>>      /* VMX MSR bitmap supported? */
>>      if ( msr_bitmap == NULL )
>>          return;
>>  
>> +    if ( mem_event_check_ring(&d->mem_event->access) )
>> +    {
>> +        /* Filter out MSR-s needed for memory introspection */
>> +        switch ( msr )
>> +        {
>> +        case MSR_IA32_SYSENTER_EIP:
>> +        case MSR_IA32_SYSENTER_ESP:
>> +        case MSR_IA32_SYSENTER_CS:
>> +        case MSR_IA32_MC0_CTL:
>> +        case MSR_STAR:
>> +        case MSR_LSTAR:
>> +
>> +            gdprintk(XENLOG_DEBUG, "MSR 0x%08x "
> 
> Is the current domain/vCPU really useful in this message? And
> do you really need the file name to be printed here? And does the
> MSR number really need to always be 8 characters wide? Or
> perhaps - is this message useful at all?

I thought so, but since it is indeed a minor thing I'll be happy to
remove it.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-24 15:27   ` Jan Beulich
@ 2014-07-24 16:18     ` Razvan Cojocaru
  2014-07-25  7:50       ` Jan Beulich
  0 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 16:18 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

On 07/24/2014 06:27 PM, Jan Beulich wrote:
>>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
>> +static void check_pf_injection(void)
>> +{
>> +    struct vcpu *curr = current;
>> +    struct domain *d = curr->domain;
>> +    struct hvm_hw_cpu ctxt;
>> +    struct segment_register seg;
>> +    int errcode = PFEC_user_mode;
>> +
>> +    if ( !is_hvm_domain(d)
>> +         || d->arch.hvm_domain.fault_info.virtual_address == 0 )
>> +        return;
>> +
>> +    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
> 
> Isn't this a little heavy handed?

It did seem that way to me too, but I couldn't find another way to get
to ctxt.pending_event and CR3 at this point in the code. Is there an
alternative I'm not aware of?

>> +    hvm_get_segment_register(curr, x86_seg_ss, &seg);
>> +
>> +    if ( seg.attr.fields.dpl == 3 /* Guest is in user mode */
> 
> Did you verify that this covers VM86 mode too?

Assuming CPL is SS.DPL (I've previously been using CS.DPL), it did
work in our tests, yes.

>> +         && !ctxt.pending_event
>> +         && ctxt.cr3 == d->arch.hvm_domain.fault_info.address_space )
>> +    {
>> +        /* Cache */
> 
> Cache? Did you mean "Latch" or some such perhaps?
> 
>> +        uint64_t virtual_address = d->arch.hvm_domain.fault_info.virtual_address;
>> +        uint32_t write_access = d->arch.hvm_domain.fault_info.write_access;

I meant "cache", as in "I'm caching
d->arch.hvm_domain.fault_info.virtual_address into virtual_address"
before resetting it, but the comment is not only unimportant but
obviously confusing, so I'll take it out.

>> +        /* Reset */
>> +        d->arch.hvm_domain.fault_info.address_space = 0;
>> +        d->arch.hvm_domain.fault_info.virtual_address = 0;
>> +        d->arch.hvm_domain.fault_info.write_access = 0;
>> +
>> +        if ( write_access )
>> +            errcode |= PFEC_write_access;
>> +
>> +        hvm_inject_page_fault(errcode, virtual_address);
>> +    }
>> +}
> 
> Even together with its call site it's remaining unclear without knowing
> almost all of the rest of your code why this function would inject
> only two types of page faults (and I'm still not finally sure this is
> intended/correct). A comment would certainly help, short of a
> more descriptive name for the function (which I suppose would get
> unreasonably long).

The function, as previously suggested, will be split into the check part
and the injection part.

What our code does in this case has been briefly described here:
http://lists.xen.org/archives/html/xen-devel/2014-07/msg03013.html

>> @@ -1012,6 +1024,7 @@ struct xen_domctl {
>>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>>  #define XEN_DOMCTL_gdbsx_domstatus             1003
>> +#define XEN_DOMCTL_set_pagefault_info          1004
> 
> That doesn't look like the range you want to extend.

Could you please elaborate? Thank you.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-24 15:42   ` Jan Beulich
@ 2014-07-24 16:29     ` Razvan Cojocaru
  2014-07-25  7:52       ` Jan Beulich
  0 siblings, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-24 16:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

On 07/24/2014 06:42 PM, Jan Beulich wrote:
>>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
>> +            {
>> +                v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
>> +                v->arch.mem_event.gpa = gpa;
>> +                v->arch.mem_event.eip = eip;
>> +            }
>> +        }
>> +    }
>> +
>> +    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
>> +    {
>> +        v->arch.mem_event.emulate_flags = 0;
>> +        v->arch.mem_event.gpa = gpa;
>> +        v->arch.mem_event.eip = eip;
>> +    }
> 
> So the default value for gpa and eip is zero - how do you deal with
> guests having code/data at virtual/physical address zero? It would
> seem to me that you need a "valid" qualification for these fields, but
> since the purpose of this last block isn't really clear to me maybe I'm
> simply missing something and a comment might help.

Some instructions may trigger a page fault, which then sends out a
mem_event, and the reply might have the MEM_EVENT_FLAG_EMULATE flag set.
If it is set, we could simply emulate the next instruction that triggers
a page fault, but unless eip and gpa match, that might be some other
instruction, not the one that originally triggered the mem_event.

What the last block does is it verifies that we're not emulating an
instruction different from the one that triggered the mem_event in the
first place. If this is not the same instruction, then don't emulate it
(but send out a new mem_event for it), and just reset those values.

That the default values are zero has not been a problem for us, but I do
see how that might not be desirable for other clients. A "valid"
qualification might indeed be the solution for this.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-24 12:36   ` Razvan Cojocaru
  2014-07-24 12:37     ` Razvan Cojocaru
@ 2014-07-24 17:25     ` Tian, Kevin
  1 sibling, 0 replies; 31+ messages in thread
From: Tian, Kevin @ 2014-07-24 17:25 UTC (permalink / raw)
  To: Razvan Cojocaru, Ian Jackson
  Cc: Ian.Campbell, stefano.stabellini, Nakajima, Jun, andrew.cooper3,
	Dong, Eddie, xen-devel, andres, JBeulich

> From: Razvan Cojocaru [mailto:rcojocaru@bitdefender.com]
> Sent: Thursday, July 24, 2014 5:36 AM
> 
> On 07/24/2014 03:32 PM, Ian Jackson wrote:
> > Razvan Cojocaru writes ("[PATCH RFC V3 1/5] xen: Emulate with no writes"):
> >> Added support for emulating an instruction with no memory writes.
> >> Additionally, introduced hvm_emulate_one_full(), which inspects
> >> possible return values from the hvm_emulate_one() functions
> >> (EXCEPTION, UNHANDLEABLE) and acts on them.
> >
> > Can you explain the motivation for this series ?  (You've CC'd me on
> > 1-5 but if there is a 0/5 I haven't seen it.)
> 
> 0/5 is here:
> http://lists.xen.org/archives/html/xen-devel/2014-07/msg00304.html
> 
> But I think the initial description explains things better:
> 
> http://lists.xen.org/archives/html/xen-devel/2014-07/msg00304.html

better to have the description in each patch series.

> 
> 
> Thanks,
> Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc
  2014-07-24 16:18     ` Razvan Cojocaru
@ 2014-07-25  7:50       ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2014-07-25  7:50 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 24.07.14 at 18:18, <rcojocaru@bitdefender.com> wrote:
> On 07/24/2014 06:27 PM, Jan Beulich wrote:
>>>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
>>> +static void check_pf_injection(void)
>>> +{
>>> +    struct vcpu *curr = current;
>>> +    struct domain *d = curr->domain;
>>> +    struct hvm_hw_cpu ctxt;
>>> +    struct segment_register seg;
>>> +    int errcode = PFEC_user_mode;
>>> +
>>> +    if ( !is_hvm_domain(d)
>>> +         || d->arch.hvm_domain.fault_info.virtual_address == 0 )
>>> +        return;
>>> +
>>> +    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
>> 
>> Isn't this a little heavy handed?
> 
> It did seem that way to me too, but I couldn't find another way to get
> to ctxt.pending_event and CR3 at this point in the code. Is there an
> alternative I'm not aware of?

The question was more towards "Wouldn't it be worthwhile
introducing a way for you to get at just the information you
need?"

>>> @@ -1012,6 +1024,7 @@ struct xen_domctl {
>>>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>>>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>>>  #define XEN_DOMCTL_gdbsx_domstatus             1003
>>> +#define XEN_DOMCTL_set_pagefault_info          1004
>> 
>> That doesn't look like the range you want to extend.
> 
> Could you please elaborate? Thank you.

Since your code is gdbsx unrelated, your addition belongs at the
end of the "base" range (currently in the 70s) rather than at the
end of the gdbsx one.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-24 16:29     ` Razvan Cojocaru
@ 2014-07-25  7:52       ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2014-07-25  7:52 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, andres, jun.nakajima, Ian.Jackson

>>> On 24.07.14 at 18:29, <rcojocaru@bitdefender.com> wrote:
> On 07/24/2014 06:42 PM, Jan Beulich wrote:
>>>>> On 23.07.14 at 14:34, <rcojocaru@bitdefender.com> wrote:
>>> +            {
>>> +                v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
>>> +                v->arch.mem_event.gpa = gpa;
>>> +                v->arch.mem_event.eip = eip;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
>>> +    {
>>> +        v->arch.mem_event.emulate_flags = 0;
>>> +        v->arch.mem_event.gpa = gpa;
>>> +        v->arch.mem_event.eip = eip;
>>> +    }
>> 
>> So the default value for gpa and eip is zero - how do you deal with
>> guests having code/data at virtual/physical address zero? It would
>> seem to me that you need a "valid" qualification for these fields, but
>> since the purpose of this last block isn't really clear to me maybe I'm
>> simply missing something and a comment might help.
> 
> Some instructions may trigger a page fault, which then sends out a
> mem_event, and the reply might have the MEM_EVENT_FLAG_EMULATE flag set.
> If it is set, we could simply emulate the next instruction that triggers
> a page fault, but unless eip and gpa match, that might be some other
> instruction, not the one that originally triggered the mem_event.
> 
> What the last block does is it verifies that we're not emulating an
> instruction different from the one that triggered the mem_event in the
> first place. If this is not the same instruction, then don't emulate it
> (but send out a new mem_event for it), and just reset those values.

Without a code comment I can see why you clear .emulate_flags, but
I don't see the point of updating .gpa and .eip.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-24 11:20 ` [PATCH RFC V3 1/5] xen: Emulate with no writes Tim Deegan
  2014-07-24 11:40   ` Razvan Cojocaru
@ 2014-07-25 13:52   ` Razvan Cojocaru
  2014-07-25 14:07     ` Jan Beulich
  1 sibling, 1 reply; 31+ messages in thread
From: Razvan Cojocaru @ 2014-07-25 13:52 UTC (permalink / raw)
  To: Tim Deegan
  Cc: kevin.tian, Ian.Campbell, stefano.stabellini, Ian.Jackson,
	eddie.dong, xen-devel, JBeulich, andres, jun.nakajima,
	andrew.cooper3

On 07/24/2014 02:20 PM, Tim Deegan wrote:
> Hi,
> 
> At 15:34 +0300 on 23 Jul (1406126080), Razvan Cojocaru wrote:
>> +static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
>> +    .read          = hvmemul_read,
>> +    .insn_fetch    = hvmemul_insn_fetch,
>> +    .write         = hvmemul_write_discard,
>> +    .cmpxchg       = hvmemul_cmpxchg,
>> +    .rep_ins       = hvmemul_rep_ins,
>> +    .rep_outs      = hvmemul_rep_outs,
>> +    .rep_movs      = hvmemul_rep_movs,
> 
> If you want to discard all writes from this guest instruction, you'll
> also need modified versions of the rep_ins, rep_movs and cmpxchg
> handlers.

Since this seems to have been the only comment on this particular patch
so far, would it be safe to assume that once the issue is addressed the
patch is acceptable?


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC V3 1/5] xen: Emulate with no writes
  2014-07-25 13:52   ` Razvan Cojocaru
@ 2014-07-25 14:07     ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2014-07-25 14:07 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: Tim Deegan, kevin.tian, Ian.Campbell, stefano.stabellini,
	andrew.cooper3, eddie.dong, xen-devel, andres, jun.nakajima,
	Ian.Jackson

>>> On 25.07.14 at 15:52, <rcojocaru@bitdefender.com> wrote:
> On 07/24/2014 02:20 PM, Tim Deegan wrote:
>> At 15:34 +0300 on 23 Jul (1406126080), Razvan Cojocaru wrote:
>>> +static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
>>> +    .read          = hvmemul_read,
>>> +    .insn_fetch    = hvmemul_insn_fetch,
>>> +    .write         = hvmemul_write_discard,
>>> +    .cmpxchg       = hvmemul_cmpxchg,
>>> +    .rep_ins       = hvmemul_rep_ins,
>>> +    .rep_outs      = hvmemul_rep_outs,
>>> +    .rep_movs      = hvmemul_rep_movs,
>> 
>> If you want to discard all writes from this guest instruction, you'll
>> also need modified versions of the rep_ins, rep_movs and cmpxchg
>> handlers.
> 
> Since this seems to have been the only comment on this particular patch
> so far, would it be safe to assume that once the issue is addressed the
> patch is acceptable?

I would think so, but on its own it doesn't make much sense, so
I'd assume it would only go in together with consumers of the
newly added code.

Jan

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2014-07-25 14:07 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-23 12:34 [PATCH RFC V3 1/5] xen: Emulate with no writes Razvan Cojocaru
2014-07-23 12:34 ` [PATCH RFC V3 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
2014-07-24 15:07   ` Jan Beulich
2014-07-23 12:34 ` [PATCH RFC V3 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
2014-07-24 15:14   ` Jan Beulich
2014-07-24 15:56     ` Razvan Cojocaru
2014-07-23 12:34 ` [PATCH RFC V3 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
2014-07-23 13:40   ` Tamas Lengyel
2014-07-23 14:08     ` Razvan Cojocaru
2014-07-23 14:27       ` Tamas Lengyel
2014-07-23 15:13         ` Razvan Cojocaru
2014-07-23 20:17           ` Andrei LUTAS
2014-07-24 12:38             ` Ian Jackson
2014-07-24 12:41               ` Andrew Cooper
2014-07-24 12:44                 ` Razvan Cojocaru
2014-07-24 12:33   ` Ian Jackson
2014-07-24 15:27   ` Jan Beulich
2014-07-24 16:18     ` Razvan Cojocaru
2014-07-25  7:50       ` Jan Beulich
2014-07-23 12:34 ` [PATCH RFC V3 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
2014-07-24 15:42   ` Jan Beulich
2014-07-24 16:29     ` Razvan Cojocaru
2014-07-25  7:52       ` Jan Beulich
2014-07-24 11:20 ` [PATCH RFC V3 1/5] xen: Emulate with no writes Tim Deegan
2014-07-24 11:40   ` Razvan Cojocaru
2014-07-25 13:52   ` Razvan Cojocaru
2014-07-25 14:07     ` Jan Beulich
2014-07-24 12:32 ` Ian Jackson
2014-07-24 12:36   ` Razvan Cojocaru
2014-07-24 12:37     ` Razvan Cojocaru
2014-07-24 17:25     ` Tian, Kevin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.