* [PATCH RFC V7 1/5] xen: Emulate with no writes
@ 2014-08-13 15:28 Razvan Cojocaru
  2014-08-13 15:28 ` [PATCH RFC V7 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-13 15:28 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, jbeulich, eddie.dong, jun.nakajima

Added support for emulating an instruction with no memory writes.
Additionally, introduced hvm_emulate_one_full(), which inspects
possible return values from the hvm_emulate_one() functions
(EXCEPTION, UNHANDLEABLE) and acts on them.
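
A minimal call-site sketch (this mirrors how patch 5/5 of the series
ends up invoking the new function; TRAP_invalid_op and
HVM_DELIVER_NO_ERROR_CODE are existing Xen constants):

    /* Emulate the current instruction, discarding any memory writes,
     * and inject #UD (no error code) if the emulator gives up. */
    hvm_emulate_one_full(1 /* nowrite */, TRAP_invalid_op,
                         HVM_DELIVER_NO_ERROR_CODE);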

Changes since V1:
 - Removed the Linux code that computes the length of an instruction.
 - Unused function parameters are no longer marked.
 - Refactored the code to eliminate redundancy.
 - Made the exception passed on to the guest by hvm_emulate_one_full()
   configurable.

Changes since V2:
 - Renamed hvmemul_write_dummy() to hvmemul_write_discard().
 - Fixed a comment (Xen coding style rules).
 - Renamed hvm_emulate_one_with_ops() to _hvm_emulate_one().
 - Changed stack variable hvm_emulate_ops_no_write into a static const.
 - Modified struct hvm_emulate_ctxt ctx initialization syntax.
 - Renamed unhandleable_trapnr to trapnr and unhandleable_errcode
   to errcode.
 - Changed errcode's type to unsigned int.
 - Removed hvm_emulate_one() loop that went on until it returned
   something other than X86EMUL_RETRY (to prevent potential
   blocking against the time calibration rendezvous).

Changes since V3:
 - The rep_ins, rep_movs and cmpxchg handlers are
   now also inactive.

Changes since V4:
 - Also discarding IO reads (dummy read_io handler).

Changes since V5:
 - Added discard versions of write_io and rep_outs.

Changes since V6:
 - Added discard versions of write_msr and wbinvd.
 - Now calling hvm_emulate_writeback() for the X86EMUL_UNHANDLEABLE
   case as well.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/emulate.c        |  167 ++++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/hvm/emulate.h |    5 ++
 2 files changed, 169 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index eac159f..7563685 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -688,6 +688,94 @@ static int hvmemul_write(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_write_discard(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    /* Discarding the write. */
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_rep_ins_discard(
+    uint16_t src_port,
+    enum x86_segment dst_seg,
+    unsigned long dst_offset,
+    unsigned int bytes_per_rep,
+    unsigned long *reps,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_rep_movs_discard(
+    enum x86_segment src_seg,
+    unsigned long src_offset,
+    enum x86_segment dst_seg,
+    unsigned long dst_offset,
+    unsigned int bytes_per_rep,
+    unsigned long *reps,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_rep_outs_discard(
+    enum x86_segment src_seg,
+    unsigned long src_offset,
+    uint16_t dst_port,
+    unsigned int bytes_per_rep,
+    unsigned long *reps,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_cmpxchg_discard(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_read_io_discard(
+    unsigned int port,
+    unsigned int bytes,
+    unsigned long *val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_write_io_discard(
+    unsigned int port,
+    unsigned int bytes,
+    unsigned long val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_write_msr_discard(
+    unsigned long reg,
+    uint64_t val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_wbinvd_discard(
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1138,8 +1226,33 @@ static const struct x86_emulate_ops hvm_emulate_ops = {
     .invlpg        = hvmemul_invlpg
 };
 
-int hvm_emulate_one(
-    struct hvm_emulate_ctxt *hvmemul_ctxt)
+static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
+    .read          = hvmemul_read,
+    .insn_fetch    = hvmemul_insn_fetch,
+    .write         = hvmemul_write_discard,
+    .cmpxchg       = hvmemul_cmpxchg_discard,
+    .rep_ins       = hvmemul_rep_ins_discard,
+    .rep_outs      = hvmemul_rep_outs_discard,
+    .rep_movs      = hvmemul_rep_movs_discard,
+    .read_segment  = hvmemul_read_segment,
+    .write_segment = hvmemul_write_segment,
+    .read_io       = hvmemul_read_io_discard,
+    .write_io      = hvmemul_write_io_discard,
+    .read_cr       = hvmemul_read_cr,
+    .write_cr      = hvmemul_write_cr,
+    .read_msr      = hvmemul_read_msr,
+    .write_msr     = hvmemul_write_msr_discard,
+    .wbinvd        = hvmemul_wbinvd_discard,
+    .cpuid         = hvmemul_cpuid,
+    .inject_hw_exception = hvmemul_inject_hw_exception,
+    .inject_sw_interrupt = hvmemul_inject_sw_interrupt,
+    .get_fpu       = hvmemul_get_fpu,
+    .put_fpu       = hvmemul_put_fpu,
+    .invlpg        = hvmemul_invlpg
+};
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
 {
     struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
@@ -1191,7 +1304,7 @@ int hvm_emulate_one(
     vio->mmio_retrying = vio->mmio_retry;
     vio->mmio_retry = 0;
 
-    rc = x86_emulate(&hvmemul_ctxt->ctxt, &hvm_emulate_ops);
+    rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
 
     if ( rc == X86EMUL_OKAY && vio->mmio_retry )
         rc = X86EMUL_RETRY;
@@ -1239,6 +1352,54 @@ int hvm_emulate_one(
     return X86EMUL_OKAY;
 }
 
+int hvm_emulate_one(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops);
+}
+
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops_no_write);
+}
+
+void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
+    unsigned int errcode)
+{
+    struct hvm_emulate_ctxt ctx = {{ 0 }};
+    int rc;
+
+    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
+
+    if ( nowrite )
+        rc = hvm_emulate_one_no_write(&ctx);
+    else
+        rc = hvm_emulate_one(&ctx);
+
+    switch ( rc )
+    {
+    case X86EMUL_UNHANDLEABLE:
+        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
+               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
+               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
+               ctx.insn_buf_eip,
+               ctx.insn_buf[0], ctx.insn_buf[1],
+               ctx.insn_buf[2], ctx.insn_buf[3],
+               ctx.insn_buf[4], ctx.insn_buf[5],
+               ctx.insn_buf[6], ctx.insn_buf[7],
+               ctx.insn_buf[8], ctx.insn_buf[9]);
+        hvm_inject_hw_exception(trapnr, errcode);
+        break;
+    case X86EMUL_EXCEPTION:
+        if ( ctx.exn_pending )
+            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
+        break;
+    }
+
+    hvm_emulate_writeback(&ctx);
+}
+
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs)
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 00a06cc..a74f310 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -37,6 +37,11 @@ struct hvm_emulate_ctxt {
 
 int hvm_emulate_one(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt);
+void hvm_emulate_one_full(bool_t nowrite,
+    unsigned int trapnr,
+    unsigned int errcode);
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
-- 
1.7.9.5

* [PATCH RFC V7 2/5] xen: Optimize introspection access to guest state
  2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
@ 2014-08-13 15:28 ` Razvan Cojocaru
  2014-08-13 15:28 ` [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events Razvan Cojocaru
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-13 15:28 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, jbeulich, eddie.dong, jun.nakajima

Speed optimization for introspection purposes: a handful of registers
are sent along with each mem_event. This requires enlarging the
mem_event_request / mem_event_response structures and adding code to
fill in the relevant values. Since the EPT event processing code needs
more data than the CR3 or MSR event processors, hvm_mem_event_fill_regs()
fills in less data than p2m_mem_event_fill_regs(), in order to avoid
overhead. Struct hvm_hw_cpu was considered instead of the custom
struct mem_event_regs_st, but its size would cause the mem_event ring
buffer to fill up too quickly.
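
A consumer-side sketch (the dump_event_regs() helper is hypothetical
and the ring handling is elided; the fields match struct
mem_event_regs_x86 introduced below):

    #include <inttypes.h>
    #include <stdio.h>

    /* Print a few of the registers shipped with the event, avoiding a
     * separate get-context round trip to the hypervisor. */
    static void dump_event_regs(const mem_event_request_t *req)
    {
        printf("vcpu%" PRIu32 " rip=0x%" PRIx64 " cr3=0x%" PRIx64 "\n",
               req->vcpu_id, req->x86_regs.rip, req->x86_regs.cr3);
    }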

Changes since V1:
 - Replaced guest_x86_mode with cs_arbytes in the mem_event_regs_st
   structure.
 - Removed superfluous preprocessor check for __x86_64__ in
   p2m_mem_event_fill_regs().

Changes since V2:
 - Removed inline function compiler suggestions.
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Padded struct mem_event_regs_st.
 - Renamed mem_event_regs_t to mem_event_regs.
 - Readability changes.

Changes since V3:
 - Changed the name of VCPU variables referring to the current
   VCPU to "curr".
 - Renamed "mem_event_regs" to "x86_mem_event_regs" to make it
   explicit.

Changes since V4:
 - Renamed "x86_mem_event_regs" to "mem_event_regs_x86" and
   removed a typedef.

Changes since V5:
 - Const-correctness.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/hvm/hvm.c         |   33 +++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c          |   57 ++++++++++++++++++++++++++++++++++++++++
 xen/include/public/mem_event.h |   39 +++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index d40c48e..a43d892 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -6150,6 +6150,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
     return rc;
 }
 
+static void hvm_mem_event_fill_regs(mem_event_request_t *req)
+{
+    const struct cpu_user_regs *regs = guest_cpu_user_regs();
+    const struct vcpu *curr = current;
+
+    req->x86_regs.rax = regs->eax;
+    req->x86_regs.rcx = regs->ecx;
+    req->x86_regs.rdx = regs->edx;
+    req->x86_regs.rbx = regs->ebx;
+    req->x86_regs.rsp = regs->esp;
+    req->x86_regs.rbp = regs->ebp;
+    req->x86_regs.rsi = regs->esi;
+    req->x86_regs.rdi = regs->edi;
+
+    req->x86_regs.r8  = regs->r8;
+    req->x86_regs.r9  = regs->r9;
+    req->x86_regs.r10 = regs->r10;
+    req->x86_regs.r11 = regs->r11;
+    req->x86_regs.r12 = regs->r12;
+    req->x86_regs.r13 = regs->r13;
+    req->x86_regs.r14 = regs->r14;
+    req->x86_regs.r15 = regs->r15;
+
+    req->x86_regs.rflags = regs->eflags;
+    req->x86_regs.rip    = regs->eip;
+
+    req->x86_regs.msr_efer = curr->arch.hvm_vcpu.guest_efer;
+    req->x86_regs.cr0 = curr->arch.hvm_vcpu.guest_cr[0];
+    req->x86_regs.cr3 = curr->arch.hvm_vcpu.guest_cr[3];
+    req->x86_regs.cr4 = curr->arch.hvm_vcpu.guest_cr[4];
+}
+
 static int hvm_memory_event_traps(long p, uint32_t reason,
                                   unsigned long value, unsigned long old, 
                                   bool_t gla_valid, unsigned long gla) 
@@ -6194,6 +6226,7 @@ static int hvm_memory_event_traps(long p, uint32_t reason,
         req.gla = old;
     }
     
+    hvm_mem_event_fill_regs(&req);
     mem_event_put_request(d, &d->mem_event->access, &req);
     
     return 1;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index bca9f0f..41a316a 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1323,6 +1323,61 @@ void p2m_mem_paging_resume(struct domain *d)
     }
 }
 
+static void p2m_mem_event_fill_regs(mem_event_request_t *req)
+{
+    const struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register seg;
+    struct hvm_hw_cpu ctxt;
+    struct vcpu *curr = current;
+
+    /* Architecture-specific vmcs/vmcb bits */
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+
+    req->x86_regs.rax = regs->eax;
+    req->x86_regs.rcx = regs->ecx;
+    req->x86_regs.rdx = regs->edx;
+    req->x86_regs.rbx = regs->ebx;
+    req->x86_regs.rsp = regs->esp;
+    req->x86_regs.rbp = regs->ebp;
+    req->x86_regs.rsi = regs->esi;
+    req->x86_regs.rdi = regs->edi;
+
+    req->x86_regs.r8  = regs->r8;
+    req->x86_regs.r9  = regs->r9;
+    req->x86_regs.r10 = regs->r10;
+    req->x86_regs.r11 = regs->r11;
+    req->x86_regs.r12 = regs->r12;
+    req->x86_regs.r13 = regs->r13;
+    req->x86_regs.r14 = regs->r14;
+    req->x86_regs.r15 = regs->r15;
+
+    req->x86_regs.rflags = regs->eflags;
+    req->x86_regs.rip    = regs->eip;
+
+    req->x86_regs.dr7 = curr->arch.debugreg[7];
+    req->x86_regs.cr0 = ctxt.cr0;
+    req->x86_regs.cr2 = ctxt.cr2;
+    req->x86_regs.cr3 = ctxt.cr3;
+    req->x86_regs.cr4 = ctxt.cr4;
+
+    req->x86_regs.sysenter_cs = ctxt.sysenter_cs;
+    req->x86_regs.sysenter_esp = ctxt.sysenter_esp;
+    req->x86_regs.sysenter_eip = ctxt.sysenter_eip;
+
+    req->x86_regs.msr_efer = ctxt.msr_efer;
+    req->x86_regs.msr_star = ctxt.msr_star;
+    req->x86_regs.msr_lstar = ctxt.msr_lstar;
+
+    hvm_get_segment_register(curr, x86_seg_fs, &seg);
+    req->x86_regs.fs_base = seg.base;
+
+    hvm_get_segment_register(curr, x86_seg_gs, &seg);
+    req->x86_regs.gs_base = seg.base;
+
+    hvm_get_segment_register(curr, x86_seg_cs, &seg);
+    req->x86_regs.cs_arbytes = seg.attr.bytes;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1410,6 +1465,8 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
         req->access_x = access_x;
     
         req->vcpu_id = v->vcpu_id;
+
+        p2m_mem_event_fill_regs(req);
     }
 
     /* Pause the current VCPU */
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 3831b41..bbd1636 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -48,6 +48,44 @@
 #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
                                              does NOT honour HVMPME_onchangeonly */
 
+/* Using a custom struct (not hvm_hw_cpu) so as to not fill
+ * the mem_event ring buffer too quickly. */
+struct mem_event_regs_x86 {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+    uint64_t rflags;
+    uint64_t dr7;
+    uint64_t rip;
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+    uint64_t msr_efer;
+    uint64_t msr_star;
+    uint64_t msr_lstar;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint32_t cs_arbytes;
+    uint32_t _pad;
+};
+
 typedef struct mem_event_st {
     uint32_t flags;
     uint32_t vcpu_id;
@@ -65,6 +103,7 @@ typedef struct mem_event_st {
     uint16_t available:12;
 
     uint16_t reason;
+    struct mem_event_regs_x86 x86_regs;
 } mem_event_request_t, mem_event_response_t;
 
 DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
-- 
1.7.9.5

* [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events
  2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-08-13 15:28 ` [PATCH RFC V7 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-08-13 15:28 ` Razvan Cojocaru
  2014-08-26 14:05   ` Jan Beulich
  2014-08-13 15:28 ` [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-13 15:28 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, jbeulich, eddie.dong, jun.nakajima

vmx_disable_intercept_for_msr() will now refuse to disable interception of
MSRs needed for memory introspection. It is not possible to gate this on
mem_access being active for the domain, since by the time mem_access does
become active the interception for the interesting MSRs has already been
disabled (vmx_disable_intercept_for_msr() runs very early on).
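
A dom0-side sketch (xch and domid are placeholders for state the
monitoring tool already holds; this is a fragment of a hypothetical
setup function):

    uint32_t port;
    void *ring_page = xc_mem_access_enable_introspection(xch, domid,
                                                         &port);

    /* On failure, neither mem_access nor the forced MSR interception
     * is enabled for the domain. */
    if ( ring_page == NULL )
        return -1;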

Changes since V1:
 - Replaced printk() with gdprintk(XENLOG_DEBUG, ...).

Changes since V2:
 - Split a log line differently to keep it grepable.
 - Interception for relevant MSRs will be disabled only if
   mem_access is not enabled.
 - Since they end up being disabled early on (when mem_access
   is not enabled yet), re-enable interception when mem_access
   becomes active.

Changes since V3:
 - Removed log line stating that MSR interception cannot be disabled.
 - Removed superfluous #include <asm/hvm/vmx/vmcs.h>.
 - Moved VMX-specific code to vmx.c (as a new hvm_funcs member).

Changes since V5:
 - Added xc_mem_access_enable_introspection() to libxc, which has
   the same parameters and semantics as xc_mem_access_enable(),
   but it additionally sets d->arch.hvm_domain.introspection_enabled
   and enables relevant MSR interception.
 - Renamed vmx_enable_intro_msr_interception() to
   vmx_enable_msr_exit_interception().
 - Simplified vmx_enable_msr_exit_interception() (const array of MSRs).

Changes since V6:
 - Moved the array of interesting MSRs to common header.
 - Minor code cleanup.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 tools/libxc/xc_mem_access.c        |   10 +++++++++-
 tools/libxc/xc_mem_event.c         |    7 +++++--
 tools/libxc/xc_private.h           |    2 +-
 tools/libxc/xenctrl.h              |    2 ++
 xen/arch/x86/hvm/vmx/vmcs.c        |   13 +++++++++++++
 xen/arch/x86/hvm/vmx/vmx.c         |   12 ++++++++++++
 xen/arch/x86/mm/mem_event.c        |   13 +++++++++++++
 xen/include/asm-x86/hvm/domain.h   |    2 ++
 xen/include/asm-x86/hvm/hvm.h      |    2 ++
 xen/include/asm-x86/hvm/vmx/vmcs.h |    9 +++++++++
 xen/include/public/domctl.h        |    7 ++++---
 11 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/tools/libxc/xc_mem_access.c b/tools/libxc/xc_mem_access.c
index 461f0e9..55d0e9f 100644
--- a/tools/libxc/xc_mem_access.c
+++ b/tools/libxc/xc_mem_access.c
@@ -26,7 +26,15 @@
 
 void *xc_mem_access_enable(xc_interface *xch, domid_t domain_id, uint32_t *port)
 {
-    return xc_mem_event_enable(xch, domain_id, HVM_PARAM_ACCESS_RING_PFN, port);
+    return xc_mem_event_enable(xch, domain_id, HVM_PARAM_ACCESS_RING_PFN,
+                               port, 0);
+}
+
+void *xc_mem_access_enable_introspection(xc_interface *xch, domid_t domain_id,
+                                         uint32_t *port)
+{
+    return xc_mem_event_enable(xch, domain_id, HVM_PARAM_ACCESS_RING_PFN,
+                               port, 1);
 }
 
 int xc_mem_access_disable(xc_interface *xch, domid_t domain_id)
diff --git a/tools/libxc/xc_mem_event.c b/tools/libxc/xc_mem_event.c
index faf1cc6..8c0be4e 100644
--- a/tools/libxc/xc_mem_event.c
+++ b/tools/libxc/xc_mem_event.c
@@ -57,7 +57,7 @@ int xc_mem_event_memop(xc_interface *xch, domid_t domain_id,
 }
 
 void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
-                          uint32_t *port)
+                          uint32_t *port, int enable_introspection)
 {
     void *ring_page = NULL;
     uint64_t pfn;
@@ -120,7 +120,10 @@ void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
         break;
 
     case HVM_PARAM_ACCESS_RING_PFN:
-        op = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE;
+        if ( enable_introspection )
+            op = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION;
+        else
+            op = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE;
         mode = XEN_DOMCTL_MEM_EVENT_OP_ACCESS;
         break;
 
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index c50a7c9..94df688 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -381,6 +381,6 @@ int xc_mem_event_memop(xc_interface *xch, domid_t domain_id,
  * param can be HVM_PARAM_PAGING/ACCESS/SHARING_RING_PFN
  */
 void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
-                          uint32_t *port);
+                          uint32_t *port, int enable_introspection);
 
 #endif /* __XC_PRIVATE_H__ */
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 1c5d0db..28b5562 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -2263,6 +2263,8 @@ int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
  * Caller has to unmap this page when done.
  */
 void *xc_mem_access_enable(xc_interface *xch, domid_t domain_id, uint32_t *port);
+void *xc_mem_access_enable_introspection(xc_interface *xch, domid_t domain_id,
+                                         uint32_t *port);
 int xc_mem_access_disable(xc_interface *xch, domid_t domain_id);
 int xc_mem_access_resume(xc_interface *xch, domid_t domain_id);
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 4a4f4e1..48f3c5d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -39,6 +39,7 @@
 #include <xen/keyhandler.h>
 #include <asm/shadow.h>
 #include <asm/tboot.h>
+#include <asm/mem_event.h>
 
 static bool_t __read_mostly opt_vpid_enabled = 1;
 boolean_param("vpid", opt_vpid_enabled);
@@ -695,11 +696,23 @@ static void vmx_set_host_env(struct vcpu *v)
 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
 {
     unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
+    struct domain *d = v->domain;
 
     /* VMX MSR bitmap supported? */
     if ( msr_bitmap == NULL )
         return;
 
+    if ( mem_event_check_ring(&d->mem_event->access) &&
+         d->arch.hvm_domain.introspection_enabled )
+    {
+        unsigned int i;
+
+        /* Filter out MSR-s needed for memory introspection */
+        for ( i = 0; i < ARRAY_SIZE(msrs_exit_array); i++ )
+            if ( msr == msrs_exit_array[i] )
+                return;
+    }
+
     /*
      * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
      * have the write-low and read-high bitmap offsets the wrong way round.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index fb65c7d..1de9f09 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1682,6 +1682,17 @@ void vmx_hypervisor_cpuid_leaf(uint32_t sub_idx,
         *eax |= XEN_HVM_CPUID_X2APIC_VIRT;
 }
 
+static void vmx_enable_msr_exit_interception(struct domain *d)
+{
+    struct vcpu *v;
+    unsigned int i;
+
+    /* Enable interception for MSRs needed for memory introspection. */
+    for_each_vcpu ( d, v )
+        for ( i = 0; i < ARRAY_SIZE(msrs_exit_array); i++ )
+            vmx_enable_intercept_for_msr(v, msrs_exit_array[i], MSR_TYPE_W);
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -1740,6 +1751,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .handle_eoi           = vmx_handle_eoi,
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
     .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
+    .enable_msr_exit_interception = vmx_enable_msr_exit_interception,
 };
 
 const struct hvm_function_table * __init start_vmx(void)
diff --git a/xen/arch/x86/mm/mem_event.c b/xen/arch/x86/mm/mem_event.c
index ba7e71e..84a1d8d 100644
--- a/xen/arch/x86/mm/mem_event.c
+++ b/xen/arch/x86/mm/mem_event.c
@@ -587,6 +587,7 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
         switch( mec->op )
         {
         case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE:
+        case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION:
         {
             rc = -ENODEV;
             /* Only HAP is supported */
@@ -600,13 +601,25 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
             rc = mem_event_enable(d, mec, med, _VPF_mem_access, 
                                     HVM_PARAM_ACCESS_RING_PFN,
                                     mem_access_notification);
+
+            if (  mec->op == XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE )
+                break;
+
+            if ( rc == 0 && hvm_funcs.enable_msr_exit_interception )
+            {
+                d->arch.hvm_domain.introspection_enabled = 1;
+                hvm_funcs.enable_msr_exit_interception(d);
+            }
         }
         break;
 
         case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE:
         {
             if ( med->ring_page )
+            {
                 rc = mem_event_disable(d, med);
+                d->arch.hvm_domain.introspection_enabled = 0;
+            }
         }
         break;
 
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 291a2e0..947b061 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -145,6 +145,8 @@ struct hvm_domain {
         struct vmx_domain vmx;
         struct svm_domain svm;
     };
+
+    bool_t introspection_enabled;
 };
 
 #define hap_enabled(d)  ((d)->arch.hvm_domain.hap_enabled)
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 0ebd478..069ba39 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -205,6 +205,8 @@ struct hvm_function_table {
     void (*hypervisor_cpuid_leaf)(uint32_t sub_idx,
                                   uint32_t *eax, uint32_t *ebx,
                                   uint32_t *ecx, uint32_t *edx);
+
+    void (*enable_msr_exit_interception)(struct domain *d);
 };
 
 extern struct hvm_function_table hvm_funcs;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 215d93c..d5e4cc0 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -471,6 +471,15 @@ enum vmcs_field {
     HOST_RIP                        = 0x00006c16,
 };
 
+static const u32 msrs_exit_array[] = {
+    MSR_IA32_SYSENTER_EIP,
+    MSR_IA32_SYSENTER_ESP,
+    MSR_IA32_SYSENTER_CS,
+    MSR_IA32_MC0_CTL,
+    MSR_STAR,
+    MSR_LSTAR
+};
+
 #define VMCS_VPID_WIDTH 16
 
 #define MSR_TYPE_R 1
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..a3431ec 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -772,10 +772,11 @@ struct xen_domctl_gdbsx_domstatus {
  * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest
  * EBUSY  - guest has or had access enabled, ring buffer still active
  */
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS            2
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS                        2
 
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE     0
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE    1
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE                 0
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE                1
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION   2
 
 /*
  * Sharing ENOMEM helper.
-- 
1.7.9.5

* [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-08-13 15:28 ` [PATCH RFC V7 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
  2014-08-13 15:28 ` [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events Razvan Cojocaru
@ 2014-08-13 15:28 ` Razvan Cojocaru
  2014-08-26 14:13   ` Jan Beulich
  2014-08-13 15:28 ` [PATCH RFC V7 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
  2014-08-26 13:56 ` [PATCH RFC V7 1/5] xen: Emulate with no writes Jan Beulich
  4 siblings, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-13 15:28 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, jbeulich, eddie.dong, jun.nakajima

Added a new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
new xc_domain_set_pagefault_info() function to set per-domain page
fault injection information. A call merely sets the per-domain info;
nothing actually happens until VMENTRY time, and then only if all
conditions are met (the guest is in user mode, the set value matches
CR3, and there are no other pending traps).
This mechanism allows bringing in swapped-out pages for inspection.
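
A sketch of the libxc side (cr3 and gva stand in for values the
introspection tool has already obtained, e.g. from mem_event data):

    xen_domctl_set_pagefault_info_t info = {
        .address_space   = cr3, /* compared against guest CR3 */
        .virtual_address = gva, /* swapped-out page to bring back in */
        .errcode         = 0,   /* Xen ORs in PFEC_user_mode itself */
    };

    if ( xc_domain_set_pagefault_info(xch, domid, &info) != 0 )
        fprintf(stderr, "failed to request page fault injection\n");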

Changes since V2:
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Now checking SS.DPL instead of CS.DPL to see if the guest is in
   user mode.
 - Removed superfluous fault_info members initialization.
 - Now returning -EINVAL if !has_hvm_container_domain(d) on
   XEN_DOMCTL_set_pagefault_info.
 - Moved struct fault_info from struct domain to struct hvm_domain.
 - Collapsed explicit initialization of the fault_info members into a
   one-line assignment on the tools/libxc side.
 - Now using write_access as a proper bool instead of shifting and
   OR-ing it with PFEC_user_mode.

Changes since V3:
 - Now checking that is_hvm_domain(d) in do_domctl() (consistent with
   the test in the actual page fault injection code).
 - XEN_DOMCTL_set_pagefault_info now extends the proper range.
 - Added a likely() around the
   d->arch.hvm_domain.fault_info.virtual_address == 0 check.
 - Replaced hvm_funcs.save_cpu_ctxt() function call with lighter code.
 - Split the vmx.c page fault injection function into a check
   function and an inject function.

Changes since V4:
 - Added "valid" flag instead of relying on virtual address 0 as a
   special case.
 - Using #defines instead of magic constants in vmx_vmcs_save().
 - Padded struct xen_domctl_set_pagefault_info.
 - Coding style and readability changes.

Changes since V5:
 - Renamed "struct domain *d" to "currd" where appropriate.
 - Using ! to test bool_t variables.
 - Replaced hvm_get_segment_register() with
   vmx_get_segment_register() in vmx.c.
 - Removed double blank line.
 - Proper comparison of CR3 and address_space (taking into account
   LM, PAE).
 - Using domain variable already set up in do_domctl().
 - Removed optional parentheses.
 - Removed d->arch.hvm_domain.fault_info reset (only the "valid"
   flag counts anyway).

Changes since V6:
 - Removed the write_access flag, now passing the whole error code
   (still OR-ed with PFEC_user_mode).
 - Replaced hex constants with #defined macros.
 - Minor code / coding style cleanup.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
---
 tools/libxc/xc_domain.c          |   16 +++++++++++
 tools/libxc/xenctrl.h            |    4 +++
 xen/arch/x86/hvm/vmx/vmx.c       |   56 ++++++++++++++++++++++++++++++++++++++
 xen/common/domctl.c              |   19 +++++++++++++
 xen/include/asm-x86/hvm/domain.h |    8 ++++++
 xen/include/public/domctl.h      |   16 +++++++++++
 6 files changed, 119 insertions(+)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index c67ac9a..6e327de 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -507,6 +507,22 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
     return ret;
 }
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info)
+{
+    DECLARE_DOMCTL;
+
+    if (info == NULL)
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
+    domctl.domain = (domid_t)domid;
+    domctl.u.set_pagefault_info = *info;
+
+    return do_domctl(xch, &domctl);
+}
+
 int xc_vcpu_getcontext(xc_interface *xch,
                        uint32_t domid,
                        uint32_t vcpu,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 28b5562..6dad615 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -803,6 +803,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
 const char *xc_domain_get_native_protocol(xc_interface *xch,
                                           uint32_t domid);
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info);
+
 /**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 1de9f09..45553e2 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3109,6 +3109,59 @@ out:
         nvmx_idtv_handling();
 }
 
+static bool_t vmx_check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    struct segment_register seg;
+    unsigned long ev;
+    uint32_t pending_event = 0;
+    uint64_t mask;
+
+    if ( !is_hvm_domain(currd) ||
+         likely(!currd->arch.hvm_domain.fault_info.valid) )
+        return 0;
+
+    vmx_get_segment_register(curr, x86_seg_ss, &seg);
+
+    if ( seg.attr.fields.dpl != 3 ) /* Guest is not in user mode */
+        return 0;
+
+    if ( hvm_long_mode_enabled(curr) )
+        mask = (PADDR_MASK & PAGE_MASK); /* Bits 51:12. */
+    else if ( hvm_pae_enabled(curr) )
+        mask = 0x00000000ffffffe0; /* Bits 31:5. */
+    else
+        mask = (uint32_t)PAGE_MASK; /* Bits 31:12. */
+
+    if ( (curr->arch.hvm_vcpu.guest_cr[3] & mask) !=
+         (currd->arch.hvm_domain.fault_info.address_space & mask) )
+        return 0;
+
+    vmx_vmcs_enter(curr);
+    __vmread(VM_ENTRY_INTR_INFO, &ev);
+    vmx_vmcs_exit(curr);
+
+    if ( (ev & INTR_INFO_VALID_MASK) &&
+         hvm_event_needs_reinjection(MASK_EXTR(ev, INTR_INFO_INTR_TYPE_MASK),
+                                     ev & INTR_INFO_VECTOR_MASK) )
+        pending_event = ev;
+
+    return pending_event == 0;
+}
+
+static void vmx_inject_pf(void)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    uint64_t virtual_address = currd->arch.hvm_domain.fault_info.virtual_address;
+
+    currd->arch.hvm_domain.fault_info.valid = 0;
+
+    hvm_inject_page_fault(currd->arch.hvm_domain.fault_info.errcode |
+                          PFEC_user_mode, virtual_address);
+}
+
 void vmx_vmenter_helper(const struct cpu_user_regs *regs)
 {
     struct vcpu *curr = current;
@@ -3149,6 +3202,9 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(need_flush) )
         vpid_sync_all();
 
+    if ( vmx_check_pf_injection() )
+        vmx_inject_pf();
+
  out:
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index c326aba..841ba37 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -967,6 +967,25 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_set_pagefault_info:
+    {
+        ret = -EINVAL;
+
+        if ( is_hvm_domain(d) )
+        {
+            d->arch.hvm_domain.fault_info.address_space =
+                op->u.set_pagefault_info.address_space;
+            d->arch.hvm_domain.fault_info.virtual_address =
+                op->u.set_pagefault_info.virtual_address;
+            d->arch.hvm_domain.fault_info.errcode =
+                op->u.set_pagefault_info.errcode;
+            d->arch.hvm_domain.fault_info.valid = 1;
+
+            ret = 0;
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 947b061..bb13469 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -141,6 +141,14 @@ struct hvm_domain {
      */
     uint64_t sync_tsc;
 
+    /* Memory introspection page fault injection data. */
+    struct {
+        uint64_t address_space;
+        uint64_t virtual_address;
+        uint32_t errcode;
+        bool_t valid;
+    } fault_info;
+
     union {
         struct vmx_domain vmx;
         struct svm_domain svm;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index a3431ec..e848590 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -937,6 +937,20 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
 #endif
 
+/*
+ * XEN_DOMCTL_set_pagefault_info requests that a page fault occur at
+ * the next VMENTRY.
+ */
+struct xen_domctl_set_pagefault_info {
+    uint64_t address_space;
+    uint64_t virtual_address;
+    uint32_t errcode;
+    uint32_t _pad;
+};
+typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t);
+
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -1009,6 +1023,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_cacheflush                    71
 #define XEN_DOMCTL_get_vcpu_msrs                 72
 #define XEN_DOMCTL_set_vcpu_msrs                 73
+#define XEN_DOMCTL_set_pagefault_info            74
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1069,6 +1084,7 @@ struct xen_domctl {
         struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_set_pagefault_info set_pagefault_info;
         uint8_t                             pad[128];
     } u;
 };
-- 
1.7.9.5

* [PATCH RFC V7 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (2 preceding siblings ...)
  2014-08-13 15:28 ` [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-08-13 15:28 ` Razvan Cojocaru
  2014-08-26 13:56 ` [PATCH RFC V7 1/5] xen: Emulate with no writes Jan Beulich
  4 siblings, 0 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-13 15:28 UTC (permalink / raw)
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, jbeulich, eddie.dong, jun.nakajima

In a scenario where a page fault that triggered a mem_event occurred,
p2m_mem_access_check() will now be able to either 1) emulate the
current instruction, or 2) emulate it without allowing it to perform
any writes.
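
Sketched from the listener's point of view (rsp is the
mem_event_response_t about to be put back on the ring; the flags are
the ones defined in this patch):

    /* Re-run the faulting instruction through the emulator... */
    rsp.flags |= MEM_EVENT_FLAG_EMULATE;

    /* ...optionally discarding any writes it would have performed. */
    rsp.flags |= MEM_EVENT_FLAG_EMULATE_NOWRITE;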

Changes since V1:
 - Removed the 'skip' code which required computing the current
   instruction length.
 - Removed the set_ad_bits() code that attempted to modify the
   'accessed' and 'dirty' bits for instructions that the emulator
   can't handle at the moment.

Changes since V2:
 - Moved the __vmread(EXIT_QUALIFICATION, &exit_qualification); code
   in vmx.c, accessible via hvm_funcs.
 - Incorporated changes by Andrew Cooper ("[PATCH 1/2] Xen/mem_event:
   Validate the response vcpu_id before acting on it.")

Changes since V3:
 - Collapsed verbose lines into a single "else if()".
 - Changed an int to bool_t.
 - Fixed a minor coding style issue.
 - Now computing the first parameter to hvm_emulate_one_full()
   (replaced an "if" with a single call).
 - Added code comments about eip and gla reset (clarity issue).
 - Removed duplicate code by Andrew Cooper (introduced in V2,
   since committed).

Changes since V4:
 - Renamed "exited_by_pagefault" to "exited_by_nested_pagefault".
 - Folded two long lines.
 - Combined two "if" statements.
 - Moved "violation = 0;" to the XENMEM_access_rwx case.

Changes since V5:
 - Fixed indentation.
 - Re-ordered a set of if() conditions.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/domain.c          |    3 ++
 xen/arch/x86/hvm/vmx/vmx.c     |   13 +++++++
 xen/arch/x86/mm/p2m.c          |   84 ++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/domain.h   |    9 +++++
 xen/include/asm-x86/hvm/hvm.h  |    2 +
 xen/include/public/mem_event.h |   12 +++---
 6 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f7e0e78..7b1dfe6 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -415,6 +415,9 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
+    /* By default, do not emulate */
+    v->arch.mem_event.emulate_flags = 0;
+
     rc = mapcache_vcpu_init(v);
     if ( rc )
         return rc;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 45553e2..815eb1a 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1693,6 +1693,18 @@ static void vmx_enable_msr_exit_interception(struct domain *d)
             vmx_enable_intercept_for_msr(v, msrs_exit_array[i], MSR_TYPE_W);
 }
 
+static bool_t vmx_exited_by_nested_pagefault(void)
+{
+    unsigned long exit_qualification;
+
+    __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+    if ( (exit_qualification & EPT_GLA_FAULT) == 0 )
+        return 0;
+
+    return 1;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -1752,6 +1764,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
     .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
     .enable_msr_exit_interception = vmx_enable_msr_exit_interception,
+    .exited_by_nested_pagefault  = vmx_exited_by_nested_pagefault,
 };
 
 const struct hvm_function_table * __init start_vmx(void)
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 41a316a..98b2eba 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1391,6 +1391,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     p2m_access_t p2ma;
     mem_event_request_t *req;
     int rc;
+    unsigned long eip = guest_cpu_user_regs()->eip;
 
     /* First, handle rx2rw conversion automatically.
      * These calls to p2m->set_entry() must succeed: we have the gfn
@@ -1443,6 +1444,36 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
             return 1;
         }
     }
+    else if ( v->arch.mem_event.emulate_flags == 0 &&
+              hvm_funcs.exited_by_nested_pagefault &&
+              !hvm_funcs.exited_by_nested_pagefault() ) /* don't send a mem_event */
+    {
+        v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
+        v->arch.mem_event.gpa = gpa;
+        v->arch.mem_event.eip = eip;
+    }
+
+    /* The previous mem_event reply does not match the current state. */
+    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
+    {
+        /* Don't emulate the current instruction, send a new mem_event. */
+        v->arch.mem_event.emulate_flags = 0;
+
+        /* Make sure to mark the current state to match it again against
+         * the new mem_event about to be sent. */
+        v->arch.mem_event.gpa = gpa;
+        v->arch.mem_event.eip = eip;
+    }
+
+    if ( v->arch.mem_event.emulate_flags )
+    {
+        hvm_emulate_one_full((v->arch.mem_event.emulate_flags &
+                              MEM_EVENT_FLAG_EMULATE_NOWRITE) != 0,
+                             TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+
+        v->arch.mem_event.emulate_flags = 0;
+        return 1;
+    }
 
     *req_ptr = NULL;
     req = xzalloc(mem_event_request_t);
@@ -1495,6 +1526,59 @@ void p2m_mem_access_resume(struct domain *d)
 
         v = d->vcpu[rsp.vcpu_id];
 
+        /* Mark vcpu for skipping one instruction upon rescheduling */
+        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
+        {
+            xenmem_access_t access;
+            bool_t violation = 1;
+
+            v->arch.mem_event.emulate_flags = 0;
+
+            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
+            {
+                switch ( access )
+                {
+                case XENMEM_access_n:
+                case XENMEM_access_n2rwx:
+                default:
+                    violation = rsp.access_r || rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_r:
+                    violation = rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_w:
+                    violation = rsp.access_r || rsp.access_x;
+                    break;
+
+                case XENMEM_access_x:
+                    violation = rsp.access_r || rsp.access_w;
+                    break;
+
+                case XENMEM_access_rx:
+                case XENMEM_access_rx2rw:
+                    violation = rsp.access_w;
+                    break;
+
+                case XENMEM_access_wx:
+                    violation = rsp.access_r;
+                    break;
+
+                case XENMEM_access_rw:
+                    violation = rsp.access_x;
+                    break;
+
+                case XENMEM_access_rwx:
+                    violation = 0;
+                    break;
+                }
+            }
+
+            if ( violation )
+                v->arch.mem_event.emulate_flags = rsp.flags;
+        }
+
         /* Unpause domain */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
             mem_event_vcpu_unpause(v);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 83329ed..440aa81 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -458,6 +458,15 @@ struct arch_vcpu
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
+
+    /* Should we emulate the next matching instruction on VCPU resume
+     * after a mem_event? */
+    struct {
+        uint32_t emulate_flags;
+        unsigned long gpa;
+        unsigned long eip;
+    } mem_event;
+
 } __cacheline_aligned;
 
 smap_check_policy_t smap_policy_change(struct vcpu *v,
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 069ba39..8f0292c 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -207,6 +207,8 @@ struct hvm_function_table {
                                   uint32_t *ecx, uint32_t *edx);
 
     void (*enable_msr_exit_interception)(struct domain *d);
+
+    bool_t (*exited_by_nested_pagefault)(void);
 };
 
 extern struct hvm_function_table hvm_funcs;
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index bbd1636..49f1b84 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -31,11 +31,13 @@
 #include "io/ring.h"
 
 /* Memory event flags */
-#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
-#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
-#define MEM_EVENT_FLAG_EVICT_FAIL   (1 << 2)
-#define MEM_EVENT_FLAG_FOREIGN      (1 << 3)
-#define MEM_EVENT_FLAG_DUMMY        (1 << 4)
+#define MEM_EVENT_FLAG_VCPU_PAUSED     (1 << 0)
+#define MEM_EVENT_FLAG_DROP_PAGE       (1 << 1)
+#define MEM_EVENT_FLAG_EVICT_FAIL      (1 << 2)
+#define MEM_EVENT_FLAG_FOREIGN         (1 << 3)
+#define MEM_EVENT_FLAG_DUMMY           (1 << 4)
+#define MEM_EVENT_FLAG_EMULATE         (1 << 5)
+#define MEM_EVENT_FLAG_EMULATE_NOWRITE (1 << 6)
 
 /* Reasons for the memory event request */
 #define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
-- 
1.7.9.5

* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (3 preceding siblings ...)
  2014-08-13 15:28 ` [PATCH RFC V7 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-08-26 13:56 ` Jan Beulich
  2014-08-26 14:01   ` Razvan Cojocaru
  4 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 13:56 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
> +    unsigned int errcode)
> +{
> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
> +    int rc;
> +
> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
> +
> +    if ( nowrite )
> +        rc = hvm_emulate_one_no_write(&ctx);
> +    else
> +        rc = hvm_emulate_one(&ctx);
> +
> +    switch ( rc )
> +    {
> +    case X86EMUL_UNHANDLEABLE:
> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
> +               ctx.insn_buf_eip,
> +               ctx.insn_buf[0], ctx.insn_buf[1],
> +               ctx.insn_buf[2], ctx.insn_buf[3],
> +               ctx.insn_buf[4], ctx.insn_buf[5],
> +               ctx.insn_buf[6], ctx.insn_buf[7],
> +               ctx.insn_buf[8], ctx.insn_buf[9]);
> +        hvm_inject_hw_exception(trapnr, errcode);
> +        break;
> +    case X86EMUL_EXCEPTION:
> +        if ( ctx.exn_pending )
> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
> +        break;

Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
to the writeback below?

Jan

* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-26 13:56 ` [PATCH RFC V7 1/5] xen: Emulate with no writes Jan Beulich
@ 2014-08-26 14:01   ` Razvan Cojocaru
  2014-08-26 14:19     ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 14:01 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 04:56 PM, Jan Beulich wrote:
>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
>> +    unsigned int errcode)
>> +{
>> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
>> +    int rc;
>> +
>> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
>> +
>> +    if ( nowrite )
>> +        rc = hvm_emulate_one_no_write(&ctx);
>> +    else
>> +        rc = hvm_emulate_one(&ctx);
>> +
>> +    switch ( rc )
>> +    {
>> +    case X86EMUL_UNHANDLEABLE:
>> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
>> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
>> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
>> +               ctx.insn_buf_eip,
>> +               ctx.insn_buf[0], ctx.insn_buf[1],
>> +               ctx.insn_buf[2], ctx.insn_buf[3],
>> +               ctx.insn_buf[4], ctx.insn_buf[5],
>> +               ctx.insn_buf[6], ctx.insn_buf[7],
>> +               ctx.insn_buf[8], ctx.insn_buf[9]);
>> +        hvm_inject_hw_exception(trapnr, errcode);
>> +        break;
>> +    case X86EMUL_EXCEPTION:
>> +        if ( ctx.exn_pending )
>> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
>> +        break;
> 
> Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
> to the writeback below?

Thanks for the review, I did initially loop around hvm_emulate_one()
until rc != X86EMUL_RETRY, but I've been told that that might block
against time calibration rendezvous points.

I'll avoid the fallthrough in the next version of the series.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events
  2014-08-13 15:28 ` [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events Razvan Cojocaru
@ 2014-08-26 14:05   ` Jan Beulich
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 14:05 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
> Changes since V6:
>  - Moved the array of interesting MSRs to common header.

Urgh (see below).

> @@ -695,11 +696,23 @@ static void vmx_set_host_env(struct vcpu *v)
>  void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>  {
>      unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
> +    struct domain *d = v->domain;
>  
>      /* VMX MSR bitmap supported? */
>      if ( msr_bitmap == NULL )
>          return;
>  
> +    if ( mem_event_check_ring(&d->mem_event->access) &&
> +         d->arch.hvm_domain.introspection_enabled )

Reverse the two sides of the && to do the obviously cheap check
first (and maybe wrap that in unlikely())?

> @@ -600,13 +601,25 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
>              rc = mem_event_enable(d, mec, med, _VPF_mem_access, 
>                                      HVM_PARAM_ACCESS_RING_PFN,
>                                      mem_access_notification);
> +
> +            if (  mec->op == XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE )
> +                break;
> +
> +            if ( rc == 0 && hvm_funcs.enable_msr_exit_interception )

No reason not to combine the two if()s afaict.

> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -145,6 +145,8 @@ struct hvm_domain {
>          struct vmx_domain vmx;
>          struct svm_domain svm;
>      };
> +
> +    bool_t introspection_enabled;

Please put this adjacent to the four other bool_t-s a few lines above,
thus filling (rather than creating) a gap.

> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -471,6 +471,15 @@ enum vmcs_field {
>      HOST_RIP                        = 0x00006c16,
>  };
>  
> +static const u32 msrs_exit_array[] = {
> +    MSR_IA32_SYSENTER_EIP,
> +    MSR_IA32_SYSENTER_ESP,
> +    MSR_IA32_SYSENTER_CS,
> +    MSR_IA32_MC0_CTL,
> +    MSR_STAR,
> +    MSR_LSTAR
> +};

So why do you need more than one instance of this in the hypervisor?
This should just be a declaration here, and a definition in one of the
source files. Furthermore its name should start with vmx_.

Jan

* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-13 15:28 ` [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-08-26 14:13   ` Jan Beulich
  2014-08-26 14:24     ` Razvan Cojocaru
  2014-08-27 11:54     ` Razvan Cojocaru
  0 siblings, 2 replies; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 14:13 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
> +static bool_t vmx_check_pf_injection(void)
> +{
> +    struct vcpu *curr = current;
> +    const struct domain *currd = curr->domain;
> +    struct segment_register seg;
> +    unsigned long ev;
> +    uint32_t pending_event = 0;
> +    uint64_t mask;
> +
> +    if ( !is_hvm_domain(currd) ||
> +         likely(!currd->arch.hvm_domain.fault_info.valid) )
> +        return 0;
> +
> +    vmx_get_segment_register(curr, x86_seg_ss, &seg);
> +
> +    if ( seg.attr.fields.dpl != 3 ) /* Guest is not in user mode */
> +        return 0;
> +
> +    if ( hvm_long_mode_enabled(curr) )
> +        mask = (PADDR_MASK & PAGE_MASK); /* Bits 51:12. */

Pointless parentheses.

> +    case XEN_DOMCTL_set_pagefault_info:
> +    {
> +        ret = -EINVAL;
> +
> +        if ( is_hvm_domain(d) )
> +        {
> +            d->arch.hvm_domain.fault_info.address_space =
> +                op->u.set_pagefault_info.address_space;
> +            d->arch.hvm_domain.fault_info.virtual_address =
> +                op->u.set_pagefault_info.virtual_address;
> +            d->arch.hvm_domain.fault_info.errcode =
> +                op->u.set_pagefault_info.errcode;
> +            d->arch.hvm_domain.fault_info.valid = 1;
> +
> +            ret = 0;
> +        }
> +    }

Pointless curly braces.

> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -141,6 +141,14 @@ struct hvm_domain {
>       */
>      uint64_t sync_tsc;
>  
> +    /* Memory introspection page fault injection data. */
> +    struct {
> +        uint64_t address_space;
> +        uint64_t virtual_address;
> +        uint32_t errcode;
> +        bool_t valid;
> +    } fault_info;

Sorry for noticing this only now, but how can this be a per-domain
thing rather than a per-vCPU one?

Jan


* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-26 14:01   ` Razvan Cojocaru
@ 2014-08-26 14:19     ` Jan Beulich
  2014-08-26 14:30       ` Razvan Cojocaru
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 14:19 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 26.08.14 at 16:01, <rcojocaru@bitdefender.com> wrote:
> On 08/26/2014 04:56 PM, Jan Beulich wrote:
>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
>>> +    unsigned int errcode)
>>> +{
>>> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
>>> +    int rc;
>>> +
>>> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
>>> +
>>> +    if ( nowrite )
>>> +        rc = hvm_emulate_one_no_write(&ctx);
>>> +    else
>>> +        rc = hvm_emulate_one(&ctx);
>>> +
>>> +    switch ( rc )
>>> +    {
>>> +    case X86EMUL_UNHANDLEABLE:
>>> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
>>> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
>>> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
>>> +               ctx.insn_buf_eip,
>>> +               ctx.insn_buf[0], ctx.insn_buf[1],
>>> +               ctx.insn_buf[2], ctx.insn_buf[3],
>>> +               ctx.insn_buf[4], ctx.insn_buf[5],
>>> +               ctx.insn_buf[6], ctx.insn_buf[7],
>>> +               ctx.insn_buf[8], ctx.insn_buf[9]);
>>> +        hvm_inject_hw_exception(trapnr, errcode);
>>> +        break;
>>> +    case X86EMUL_EXCEPTION:
>>> +        if ( ctx.exn_pending )
>>> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
>>> +        break;
>> 
>> Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
>> to the writeback below?
> 
> Thanks for the review, I did initially loop around hvm_emulate_one()
> until rc != X86EMUL_RETRY, but I've been told that that might block
> against time calibration rendezvous points.

In any event it strikes me as odd that you ignore that state
altogether rather than propagating it back up, so that someone
in suitable position to do the retry can invoke it.

Jan


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 14:13   ` Jan Beulich
@ 2014-08-26 14:24     ` Razvan Cojocaru
  2014-08-26 14:44       ` Jan Beulich
  2014-08-27 11:54     ` Razvan Cojocaru
  1 sibling, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 14:24 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>> --- a/xen/include/asm-x86/hvm/domain.h
>> +++ b/xen/include/asm-x86/hvm/domain.h
>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>       */
>>      uint64_t sync_tsc;
>>  
>> +    /* Memory introspection page fault injection data. */
>> +    struct {
>> +        uint64_t address_space;
>> +        uint64_t virtual_address;
>> +        uint32_t errcode;
>> +        bool_t valid;
>> +    } fault_info;
> 
> Sorry for noticing this only now, but how can this be a per-domain
> thing rather than a per-vCPU one?

The requirement for our introspection application has simply been to
bring back in a swapped-out page, regardless of what VCPU ends up
actually doing it.


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-26 14:19     ` Jan Beulich
@ 2014-08-26 14:30       ` Razvan Cojocaru
  2014-08-26 14:40         ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 14:30 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 05:19 PM, Jan Beulich wrote:
>>>> On 26.08.14 at 16:01, <rcojocaru@bitdefender.com> wrote:
>> On 08/26/2014 04:56 PM, Jan Beulich wrote:
>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
>>>> +    unsigned int errcode)
>>>> +{
>>>> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
>>>> +    int rc;
>>>> +
>>>> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
>>>> +
>>>> +    if ( nowrite )
>>>> +        rc = hvm_emulate_one_no_write(&ctx);
>>>> +    else
>>>> +        rc = hvm_emulate_one(&ctx);
>>>> +
>>>> +    switch ( rc )
>>>> +    {
>>>> +    case X86EMUL_UNHANDLEABLE:
>>>> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
>>>> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
>>>> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
>>>> +               ctx.insn_buf_eip,
>>>> +               ctx.insn_buf[0], ctx.insn_buf[1],
>>>> +               ctx.insn_buf[2], ctx.insn_buf[3],
>>>> +               ctx.insn_buf[4], ctx.insn_buf[5],
>>>> +               ctx.insn_buf[6], ctx.insn_buf[7],
>>>> +               ctx.insn_buf[8], ctx.insn_buf[9]);
>>>> +        hvm_inject_hw_exception(trapnr, errcode);
>>>> +        break;
>>>> +    case X86EMUL_EXCEPTION:
>>>> +        if ( ctx.exn_pending )
>>>> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
>>>> +        break;
>>>
>>> Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
>>> to the writeback below?
>>
>> Thanks for the review, I did initially loop around hvm_emulate_one()
>> until rc != X86EMUL_RETRY, but I've been told that that might block
>> against time calibration rendezvous points.
> 
> In any event it strikes me as odd that you ignore that state
> altogether rather than propagating it back up, so that someone
> in suitable position to do the retry can invoke it.

Since it's being called in the context of handling a mem_event response,
the X86EMUL_RETRY case would lead to a retry anyway (since we couldn't
emulate the current instruction, and we haven't lifted the page access
restrictions). So if we've failed to somehow modify the guest's EIP, the
instruction will hit the page again, cause a new mem_event and a new
attempt to emulate it - so that would seem to fit with the spirit of
X86EMUL_RETRY.


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-26 14:30       ` Razvan Cojocaru
@ 2014-08-26 14:40         ` Jan Beulich
  2014-08-26 14:45           ` Razvan Cojocaru
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 14:40 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 26.08.14 at 16:30, <rcojocaru@bitdefender.com> wrote:
> On 08/26/2014 05:19 PM, Jan Beulich wrote:
>>>>> On 26.08.14 at 16:01, <rcojocaru@bitdefender.com> wrote:
>>> On 08/26/2014 04:56 PM, Jan Beulich wrote:
>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>>> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
>>>>> +    unsigned int errcode)
>>>>> +{
>>>>> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
>>>>> +    int rc;
>>>>> +
>>>>> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
>>>>> +
>>>>> +    if ( nowrite )
>>>>> +        rc = hvm_emulate_one_no_write(&ctx);
>>>>> +    else
>>>>> +        rc = hvm_emulate_one(&ctx);
>>>>> +
>>>>> +    switch ( rc )
>>>>> +    {
>>>>> +    case X86EMUL_UNHANDLEABLE:
>>>>> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
>>>>> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
>>>>> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
>>>>> +               ctx.insn_buf_eip,
>>>>> +               ctx.insn_buf[0], ctx.insn_buf[1],
>>>>> +               ctx.insn_buf[2], ctx.insn_buf[3],
>>>>> +               ctx.insn_buf[4], ctx.insn_buf[5],
>>>>> +               ctx.insn_buf[6], ctx.insn_buf[7],
>>>>> +               ctx.insn_buf[8], ctx.insn_buf[9]);
>>>>> +        hvm_inject_hw_exception(trapnr, errcode);
>>>>> +        break;
>>>>> +    case X86EMUL_EXCEPTION:
>>>>> +        if ( ctx.exn_pending )
>>>>> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
>>>>> +        break;
>>>>
>>>> Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
>>>> to the writeback below?
>>>
>>> Thanks for the review, I did initially loop around hvm_emulate_one()
>>> until rc != X86EMUL_RETRY, but I've been told that that might block
>>> against time calibration rendezvous points.
>> 
>> In any event it strikes me as odd that you ignore that state
>> altogether rather than propagating it back up, so that someone
>> in suitable position to do the retry can invoke it.
> 
> Since it's being called in the context of handling a mem_event response,
> the X86EMUL_RETRY case would lead to a retry anyway (since we couldn't
> emulate the current instruction, and we haven't lifted the page access
> restrictions). So if we've failed to somehow modify the guest's EIP, the
> instruction will hit the page again, cause a new mem_event and a new
> attempt to emulate it - so that would seem to fit with the spirit of
> X86EMUL_RETRY.

Makes sense. Please add a brief comment to this effect when you
add this specific case (bailing without writeback). One thing to
consider though is which function you're in: Based on its name it
has no connection to the specific mem-access use, and hence - with
the behavior you intend to have here not being generically usable -
renaming the function may be a good idea.
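
A sketch of the added case (comment wording is illustrative; the
function is void, so returning early skips the writeback at the end):

    switch ( rc )
    {
    case X86EMUL_RETRY:
        /*
         * The guest will re-execute the instruction, hit the still
         * restricted page and produce a fresh mem_event, so the retry
         * happens naturally: bail without writeback.
         */
        return;
    /* ... other cases as before ... */
    }

    hvm_emulate_writeback(&ctx);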

Jan


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 14:24     ` Razvan Cojocaru
@ 2014-08-26 14:44       ` Jan Beulich
  2014-08-26 14:56         ` Razvan Cojocaru
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 14:44 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>> --- a/xen/include/asm-x86/hvm/domain.h
>>> +++ b/xen/include/asm-x86/hvm/domain.h
>>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>>       */
>>>      uint64_t sync_tsc;
>>>  
>>> +    /* Memory introspection page fault injection data. */
>>> +    struct {
>>> +        uint64_t address_space;
>>> +        uint64_t virtual_address;
>>> +        uint32_t errcode;
>>> +        bool_t valid;
>>> +    } fault_info;
>> 
>> Sorry for noticing this only now, but how can this be a per-domain
>> thing rather than a per-vCPU one?
> 
> The requirement for our introspection application has simply been to
> bring back in a swapped-out page, regardless of what VCPU ends up
> actually doing it.

But please remember that what you add to the public code base
shouldn't be tied to specific needs of your application, it should
be coded in a generally useful way.

Furthermore, how would this work if you have 2 vCPU-s hit such
a condition, and you need to bring in 2 pages in parallel?

Jan


* Re: [PATCH RFC V7 1/5] xen: Emulate with no writes
  2014-08-26 14:40         ` Jan Beulich
@ 2014-08-26 14:45           ` Razvan Cojocaru
  0 siblings, 0 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 14:45 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 05:40 PM, Jan Beulich wrote:
>>>> On 26.08.14 at 16:30, <rcojocaru@bitdefender.com> wrote:
>> On 08/26/2014 05:19 PM, Jan Beulich wrote:
>>>>>> On 26.08.14 at 16:01, <rcojocaru@bitdefender.com> wrote:
>>>> On 08/26/2014 04:56 PM, Jan Beulich wrote:
>>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>>>> +void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
>>>>>> +    unsigned int errcode)
>>>>>> +{
>>>>>> +    struct hvm_emulate_ctxt ctx = {{ 0 }};
>>>>>> +    int rc;
>>>>>> +
>>>>>> +    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
>>>>>> +
>>>>>> +    if ( nowrite )
>>>>>> +        rc = hvm_emulate_one_no_write(&ctx);
>>>>>> +    else
>>>>>> +        rc = hvm_emulate_one(&ctx);
>>>>>> +
>>>>>> +    switch ( rc )
>>>>>> +    {
>>>>>> +    case X86EMUL_UNHANDLEABLE:
>>>>>> +        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
>>>>>> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
>>>>>> +               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
>>>>>> +               ctx.insn_buf_eip,
>>>>>> +               ctx.insn_buf[0], ctx.insn_buf[1],
>>>>>> +               ctx.insn_buf[2], ctx.insn_buf[3],
>>>>>> +               ctx.insn_buf[4], ctx.insn_buf[5],
>>>>>> +               ctx.insn_buf[6], ctx.insn_buf[7],
>>>>>> +               ctx.insn_buf[8], ctx.insn_buf[9]);
>>>>>> +        hvm_inject_hw_exception(trapnr, errcode);
>>>>>> +        break;
>>>>>> +    case X86EMUL_EXCEPTION:
>>>>>> +        if ( ctx.exn_pending )
>>>>>> +            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
>>>>>> +        break;
>>>>>
>>>>> Shouldn't you act on X86EMUL_RETRY here? Or at least not fall through
>>>>> to the writeback below?
>>>>
>>>> Thanks for the review, I did initially loop around hvm_emulate_one()
>>>> until rc != X86EMUL_RETRY, but I've been told that that might block
>>>> against time calibration rendezvous points.
>>>
>>> In any event it strikes me as odd that you ignore that state
>>> altogether rather than propagating it back up, so that someone
>>> in suitable position to do the retry can invoke it.
>>
>> Since it's being called in the context of handling a mem_event response,
>> the X86EMUL_RETRY case would lead to a retry anyway (since we couldn't
>> emulate the current instruction, and we haven't lifted the page access
>> restrictions). So if we've failed to somehow modify the guest's EIP, the
>> instruction will hit the page again, cause a new mem_event and a new
>> attempt to emulate it - so that would seem to fit with the spirit of
>> X86EMUL_RETRY.
> 
> Makes sense. Please add a brief comment to this effect when you
> add this specific case (bailing without writeback). One thing to
> consider though is which function you're in: Based on its name it
> has no connection to the specific mem-access use, and hence - with
> the behavior you intend to have here not being generically usable -
> renaming the function may be a good idea.

Will do, thank you very much for your comments!


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 14:44       ` Jan Beulich
@ 2014-08-26 14:56         ` Razvan Cojocaru
  2014-08-26 15:49           ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 14:56 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 05:44 PM, Jan Beulich wrote:
>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>> --- a/xen/include/asm-x86/hvm/domain.h
>>>> +++ b/xen/include/asm-x86/hvm/domain.h
>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>>>       */
>>>>      uint64_t sync_tsc;
>>>>  
>>>> +    /* Memory introspection page fault injection data. */
>>>> +    struct {
>>>> +        uint64_t address_space;
>>>> +        uint64_t virtual_address;
>>>> +        uint32_t errcode;
>>>> +        bool_t valid;
>>>> +    } fault_info;
>>>
>>> Sorry for noticing this only now, but how can this be a per-domain
>>> thing rather than a per-vCPU one?
>>
>> The requirement for our introspection application has simply been to
>> bring back in a swapped-out page, regardless of what VCPU ends up
>> actually doing it.
> 
> But please remember that what you add to the public code base
> shouldn't be tied to specific needs of your application, it should
> be coded in a generally useful way.

Of course, perhaps I should have written "the scenario we're working
with" rather than "the requirement for our application". I'm just trying
to understand all the usual cases for this.

> Furthermore, how would this work if you have 2 vCPU-s hit such
> a condition, and you need to bring in 2 pages in parallel?

Since this is all happening in the context of processing mem_events,
it's not really possible for two VCPUs to need to do this in parallel,
since processing mem_events is being done sequentially. A VCPU needs to
put a mem_event in the ring buffer and pause before this hypercall can
be called from userspace.


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 14:56         ` Razvan Cojocaru
@ 2014-08-26 15:49           ` Jan Beulich
  2014-08-26 16:59             ` Razvan Cojocaru
  2014-08-28 13:15             ` Tim Deegan
  0 siblings, 2 replies; 25+ messages in thread
From: Jan Beulich @ 2014-08-26 15:49 UTC (permalink / raw)
  To: Razvan Cojocaru, Tim Deegan
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 26.08.14 at 16:56, <rcojocaru@bitdefender.com> wrote:
> On 08/26/2014 05:44 PM, Jan Beulich wrote:
>>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
>>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>>> --- a/xen/include/asm-x86/hvm/domain.h
>>>>> +++ b/xen/include/asm-x86/hvm/domain.h
>>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>>>>       */
>>>>>      uint64_t sync_tsc;
>>>>>  
>>>>> +    /* Memory introspection page fault injection data. */
>>>>> +    struct {
>>>>> +        uint64_t address_space;
>>>>> +        uint64_t virtual_address;
>>>>> +        uint32_t errcode;
>>>>> +        bool_t valid;
>>>>> +    } fault_info;
>>>>
>>>> Sorry for noticing this only now, but how can this be a per-domain
>>>> thing rather than a per-vCPU one?
>>>
>>> The requirement for our introspection application has simply been to
>>> bring back in a swapped-out page, regardless of what VCPU ends up
>>> actually doing it.
>> 
>> But please remember that what you add to the public code base
>> shouldn't be tied to specific needs of your application, it should
>> be coded in a generally useful way.
> 
> Of course, perhaps I should have written "the scenario we're working
> with" rather than "the requirement for our application". I'm just trying
> to understand all the usual cases for this.
> 
>> Furthermore, how would this work if you have 2 vCPU-s hit such
>> a condition, and you need to bring in 2 pages in parallel?
> 
> Since this is all happening in the context of processing mem_events,
> it's not really possible for two VCPUs to need to do this in parallel,
> since processing mem_events is being done sequentially. A VCPU needs to
> put a mem_event in the ring buffer and pause before this hypercall can
> be called from userspace.

I'd certainly want to hear Tim's opinion here before settling on
either model. Considering that this is at least mem-event related,
it's slightly odd you didn't copy him in the first place.

Jan


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 15:49           ` Jan Beulich
@ 2014-08-26 16:59             ` Razvan Cojocaru
  2014-08-27  0:54               ` Tian, Kevin
  2014-08-28 13:15             ` Tim Deegan
  1 sibling, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-26 16:59 UTC (permalink / raw)
  To: Jan Beulich, Tim Deegan
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/14 18:49, Jan Beulich wrote:
>>>> On 26.08.14 at 16:56, <rcojocaru@bitdefender.com> wrote:
>> On 08/26/2014 05:44 PM, Jan Beulich wrote:
>>>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
>>>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>>>> --- a/xen/include/asm-x86/hvm/domain.h
>>>>>> +++ b/xen/include/asm-x86/hvm/domain.h
>>>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>>>>>       */
>>>>>>      uint64_t sync_tsc;
>>>>>>  
>>>>>> +    /* Memory introspection page fault injection data. */
>>>>>> +    struct {
>>>>>> +        uint64_t address_space;
>>>>>> +        uint64_t virtual_address;
>>>>>> +        uint32_t errcode;
>>>>>> +        bool_t valid;
>>>>>> +    } fault_info;
>>>>>
>>>>> Sorry for noticing this only now, but how can this be a per-domain
>>>>> thing rather than a per-vCPU one?
>>>>
>>>> The requirement for our introspection application has simply been to
>>>> bring back in a swapped-out page, regardless of what VCPU ends up
>>>> actually doing it.
>>>
>>> But please remember that what you add to the public code base
>>> shouldn't be tied to specific needs of your application, it should
>>> be coded in a generally useful way.
>>
>> Of course, perhaps I should have written "the scenario we're working
>> with" rather than "the requirement for our application". I'm just trying
>> to understand all the usual cases for this.
>>
>>> Furthermore, how would this work if you have 2 vCPU-s hit such
>>> a condition, and you need to bring in 2 pages in parallel?
>>
>> Since this is all happening in the context of processing mem_events,
>> it's not really possible for two VCPUs to need to do this in parallel,
>> since processing mem_events is being done sequentially. A VCPU needs to
>> put a mem_event in the ring buffer and pause before this hypercall can
>> be called from userspace.
> 
> I'd certainly want to hear Tim's opinion here before settling on
> either model. Considering that this is at least mem-event related,
> it's slightly odd you didn't copy him in the first place.

Sorry about that, scripts/get_maintainer.pl did not list him and I
forgot to CC him.


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 16:59             ` Razvan Cojocaru
@ 2014-08-27  0:54               ` Tian, Kevin
  2014-08-27  6:58                 ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Tian, Kevin @ 2014-08-27  0:54 UTC (permalink / raw)
  To: Razvan Cojocaru, Jan Beulich, Tim Deegan
  Cc: ian.campbell, stefano.stabellini, andrew.cooper3, Dong, Eddie,
	xen-devel, Nakajima, Jun, ian.jackson

> From: Razvan Cojocaru [mailto:rcojocaru@bitdefender.com]
> Sent: Tuesday, August 26, 2014 9:59 AM
> 
> On 08/26/14 18:49, Jan Beulich wrote:
> >>>> On 26.08.14 at 16:56, <rcojocaru@bitdefender.com> wrote:
> >> On 08/26/2014 05:44 PM, Jan Beulich wrote:
> >>>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
> >>>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
> >>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
> >>>>>> --- a/xen/include/asm-x86/hvm/domain.h
> >>>>>> +++ b/xen/include/asm-x86/hvm/domain.h
> >>>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
> >>>>>>       */
> >>>>>>      uint64_t sync_tsc;
> >>>>>>
> >>>>>> +    /* Memory introspection page fault injection data. */
> >>>>>> +    struct {
> >>>>>> +        uint64_t address_space;
> >>>>>> +        uint64_t virtual_address;
> >>>>>> +        uint32_t errcode;
> >>>>>> +        bool_t valid;
> >>>>>> +    } fault_info;
> >>>>>
> >>>>> Sorry for noticing this only now, but how can this be a per-domain
> >>>>> thing rather than a per-vCPU one?
> >>>>
> >>>> The requirement for our introspection application has simply been to
> >>>> bring back in a swapped-out page, regardless of what VCPU ends up
> >>>> actually doing it.
> >>>
> >>> But please remember that what you add to the public code base
> >>> shouldn't be tied to specific needs of your application, it should
> >>> be coded in a generally useful way.
> >>
> >> Of course, perhaps I should have written "the scenario we're working
> >> with" rather than "the requirement for our application". I'm just trying
> >> to understand all the usual cases for this.
> >>
> >>> Furthermore, how would this work if you have 2 vCPU-s hit such
> >>> a condition, and you need to bring in 2 pages in parallel?
> >>
> >> Since this is all happening in the context of processing mem_events,
> >> it's not really possible for two VCPUs to need to do this in parallel,
> >> since processing mem_events is being done sequentially. A VCPU needs to
> >> put a mem_event in the ring buffer and pause before this hypercall can
> >> be called from userspace.
> >
> > I'd certainly want to hear Tim's opinion here before settling on
> > either model. Considering that this is at least mem-event related,
> > it's slightly odd you didn't copy him in the first place.
> 
> Sorry about that, scripts/get_maintainter.pl did not list him and I
> forgot to CC him.
> 
> 

From the code, this info seems to be a condition for PF injection rather
than a record of per-VCPU faulting information. So it looks OK as a
per-domain structure, but the structure name 'fault_info' is too generic...

Thanks
Kevin


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-27  0:54               ` Tian, Kevin
@ 2014-08-27  6:58                 ` Jan Beulich
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Beulich @ 2014-08-27  6:58 UTC (permalink / raw)
  To: Razvan Cojocaru, Kevin Tian, Tim Deegan
  Cc: ian.campbell, stefano.stabellini, andrew.cooper3, Eddie Dong,
	xen-devel, Jun Nakajima, ian.jackson

>>> On 27.08.14 at 02:54, <kevin.tian@intel.com> wrote:
> From code seems this info is a condition for PF injection, instead of
> recording VCPU faulting information. So it looks OK to be a per-domain
> structure, but the structure name 'fault_info' is too generic...

Not really, because it also makes assumptions about the guest OS.
Which I'd consider okay for a particular end product like the one
BitDefender is creating, but not for an interface. In particular here
the assumption is that you can inject a fault on an arbitrary vCPU
to achieve a certain goal, which certainly isn't valid without knowing
how the OS works.

But yes, with the naming changed to reflect the special purpose,
this might become less of an issue. Pretending (or making it appear)
that something is general purpose when it isn't, however, would
only be setting us up for future problems.

An alternative might be to design the external interface in a way
being vCPU-centric, with a way to specify a "wildcard" to serve the
particular purpose here.
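
As a sketch of that alternative (the vcpu field and the wildcard value
are hypothetical, not an existing Xen interface):

    #define XEN_DOMCTL_PF_INFO_VCPU_ANY (~0U)  /* any vCPU may satisfy it */

    struct xen_domctl_set_pagefault_info {
        uint32_t vcpu_id;          /* target vCPU, or ..._VCPU_ANY */
        uint64_t address_space;
        uint64_t virtual_address;
        uint32_t errcode;
    };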

Jan


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 14:13   ` Jan Beulich
  2014-08-26 14:24     ` Razvan Cojocaru
@ 2014-08-27 11:54     ` Razvan Cojocaru
  2014-08-27 12:10       ` Jan Beulich
  1 sibling, 1 reply; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-27 11:54 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>> +    case XEN_DOMCTL_set_pagefault_info:
>> +    {
>> +        ret = -EINVAL;
>> +
>> +        if ( is_hvm_domain(d) )
>> +        {
>> +            d->arch.hvm_domain.fault_info.address_space =
>> +                op->u.set_pagefault_info.address_space;
>> +            d->arch.hvm_domain.fault_info.virtual_address =
>> +                op->u.set_pagefault_info.virtual_address;
>> +            d->arch.hvm_domain.fault_info.errcode =
>> +                op->u.set_pagefault_info.errcode;
>> +            d->arch.hvm_domain.fault_info.valid = 1;
>> +
>> +            ret = 0;
>> +        }
>> +    }
> 
> Pointless curly braces.

You're right, of course, but I've written it like that because that
seems to be the style (even where it is not necessary / no local
variables are introduced) in
do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl).

Should I break with the coding style for this switch case?


Thanks,
Razvan Cojocaru


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-27 11:54     ` Razvan Cojocaru
@ 2014-08-27 12:10       ` Jan Beulich
  2014-08-27 12:15         ` Razvan Cojocaru
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-08-27 12:10 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 27.08.14 at 13:54, <rcojocaru@bitdefender.com> wrote:
> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>> +    case XEN_DOMCTL_set_pagefault_info:
>>> +    {
>>> +        ret = -EINVAL;
>>> +
>>> +        if ( is_hvm_domain(d) )
>>> +        {
>>> +            d->arch.hvm_domain.fault_info.address_space =
>>> +                op->u.set_pagefault_info.address_space;
>>> +            d->arch.hvm_domain.fault_info.virtual_address =
>>> +                op->u.set_pagefault_info.virtual_address;
>>> +            d->arch.hvm_domain.fault_info.errcode =
>>> +                op->u.set_pagefault_info.errcode;
>>> +            d->arch.hvm_domain.fault_info.valid = 1;
>>> +
>>> +            ret = 0;
>>> +        }
>>> +    }
>> 
>> Pointless curly braces.
> 
> You're right, of course, but I've written it like that because that
> seems to be the style (even where it is not necessary / no local
> variables are introduced) in
> do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl).
> 
> Should I break with the coding style for this switch case?

Neither do_domctl() nor x86's arch_do_domctl() really follows that
style consistently. Hence I think rather than continuing the bad habit,
making new additions do better is the right approach. (As to why
I really think this isn't just a cosmetic thing: These braces usually
get placed at the same indentation level as the containing switch
statement's, breaking consistent indentation, potentially leading to
two immediately successive closing braces at the same indentation
level.)
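
For comparison, the brace-free form of the case in question (the
trailing break is assumed, as the hunk does not show it):

    case XEN_DOMCTL_set_pagefault_info:
        ret = -EINVAL;
        if ( is_hvm_domain(d) )
        {
            d->arch.hvm_domain.fault_info.address_space =
                op->u.set_pagefault_info.address_space;
            d->arch.hvm_domain.fault_info.virtual_address =
                op->u.set_pagefault_info.virtual_address;
            d->arch.hvm_domain.fault_info.errcode =
                op->u.set_pagefault_info.errcode;
            d->arch.hvm_domain.fault_info.valid = 1;
            ret = 0;
        }
        break;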

Jan


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-27 12:10       ` Jan Beulich
@ 2014-08-27 12:15         ` Razvan Cojocaru
  0 siblings, 0 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-27 12:15 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/27/2014 03:10 PM, Jan Beulich wrote:
>>>> On 27.08.14 at 13:54, <rcojocaru@bitdefender.com> wrote:
>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>> +    case XEN_DOMCTL_set_pagefault_info:
>>>> +    {
>>>> +        ret = -EINVAL;
>>>> +
>>>> +        if ( is_hvm_domain(d) )
>>>> +        {
>>>> +            d->arch.hvm_domain.fault_info.address_space =
>>>> +                op->u.set_pagefault_info.address_space;
>>>> +            d->arch.hvm_domain.fault_info.virtual_address =
>>>> +                op->u.set_pagefault_info.virtual_address;
>>>> +            d->arch.hvm_domain.fault_info.errcode =
>>>> +                op->u.set_pagefault_info.errcode;
>>>> +            d->arch.hvm_domain.fault_info.valid = 1;
>>>> +
>>>> +            ret = 0;
>>>> +        }
>>>> +    }
>>>
>>> Pointless curly braces.
>>
>> You're right, of course, but I've written it like that because that
>> seems to be the style (even where it is not necessary / no local
>> variables are introduced) in
>> do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl).
>>
>> Should I break with the coding style for this switch case?
> 
> Neither do_domctl() nor x86's arch_do_domctl() really consistently
> do like you say. Hence I think rather than continuing the bad habit,
> making new additions do better is the right approach. (As to why
> I really think this isn't just a cosmetic thing: These braces usually
> get placed at the same indentation level as the containing switch
> statement's, breaking consistent indentation, potentially leading to
> two immediately successive closing braces at the same indentation
> level.)

I understand, thanks for the reply! I'll change the code.


Razvan Cojocaru


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-26 15:49           ` Jan Beulich
  2014-08-26 16:59             ` Razvan Cojocaru
@ 2014-08-28 13:15             ` Tim Deegan
  2014-08-28 13:19               ` Razvan Cojocaru
  1 sibling, 1 reply; 25+ messages in thread
From: Tim Deegan @ 2014-08-28 13:15 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, eddie.dong, xen-devel, jun.nakajima, ian.jackson

At 16:49 +0100 on 26 Aug (1409068141), Jan Beulich wrote:
> >>> On 26.08.14 at 16:56, <rcojocaru@bitdefender.com> wrote:
> > On 08/26/2014 05:44 PM, Jan Beulich wrote:
> >>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
> >>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
> >>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
> >>>>> --- a/xen/include/asm-x86/hvm/domain.h
> >>>>> +++ b/xen/include/asm-x86/hvm/domain.h
> >>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
> >>>>>       */
> >>>>>      uint64_t sync_tsc;
> >>>>>  
> >>>>> +    /* Memory introspection page fault injection data. */
> >>>>> +    struct {
> >>>>> +        uint64_t address_space;
> >>>>> +        uint64_t virtual_address;
> >>>>> +        uint32_t errcode;
> >>>>> +        bool_t valid;
> >>>>> +    } fault_info;
> >>>>
> >>>> Sorry for noticing this only now, but how can this be a per-domain
> >>>> thing rather than a per-vCPU one?
> >>>
> >>> The requirement for our introspection application has simply been to
> >>> bring back in a swapped-out page, regardless of what VCPU ends up
> >>> actually doing it.
> >> 
> >> But please remember that what you add to the public code base
> >> shouldn't be tied to specific needs of your application, it should
> >> be coded in a generally useful way.
> > 
> > Of course, perhaps I should have written "the scenario we're working
> > with" rather than "the requirement for our application". I'm just trying
> > to understand all the usual cases for this.
> > 
> >> Furthermore, how would this work if you have 2 vCPU-s hit such
> >> a condition, and you need to bring in 2 pages in parallel?
> > 
> > Since this is all happening in the context of processing mem_events,
> > it's not really possible for two VCPUs to need to do this in parallel,
> > since processing mem_events is being done sequentially. A VCPU needs to
> > put a mem_event in the ring buffer and pause before this hypercall can
> > be called from userspace.
> 
> I'd certainly want to hear Tim's opinion here before settling on
> either model.

Yes, I think it's much better to have this be a per-vcpu operation; I
see that's the way it's going already in later versions.

Tim.


* Re: [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-28 13:15             ` Tim Deegan
@ 2014-08-28 13:19               ` Razvan Cojocaru
  0 siblings, 0 replies; 25+ messages in thread
From: Razvan Cojocaru @ 2014-08-28 13:19 UTC (permalink / raw)
  To: Tim Deegan, Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/28/2014 04:15 PM, Tim Deegan wrote:
> At 16:49 +0100 on 26 Aug (1409068141), Jan Beulich wrote:
>>>>> On 26.08.14 at 16:56, <rcojocaru@bitdefender.com> wrote:
>>> On 08/26/2014 05:44 PM, Jan Beulich wrote:
>>>>>>> On 26.08.14 at 16:24, <rcojocaru@bitdefender.com> wrote:
>>>>> On 08/26/2014 05:13 PM, Jan Beulich wrote:
>>>>>>>>> On 13.08.14 at 17:28, <rcojocaru@bitdefender.com> wrote:
>>>>>>> --- a/xen/include/asm-x86/hvm/domain.h
>>>>>>> +++ b/xen/include/asm-x86/hvm/domain.h
>>>>>>> @@ -141,6 +141,14 @@ struct hvm_domain {
>>>>>>>       */
>>>>>>>      uint64_t sync_tsc;
>>>>>>>  
>>>>>>> +    /* Memory introspection page fault injection data. */
>>>>>>> +    struct {
>>>>>>> +        uint64_t address_space;
>>>>>>> +        uint64_t virtual_address;
>>>>>>> +        uint32_t errcode;
>>>>>>> +        bool_t valid;
>>>>>>> +    } fault_info;
>>>>>>
>>>>>> Sorry for noticing this only now, but how can this be a per-domain
>>>>>> thing rather than a per-vCPU one?
>>>>>
>>>>> The requirement for our introspection application has simply been to
>>>>> bring back in a swapped-out page, regardless of what VCPU ends up
>>>>> actually doing it.
>>>>
>>>> But please remember that what you add to the public code base
>>>> shouldn't be tied to specific needs of your application, it should
>>>> be coded in a generally useful way.
>>>
>>> Of course, perhaps I should have written "the scenario we're working
>>> with" rather than "the requirement for our application". I'm just trying
>>> to understand all the usual cases for this.
>>>
>>>> Furthermore, how would this work if you have 2 vCPU-s hit such
>>>> a condition, and you need to bring in 2 pages in parallel?
>>>
>>> Since this is all happening in the context of processing mem_events,
>>> it's not really possible for two VCPUs to need to do this in parallel,
>>> since processing mem_events is being done sequentially. A VCPU needs to
>>> put a mem_event in the ring buffer and pause before this hypercall can
>>> be called from userspace.
>>
>> I'd certainly want to hear Tim's opinion here before settling on
>> either model.
> 
> Yes, I think it's much better to have this be a per-vcpu operation; I
> see that's the way it's going already in later versions.

I've just reverted that change in V10 after discussing it with Jan. :)

I'll check in more detail whether it can be done per-VCPU with no
ill effects on our side, and if so I'll come back to the per-VCPU
implementation.


Thanks,
Razvan Cojocaru


Thread overview: 25+ messages
2014-08-13 15:28 [PATCH RFC V7 1/5] xen: Emulate with no writes Razvan Cojocaru
2014-08-13 15:28 ` [PATCH RFC V7 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
2014-08-13 15:28 ` [PATCH RFC V7 3/5] xen, libxc: Force-enable relevant MSR events Razvan Cojocaru
2014-08-26 14:05   ` Jan Beulich
2014-08-13 15:28 ` [PATCH RFC V7 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
2014-08-26 14:13   ` Jan Beulich
2014-08-26 14:24     ` Razvan Cojocaru
2014-08-26 14:44       ` Jan Beulich
2014-08-26 14:56         ` Razvan Cojocaru
2014-08-26 15:49           ` Jan Beulich
2014-08-26 16:59             ` Razvan Cojocaru
2014-08-27  0:54               ` Tian, Kevin
2014-08-27  6:58                 ` Jan Beulich
2014-08-28 13:15             ` Tim Deegan
2014-08-28 13:19               ` Razvan Cojocaru
2014-08-27 11:54     ` Razvan Cojocaru
2014-08-27 12:10       ` Jan Beulich
2014-08-27 12:15         ` Razvan Cojocaru
2014-08-13 15:28 ` [PATCH RFC V7 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
2014-08-26 13:56 ` [PATCH RFC V7 1/5] xen: Emulate with no writes Jan Beulich
2014-08-26 14:01   ` Razvan Cojocaru
2014-08-26 14:19     ` Jan Beulich
2014-08-26 14:30       ` Razvan Cojocaru
2014-08-26 14:40         ` Jan Beulich
2014-08-26 14:45           ` Razvan Cojocaru
