* [PATCH RFC V4 1/5] xen: Emulate with no writes
@ 2014-08-04 11:30 Razvan Cojocaru
  2014-08-04 11:30 ` [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 11:30 UTC
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, JBeulich, eddie.dong, jun.nakajima

Added support for emulating an instruction with no memory writes.
Additionally, introduced hvm_emulate_one_full(), which inspects the
possible return values of the hvm_emulate_one() functions
(X86EMUL_EXCEPTION, X86EMUL_UNHANDLEABLE) and acts on them.
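
For illustration, a caller that wants write-discarding emulation with
a #UD fallback on emulation failure (this is how patch 5/5 ends up
using it) would simply do:

    hvm_emulate_one_full(1 /* nowrite */,
                         TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);

hvm_emulate_one_full() handles hvm_emulate_prepare(), the emulation
itself and the writeback / exception injection paths internally.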

Changes since V1:
 - Removed the Linux code that computes the length of an instruction.
 - Unused function parameters are no longer marked.
 - Refactored the code to eliminate redundancy.
 - Made the exception passed on to the guest by hvm_emulate_one_full()
   configurable.

Changes since V2:
 - Renamed hvmemul_write_dummy() to hvmemul_write_discard().
 - Fixed a comment (Xen coding style rules).
 - Renamed hvm_emulate_one_with_ops() to _hvm_emulate_one().
 - Changed stack variable hvm_emulate_ops_no_write into a static const.
 - Modified struct hvm_emulate_ctxt ctx initialization syntax.
 - Renamed unhandleable_trapnr to trapnr and unhandleable_errcode
   to errcode.
 - Changed errcode's type to unsigned int.
 - Removed hvm_emulate_one() loop that went on until it returned
   something other than X86EMUL_RETRY (to prevent potential
   blocking against the time calibration rendezvous).

Changes since V3:
 - The rep_ins, rep_movs and cmpxchg handlers are
   now also inactive.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/emulate.c        |  125 ++++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/hvm/emulate.h |    5 ++
 2 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index eac159f..be32692 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -688,6 +688,51 @@ static int hvmemul_write(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_write_discard(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    /* Discarding the write. */
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_rep_ins_discard(
+    uint16_t src_port,
+    enum x86_segment dst_seg,
+    unsigned long dst_offset,
+    unsigned int bytes_per_rep,
+    unsigned long *reps,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_rep_movs_discard(
+   enum x86_segment src_seg,
+   unsigned long src_offset,
+   enum x86_segment dst_seg,
+   unsigned long dst_offset,
+   unsigned int bytes_per_rep,
+   unsigned long *reps,
+   struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_cmpxchg_discard(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1138,8 +1183,33 @@ static const struct x86_emulate_ops hvm_emulate_ops = {
     .invlpg        = hvmemul_invlpg
 };
 
-int hvm_emulate_one(
-    struct hvm_emulate_ctxt *hvmemul_ctxt)
+static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
+    .read          = hvmemul_read,
+    .insn_fetch    = hvmemul_insn_fetch,
+    .write         = hvmemul_write_discard,
+    .cmpxchg       = hvmemul_cmpxchg_discard,
+    .rep_ins       = hvmemul_rep_ins_discard,
+    .rep_outs      = hvmemul_rep_outs,
+    .rep_movs      = hvmemul_rep_movs_discard,
+    .read_segment  = hvmemul_read_segment,
+    .write_segment = hvmemul_write_segment,
+    .read_io       = hvmemul_read_io,
+    .write_io      = hvmemul_write_io,
+    .read_cr       = hvmemul_read_cr,
+    .write_cr      = hvmemul_write_cr,
+    .read_msr      = hvmemul_read_msr,
+    .write_msr     = hvmemul_write_msr,
+    .wbinvd        = hvmemul_wbinvd,
+    .cpuid         = hvmemul_cpuid,
+    .inject_hw_exception = hvmemul_inject_hw_exception,
+    .inject_sw_interrupt = hvmemul_inject_sw_interrupt,
+    .get_fpu       = hvmemul_get_fpu,
+    .put_fpu       = hvmemul_put_fpu,
+    .invlpg        = hvmemul_invlpg
+};
+
+static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+    const struct x86_emulate_ops *ops)
 {
     struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
     struct vcpu *curr = current;
@@ -1191,7 +1261,7 @@ int hvm_emulate_one(
     vio->mmio_retrying = vio->mmio_retry;
     vio->mmio_retry = 0;
 
-    rc = x86_emulate(&hvmemul_ctxt->ctxt, &hvm_emulate_ops);
+    rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
 
     if ( rc == X86EMUL_OKAY && vio->mmio_retry )
         rc = X86EMUL_RETRY;
@@ -1239,6 +1309,55 @@ int hvm_emulate_one(
     return X86EMUL_OKAY;
 }
 
+int hvm_emulate_one(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops);
+}
+
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops_no_write);
+}
+
+void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
+    unsigned int errcode)
+{
+    struct hvm_emulate_ctxt ctx = {{ 0 }};
+    int rc;
+
+    hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
+
+    if ( nowrite )
+        rc = hvm_emulate_one_no_write(&ctx);
+    else
+        rc = hvm_emulate_one(&ctx);
+
+    switch ( rc )
+    {
+    case X86EMUL_UNHANDLEABLE:
+        gdprintk(XENLOG_DEBUG, "Emulation failed @ %04x:%lx: "
+               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
+               hvmemul_get_seg_reg(x86_seg_cs, &ctx)->sel,
+               ctx.insn_buf_eip,
+               ctx.insn_buf[0], ctx.insn_buf[1],
+               ctx.insn_buf[2], ctx.insn_buf[3],
+               ctx.insn_buf[4], ctx.insn_buf[5],
+               ctx.insn_buf[6], ctx.insn_buf[7],
+               ctx.insn_buf[8], ctx.insn_buf[9]);
+        hvm_inject_hw_exception(trapnr, errcode);
+        break;
+    case X86EMUL_EXCEPTION:
+        if ( ctx.exn_pending )
+            hvm_inject_hw_exception(ctx.exn_vector, ctx.exn_error_code);
+        /* fall through */
+    default:
+        hvm_emulate_writeback(&ctx);
+        break;
+    }
+}
+
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs)
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 00a06cc..a74f310 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -37,6 +37,11 @@ struct hvm_emulate_ctxt {
 
 int hvm_emulate_one(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt);
+void hvm_emulate_one_full(bool_t nowrite,
+    unsigned int trapnr,
+    unsigned int errcode);
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
-- 
1.7.9.5

* [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state
  2014-08-04 11:30 [PATCH RFC V4 1/5] xen: Emulate with no writes Razvan Cojocaru
@ 2014-08-04 11:30 ` Razvan Cojocaru
  2014-08-04 14:16   ` Jan Beulich
  2014-08-04 11:30 ` [PATCH RFC V4 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 11:30 UTC
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, JBeulich, eddie.dong, jun.nakajima

Speed optimization for introspection purposes: a handful of registers
are sent along with each mem_event. This requires enlarging the
mem_event_request / mem_event_response structures, and additional code
to fill in the relevant values. Since the EPT event processing code
needs more data than the CR3 or MSR event processors do,
hvm_mem_event_fill_regs() fills in less data than
p2m_mem_event_fill_regs(), in order to avoid overhead. Struct
hvm_hw_cpu was considered instead of the custom struct
mem_event_regs_st, but its size would fill up the mem_event ring
buffer too quickly.
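
For illustration, a hypothetical consumer-side sketch (the ring
handling is omitted and handle_request() is a placeholder, not an
actual API):

    /* The registers arrive in-band with each request, avoiding an
     * xc_vcpu_getcontext() round-trip per event. */
    static void handle_request(const mem_event_request_t *req)
    {
        uint64_t rip = req->x86_regs.rip;
        uint64_t cr3 = req->x86_regs.cr3;

        /* ... identify the guest process by cr3 and inspect the
         * code being executed at rip ... */
    }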

Changes since V1:
 - Replaced guest_x86_mode with cs_arbytes in the mem_event_regs_st
   structure.
 - Removed superfluous preprocessor check for __x86_64__ in
   p2m_mem_event_fill_regs().

Changes since V2:
 - Removed inline function compiler suggestions.
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Padded struct mem_event_regs_st.
 - Renamed mem_event_regs_t to mem_event_regs.
 - Readability changes.

Changes since V3:
 - Changed the name of VCPU variables referring to the current
   VCPU to "curr".
 - Renamed "mem_event_regs" to "x86_mem_event_regs" to make it
   explicit.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/hvm.c         |   33 +++++++++++++++++++++++
 xen/arch/x86/mm/p2m.c          |   57 ++++++++++++++++++++++++++++++++++++++++
 xen/include/public/mem_event.h |   39 +++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index e834406..77f4b36 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -6127,6 +6127,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
     return rc;
 }
 
+static void hvm_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct vcpu *curr = current;
+
+    req->x86_regs.rax = regs->eax;
+    req->x86_regs.rcx = regs->ecx;
+    req->x86_regs.rdx = regs->edx;
+    req->x86_regs.rbx = regs->ebx;
+    req->x86_regs.rsp = regs->esp;
+    req->x86_regs.rbp = regs->ebp;
+    req->x86_regs.rsi = regs->esi;
+    req->x86_regs.rdi = regs->edi;
+
+    req->x86_regs.r8  = regs->r8;
+    req->x86_regs.r9  = regs->r9;
+    req->x86_regs.r10 = regs->r10;
+    req->x86_regs.r11 = regs->r11;
+    req->x86_regs.r12 = regs->r12;
+    req->x86_regs.r13 = regs->r13;
+    req->x86_regs.r14 = regs->r14;
+    req->x86_regs.r15 = regs->r15;
+
+    req->x86_regs.rflags = regs->eflags;
+    req->x86_regs.rip    = regs->eip;
+
+    req->x86_regs.msr_efer = curr->arch.hvm_vcpu.guest_efer;
+    req->x86_regs.cr0 = curr->arch.hvm_vcpu.guest_cr[0];
+    req->x86_regs.cr3 = curr->arch.hvm_vcpu.guest_cr[3];
+    req->x86_regs.cr4 = curr->arch.hvm_vcpu.guest_cr[4];
+}
+
 static int hvm_memory_event_traps(long p, uint32_t reason,
                                   unsigned long value, unsigned long old, 
                                   bool_t gla_valid, unsigned long gla) 
@@ -6171,6 +6203,7 @@ static int hvm_memory_event_traps(long p, uint32_t reason,
         req.gla = old;
     }
     
+    hvm_mem_event_fill_regs(&req);
     mem_event_put_request(d, &d->mem_event->access, &req);
     
     return 1;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index bca9f0f..069e869 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1323,6 +1323,61 @@ void p2m_mem_paging_resume(struct domain *d)
     }
 }
 
+static void p2m_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register seg;
+    struct hvm_hw_cpu ctxt;
+    struct vcpu *curr = current;
+
+    /* Architecture-specific vmcs/vmcb bits */
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+
+    req->x86_regs.rax = regs->eax;
+    req->x86_regs.rcx = regs->ecx;
+    req->x86_regs.rdx = regs->edx;
+    req->x86_regs.rbx = regs->ebx;
+    req->x86_regs.rsp = regs->esp;
+    req->x86_regs.rbp = regs->ebp;
+    req->x86_regs.rsi = regs->esi;
+    req->x86_regs.rdi = regs->edi;
+
+    req->x86_regs.r8  = regs->r8;
+    req->x86_regs.r9  = regs->r9;
+    req->x86_regs.r10 = regs->r10;
+    req->x86_regs.r11 = regs->r11;
+    req->x86_regs.r12 = regs->r12;
+    req->x86_regs.r13 = regs->r13;
+    req->x86_regs.r14 = regs->r14;
+    req->x86_regs.r15 = regs->r15;
+
+    req->x86_regs.rflags = regs->eflags;
+    req->x86_regs.rip    = regs->eip;
+
+    req->x86_regs.dr7 = curr->arch.debugreg[7];
+    req->x86_regs.cr0 = ctxt.cr0;
+    req->x86_regs.cr2 = ctxt.cr2;
+    req->x86_regs.cr3 = ctxt.cr3;
+    req->x86_regs.cr4 = ctxt.cr4;
+
+    req->x86_regs.sysenter_cs = ctxt.sysenter_cs;
+    req->x86_regs.sysenter_esp = ctxt.sysenter_esp;
+    req->x86_regs.sysenter_eip = ctxt.sysenter_eip;
+
+    req->x86_regs.msr_efer = ctxt.msr_efer;
+    req->x86_regs.msr_star = ctxt.msr_star;
+    req->x86_regs.msr_lstar = ctxt.msr_lstar;
+
+    hvm_get_segment_register(curr, x86_seg_fs, &seg);
+    req->x86_regs.fs_base = seg.base;
+
+    hvm_get_segment_register(curr, x86_seg_gs, &seg);
+    req->x86_regs.gs_base = seg.base;
+
+    hvm_get_segment_register(curr, x86_seg_cs, &seg);
+    req->x86_regs.cs_arbytes = seg.attr.bytes;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1410,6 +1465,8 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
         req->access_x = access_x;
     
         req->vcpu_id = v->vcpu_id;
+
+        p2m_mem_event_fill_regs(req);
     }
 
     /* Pause the current VCPU */
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 3831b41..37eb0de 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -48,6 +48,44 @@
 #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
                                              does NOT honour HVMPME_onchangeonly */
 
+/* Using a custom struct (not hvm_hw_cpu) so as to not fill
+ * the mem_event ring buffer too quickly. */
+typedef struct x86_mem_event_regs_st {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+    uint64_t rflags;
+    uint64_t dr7;
+    uint64_t rip;
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+    uint64_t msr_efer;
+    uint64_t msr_star;
+    uint64_t msr_lstar;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint32_t cs_arbytes;
+    uint32_t _pad;
+} x86_mem_event_regs;
+
 typedef struct mem_event_st {
     uint32_t flags;
     uint32_t vcpu_id;
@@ -65,6 +103,7 @@ typedef struct mem_event_st {
     uint16_t available:12;
 
     uint16_t reason;
+    x86_mem_event_regs x86_regs;
 } mem_event_request_t, mem_event_response_t;
 
 DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
-- 
1.7.9.5

* [PATCH RFC V4 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-08-04 11:30 [PATCH RFC V4 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-08-04 11:30 ` [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-08-04 11:30 ` Razvan Cojocaru
  2014-08-04 11:30 ` [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 11:30 UTC
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, JBeulich, eddie.dong, jun.nakajima

vmx_disable_intercept_for_msr() will now refuse to disable interception
of MSRs needed for memory introspection. It is not possible to gate this
on mem_access being active for the domain, since by the time mem_access
becomes active, interception for the interesting MSRs has already been
disabled (vmx_disable_intercept_for_msr() runs very early on).

Changes since V1:
 - Replaced printk() with gdprintk(XENLOG_DEBUG, ...).

Changes since V2:
 - Split a log line differently to keep it grepable.
 - Interception for relevant MSRs will be disabled only if
   mem_access is not enabled.
 - Since they end up being disabled early on (when mem_access
   is not enabled yet), re-enable interception when mem_access
   becomes active.

Changes since V3:
 - Removed log line stating that MSR interception cannot be disabled.
 - Removed superfluous #include <asm/hvm/vmx/vmcs.h>.
 - Moved VMX-specific code to vmx.c (as a new hvm_funcs member).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/vmx/vmcs.c   |   20 ++++++++++++++++++++
 xen/arch/x86/hvm/vmx/vmx.c    |   17 +++++++++++++++++
 xen/arch/x86/mm/mem_event.c   |    3 +++
 xen/include/asm-x86/hvm/hvm.h |    2 ++
 4 files changed, 42 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8ffc562..2703c58 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -39,6 +39,7 @@
 #include <xen/keyhandler.h>
 #include <asm/shadow.h>
 #include <asm/tboot.h>
+#include <asm/mem_event.h>
 
 static bool_t __read_mostly opt_vpid_enabled = 1;
 boolean_param("vpid", opt_vpid_enabled);
@@ -695,11 +696,30 @@ static void vmx_set_host_env(struct vcpu *v)
 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
 {
     unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
+    struct domain *d = v->domain;
 
     /* VMX MSR bitmap supported? */
     if ( msr_bitmap == NULL )
         return;
 
+    if ( mem_event_check_ring(&d->mem_event->access) )
+    {
+        /* Filter out MSR-s needed for memory introspection */
+        switch ( msr )
+        {
+        case MSR_IA32_SYSENTER_EIP:
+        case MSR_IA32_SYSENTER_ESP:
+        case MSR_IA32_SYSENTER_CS:
+        case MSR_IA32_MC0_CTL:
+        case MSR_STAR:
+        case MSR_LSTAR:
+            return;
+
+        default:
+            break;
+        }
+    }
+
     /*
      * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
      * have the write-low and read-high bitmap offsets the wrong way round.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2caa04a..dfb0c95 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1682,6 +1682,22 @@ void vmx_hypervisor_cpuid_leaf(uint32_t sub_idx,
         *eax |= XEN_HVM_CPUID_X2APIC_VIRT;
 }
 
+static void vmx_enable_intro_msr_interception(struct domain *d)
+{
+    struct vcpu *v;
+
+    /* Enable interception for MSRs needed for memory introspection. */
+    for_each_vcpu ( d, v )
+    {
+        vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, MSR_TYPE_W);
+        vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, MSR_TYPE_W);
+        vmx_enable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, MSR_TYPE_W);
+        vmx_enable_intercept_for_msr(v, MSR_IA32_MC0_CTL, MSR_TYPE_W);
+        vmx_enable_intercept_for_msr(v, MSR_STAR, MSR_TYPE_W);
+        vmx_enable_intercept_for_msr(v, MSR_LSTAR, MSR_TYPE_W);
+    }
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -1740,6 +1756,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .handle_eoi           = vmx_handle_eoi,
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
     .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
+    .enable_intro_msr_interception = vmx_enable_intro_msr_interception,
 };
 
 const struct hvm_function_table * __init start_vmx(void)
diff --git a/xen/arch/x86/mm/mem_event.c b/xen/arch/x86/mm/mem_event.c
index ba7e71e..d5959d9 100644
--- a/xen/arch/x86/mm/mem_event.c
+++ b/xen/arch/x86/mm/mem_event.c
@@ -600,6 +600,9 @@ int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
             rc = mem_event_enable(d, mec, med, _VPF_mem_access, 
                                     HVM_PARAM_ACCESS_RING_PFN,
                                     mem_access_notification);
+
+            if ( rc == 0 && hvm_funcs.enable_intro_msr_interception )
+                hvm_funcs.enable_intro_msr_interception(d);
         }
         break;
 
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 0ebd478..2a87e9b 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -205,6 +205,8 @@ struct hvm_function_table {
     void (*hypervisor_cpuid_leaf)(uint32_t sub_idx,
                                   uint32_t *eax, uint32_t *ebx,
                                   uint32_t *ecx, uint32_t *edx);
+
+    void (*enable_intro_msr_interception)(struct domain *d);
 };
 
 extern struct hvm_function_table hvm_funcs;
-- 
1.7.9.5

* [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 11:30 [PATCH RFC V4 1/5] xen: Emulate with no writes Razvan Cojocaru
  2014-08-04 11:30 ` [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
  2014-08-04 11:30 ` [PATCH RFC V4 3/5] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
@ 2014-08-04 11:30 ` Razvan Cojocaru
  2014-08-04 11:51   ` Ian Campbell
  2014-08-04 14:26   ` Jan Beulich
  2014-08-04 11:30 ` [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
  2014-08-04 14:09 ` [PATCH RFC V4 1/5] xen: Emulate with no writes Jan Beulich
  4 siblings, 2 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 11:30 UTC
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, JBeulich, eddie.dong, jun.nakajima

Added a new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
new xc_domain_set_pagefault_info() function to set per-domain page
fault injection information. The call itself only stores this
per-domain info; nothing actually happens until VMENTRY time, and
then only if all conditions are met (the guest is in user mode, the
set address space matches CR3, and there are no other pending traps).
This mechanism allows bringing in swapped-out pages for inspection.
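
For illustration, a hypothetical introspection agent would request an
injection along these lines (guest_cr3 and swapped_out_va are
placeholders for values the agent is assumed to know):

    xen_domctl_set_pagefault_info_t info = {
        .address_space   = guest_cr3,
        .virtual_address = swapped_out_va,
        .write_access    = 0,
    };

    /* The fault is injected at the next suitable VMENTRY. */
    if ( xc_domain_set_pagefault_info(xch, domid, &info) != 0 )
        /* handle the error */;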

Changes since V2:
 - Removed superfluous memset(&ctxt, 0, sizeof(struct hvm_hw_cpu)).
 - Now checking SS.DPL instead of CS.DPL to see if the guest is in
   user mode.
 - Removed superfluous fault_info members initialization.
 - Now returning -EINVAL if !has_hvm_container_domain(d) on
   XEN_DOMCTL_set_pagefault_info.
 - Moved struct fault_info from struct domain to struct hvm_domain.
 - Collapsed explicit initialization of the fault_info members into a
   one-line assignment on the tools/libxc side.
 - Now using write_access as a proper bool instead of shifting and
   OR-ing it with PFEC_user_mode.

Changes since V3:
 - Now checking that is_hvm_domain(d) in do_domctl() (consistent with
   the test in the actual page fault injection code).
 - XEN_DOMCTL_set_pagefault_info now extends the proper range.
 - Added a likely() around the
   d->arch.hvm_domain.fault_info.virtual_address == 0 check.
 - Replaced hvm_funcs.save_cpu_ctxt() function call with lighter code.
 - Split the vmx.c page fault injection function into a check
   function and an inject function.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 tools/libxc/xc_domain.c          |   16 ++++++++++
 tools/libxc/xenctrl.h            |    4 +++
 xen/arch/x86/hvm/vmx/vmx.c       |   63 ++++++++++++++++++++++++++++++++++++++
 xen/common/domctl.c              |   26 ++++++++++++++++
 xen/include/asm-x86/hvm/domain.h |    7 +++++
 xen/include/public/domctl.h      |   14 +++++++++
 6 files changed, 130 insertions(+)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 0230c6c..49f0c61 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -506,6 +506,22 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
     return ret;
 }
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info)
+{
+    DECLARE_DOMCTL;
+
+    if (info == NULL)
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
+    domctl.domain = (domid_t)domid;
+    domctl.u.set_pagefault_info = *info;
+
+    return do_domctl(xch, &domctl);
+}
+
 int xc_vcpu_getcontext(xc_interface *xch,
                        uint32_t domid,
                        uint32_t vcpu,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 5beb846..a913f10 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -803,6 +803,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
 const char *xc_domain_get_native_protocol(xc_interface *xch,
                                           uint32_t domid);
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info);
+
 /**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index dfb0c95..c0e3d73 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3114,6 +3114,66 @@ out:
         nvmx_idtv_handling();
 }
 
+static bool_t vmx_check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct segment_register seg;
+    unsigned long ev;
+    uint32_t pending_event = 0;
+
+    if ( likely(d->arch.hvm_domain.fault_info.virtual_address == 0 )
+         || !is_hvm_domain(d) )
+        return 0;
+
+    hvm_get_segment_register(curr, x86_seg_ss, &seg);
+
+    if ( seg.attr.fields.dpl != 3 ) /* Guest is not in user mode */
+        return 0;
+
+    vmx_vmcs_enter(curr);
+
+    if ( curr->arch.hvm_vcpu.guest_cr[3]
+         != d->arch.hvm_domain.fault_info.address_space )
+    {
+        vmx_vmcs_exit(curr);
+        return 0;
+    }
+
+    __vmread(VM_ENTRY_INTR_INFO, &ev);
+
+    if ( (ev & INTR_INFO_VALID_MASK) &&
+         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
+        pending_event = ev;
+
+    if ( pending_event )
+    {
+        vmx_vmcs_exit(curr);
+        return 0;
+    }
+
+    vmx_vmcs_exit(curr);
+    return 1;
+}
+
+static void vmx_inject_pf(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    int errcode = PFEC_user_mode;
+    uint64_t virtual_address = d->arch.hvm_domain.fault_info.virtual_address;
+    uint32_t write_access = d->arch.hvm_domain.fault_info.write_access;
+
+    d->arch.hvm_domain.fault_info.address_space = 0;
+    d->arch.hvm_domain.fault_info.virtual_address = 0;
+    d->arch.hvm_domain.fault_info.write_access = 0;
+
+    if ( write_access )
+        errcode |= PFEC_write_access;
+
+    hvm_inject_page_fault(errcode, virtual_address);
+}
+
 void vmx_vmenter_helper(const struct cpu_user_regs *regs)
 {
     struct vcpu *curr = current;
@@ -3154,6 +3214,9 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(need_flush) )
         vpid_sync_all();
 
+    if ( vmx_check_pf_injection() )
+        vmx_inject_pf();
+
  out:
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index c326aba..4198f44 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -967,6 +967,32 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_set_pagefault_info:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domain);
+        if ( d != NULL )
+        {
+            ret = -EINVAL;
+
+            if ( is_hvm_domain(d) )
+            {
+                d->arch.hvm_domain.fault_info.address_space =
+                    op->u.set_pagefault_info.address_space;
+                d->arch.hvm_domain.fault_info.virtual_address =
+                    op->u.set_pagefault_info.virtual_address;
+                d->arch.hvm_domain.fault_info.write_access =
+                    op->u.set_pagefault_info.write_access;
+                ret = 0;
+            }
+
+            rcu_unlock_domain(d);
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 291a2e0..f9bd098 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -141,6 +141,13 @@ struct hvm_domain {
      */
     uint64_t sync_tsc;
 
+    /* Memory introspection page fault injection data. */
+    struct {
+        uint64_t address_space;
+        uint64_t virtual_address;
+        uint32_t write_access;
+    } fault_info;
+
     union {
         struct vmx_domain vmx;
         struct svm_domain svm;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..3010197 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
 #endif
 
+/* XEN_DOMCTL_set_pagefault_info requests that a page fault occur at
+ * the next VMENTRY.
+ *  */
+struct xen_domctl_set_pagefault_info {
+    uint64_t address_space;
+    uint64_t virtual_address;
+    uint32_t write_access;
+};
+typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t);
+
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -1008,6 +1020,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_cacheflush                    71
 #define XEN_DOMCTL_get_vcpu_msrs                 72
 #define XEN_DOMCTL_set_vcpu_msrs                 73
+#define XEN_DOMCTL_set_pagefault_info            74
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1068,6 +1081,7 @@ struct xen_domctl {
         struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_set_pagefault_info set_pagefault_info;
         uint8_t                             pad[128];
     } u;
 };
-- 
1.7.9.5

* [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-08-04 11:30 [PATCH RFC V4 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (2 preceding siblings ...)
  2014-08-04 11:30 ` [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-08-04 11:30 ` Razvan Cojocaru
  2014-08-04 14:33   ` Jan Beulich
  2014-08-04 14:09 ` [PATCH RFC V4 1/5] xen: Emulate with no writes Jan Beulich
  4 siblings, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 11:30 UTC
  To: xen-devel
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, ian.jackson, JBeulich, eddie.dong, jun.nakajima

When a page fault triggers a mem_event, p2m_mem_access_check() will
now be able to either 1) emulate the current instruction, or
2) emulate it without allowing it to perform any writes.
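
For illustration, a hypothetical responder-side sketch (put_response()
and the surrounding req / ring variables are placeholders for the
application's ring handling code):

    mem_event_response_t rsp;

    memset(&rsp, 0, sizeof(rsp));
    rsp.vcpu_id = req.vcpu_id;
    rsp.gfn     = req.gfn;
    /* Emulate the faulting instruction on resume, discarding writes. */
    rsp.flags   = req.flags | MEM_EVENT_FLAG_EMULATE
                            | MEM_EVENT_FLAG_EMULATE_NOWRITE;

    put_response(&mem_event_ring, &rsp);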

Changes since V1:
 - Removed the 'skip' code which required computing the current
   instruction length.
 - Removed the set_ad_bits() code that attempted to modify the
   'accessed' and 'dirty' bits for instructions that the emulator
   can't handle at the moment.

Changes since V2:
 - Moved the __vmread(EXIT_QUALIFICATION, &exit_qualification); code
   in vmx.c, accessible via hvm_funcs.
 - Incorporated changes by Andrew Cooper ("[PATCH 1/2] Xen/mem_event:
   Validate the response vcpu_id before acting on it."

Changes since V3:
 - Collapsed verbose lines into a single "else if()".
 - Changed an int to bool_t.
 - Fixed a minor coding style issue.
 - Now computing the first parameter to hvm_emulate_one_full()
   (replaced an "if" with a single call).
 - Added code comments about eip and gla reset (clarity issue).
 - Removed duplicate code by Andrew Cooper (introduced in V2,
   since committed).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/domain.c          |    3 ++
 xen/arch/x86/hvm/vmx/vmx.c     |   13 ++++++
 xen/arch/x86/mm/p2m.c          |   85 ++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/domain.h   |    9 +++++
 xen/include/asm-x86/hvm/hvm.h  |    2 +
 xen/include/public/mem_event.h |   12 +++---
 6 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..af9b213 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -407,6 +407,9 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
+    /* By default, do not emulate */
+    v->arch.mem_event.emulate_flags = 0;
+
     rc = mapcache_vcpu_init(v);
     if ( rc )
         return rc;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c0e3d73..150fe9f 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1698,6 +1698,18 @@ static void vmx_enable_intro_msr_interception(struct domain *d)
     }
 }
 
+static bool_t vmx_exited_by_pagefault(void)
+{
+    unsigned long exit_qualification;
+
+    __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+    if ( (exit_qualification & EPT_GLA_FAULT) == 0 )
+        return 0;
+
+    return 1;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -1757,6 +1769,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
     .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
     .enable_intro_msr_interception = vmx_enable_intro_msr_interception,
+    .exited_by_pagefault  = vmx_exited_by_pagefault,
 };
 
 const struct hvm_function_table * __init start_vmx(void)
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 069e869..da1bc2d 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1391,6 +1391,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     p2m_access_t p2ma;
     mem_event_request_t *req;
     int rc;
+    unsigned long eip = guest_cpu_user_regs()->eip;
 
     /* First, handle rx2rw conversion automatically.
      * These calls to p2m->set_entry() must succeed: we have the gfn
@@ -1443,6 +1444,36 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
             return 1;
         }
     }
+    else if ( hvm_funcs.exited_by_pagefault && !hvm_funcs.exited_by_pagefault() ) /* don't send a mem_event */
+    {
+        if ( v->arch.mem_event.emulate_flags == 0 )
+        {
+            v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
+            v->arch.mem_event.gpa = gpa;
+            v->arch.mem_event.eip = eip;
+        }
+    }
+
+    /* The previous mem_event reply does not match the current state. */
+    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
+    {
+        /* Don't emulate the current instruction, send a new mem_event. */
+        v->arch.mem_event.emulate_flags = 0;
+
+        /* Make sure to mark the current state to match it again against
+         * the new mem_event about to be sent. */
+        v->arch.mem_event.gpa = gpa;
+        v->arch.mem_event.eip = eip;
+    }
+
+    if ( v->arch.mem_event.emulate_flags )
+    {
+        hvm_emulate_one_full((v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE) != 0,
+            TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+
+        v->arch.mem_event.emulate_flags = 0;
+        return 1;
+    }
 
     *req_ptr = NULL;
     req = xzalloc(mem_event_request_t);
@@ -1495,6 +1526,60 @@ void p2m_mem_access_resume(struct domain *d)
 
         v = d->vcpu[rsp.vcpu_id];
 
+        /* Mark vcpu for skipping one instruction upon rescheduling */
+        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
+        {
+            xenmem_access_t access;
+            bool_t violation = 1;
+
+            v->arch.mem_event.emulate_flags = 0;
+
+            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
+            {
+                violation = 0;
+
+                switch ( access )
+                {
+                case XENMEM_access_n:
+                case XENMEM_access_n2rwx:
+                default:
+                    violation = rsp.access_r || rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_r:
+                    violation = rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_w:
+                    violation = rsp.access_r || rsp.access_x;
+                    break;
+
+                case XENMEM_access_x:
+                    violation = rsp.access_r || rsp.access_w;
+                    break;
+
+                case XENMEM_access_rx:
+                case XENMEM_access_rx2rw:
+                    violation = rsp.access_w;
+                    break;
+
+                case XENMEM_access_wx:
+                    violation = rsp.access_r;
+                    break;
+
+                case XENMEM_access_rw:
+                    violation = rsp.access_x;
+                    break;
+
+                case XENMEM_access_rwx:
+                    break;
+                }
+            }
+
+            if ( violation )
+                v->arch.mem_event.emulate_flags = rsp.flags;
+        }
+
         /* Unpause domain */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
             mem_event_vcpu_unpause(v);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index abf55fb..0fa4d3d 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -446,6 +446,15 @@ struct arch_vcpu
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
+
+    /* Should we emulate the next matching instruction on VCPU resume
+     * after a mem_event? */
+    struct {
+        uint32_t emulate_flags;
+        unsigned long gpa;
+        unsigned long eip;
+    } mem_event;
+
 } __cacheline_aligned;
 
 /* Shorthands to improve code legibility. */
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 2a87e9b..1843621 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -207,6 +207,8 @@ struct hvm_function_table {
                                   uint32_t *ecx, uint32_t *edx);
 
     void (*enable_intro_msr_interception)(struct domain *d);
+
+    bool_t (*exited_by_pagefault)(void);
 };
 
 extern struct hvm_function_table hvm_funcs;
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 37eb0de..0ad78cd 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -31,11 +31,13 @@
 #include "io/ring.h"
 
 /* Memory event flags */
-#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
-#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
-#define MEM_EVENT_FLAG_EVICT_FAIL   (1 << 2)
-#define MEM_EVENT_FLAG_FOREIGN      (1 << 3)
-#define MEM_EVENT_FLAG_DUMMY        (1 << 4)
+#define MEM_EVENT_FLAG_VCPU_PAUSED     (1 << 0)
+#define MEM_EVENT_FLAG_DROP_PAGE       (1 << 1)
+#define MEM_EVENT_FLAG_EVICT_FAIL      (1 << 2)
+#define MEM_EVENT_FLAG_FOREIGN         (1 << 3)
+#define MEM_EVENT_FLAG_DUMMY           (1 << 4)
+#define MEM_EVENT_FLAG_EMULATE         (1 << 5)
+#define MEM_EVENT_FLAG_EMULATE_NOWRITE (1 << 6)
 
 /* Reasons for the memory event request */
 #define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
-- 
1.7.9.5

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 11:30 ` [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-08-04 11:51   ` Ian Campbell
  2014-08-04 14:26   ` Jan Beulich
  1 sibling, 0 replies; 29+ messages in thread
From: Ian Campbell @ 2014-08-04 11:51 UTC
  To: Razvan Cojocaru
  Cc: kevin.tian, JBeulich, stefano.stabellini, andrew.cooper3,
	ian.jackson, xen-devel, eddie.dong, jun.nakajima

On Mon, 2014-08-04 at 14:30 +0300, Razvan Cojocaru wrote:
>  
> +int xc_domain_set_pagefault_info(xc_interface *xch,
> +                                 uint32_t domid,
> +                                 xen_domctl_set_pagefault_info_t *info)
> +{
> +    DECLARE_DOMCTL;
> +
> +    if (info == NULL)
> +        return -1;
> +
> +    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
> +    domctl.domain = (domid_t)domid;
> +    domctl.u.set_pagefault_info = *info;
> +
> +    return do_domctl(xch, &domctl);
> +}

Looks like a plausible wrapping of a hypercall, so assuming the hyp guys
are happy with that interface:
Acked-by: Ian Campbell <ian.campbell@citrix.com>

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-04 11:30 [PATCH RFC V4 1/5] xen: Emulate with no writes Razvan Cojocaru
                   ` (3 preceding siblings ...)
  2014-08-04 11:30 ` [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-08-04 14:09 ` Jan Beulich
  2014-08-04 14:25   ` Razvan Cojocaru
  2014-08-05 15:16   ` Razvan Cojocaru
  4 siblings, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 14:09 UTC
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -688,6 +688,51 @@ static int hvmemul_write(
>      return X86EMUL_OKAY;
>  }
>  
> +static int hvmemul_write_discard(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    void *p_data,
> +    unsigned int bytes,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    /* Discarding the write. */
> +    return X86EMUL_OKAY;
> +}

While this one is okay, ...

> +static int hvmemul_rep_ins_discard(
> +    uint16_t src_port,
> +    enum x86_segment dst_seg,
> +    unsigned long dst_offset,
> +    unsigned int bytes_per_rep,
> +    unsigned long *reps,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    return X86EMUL_OKAY;
> +}
> +
> +static int hvmemul_rep_movs_discard(
> +   enum x86_segment src_seg,
> +   unsigned long src_offset,
> +   enum x86_segment dst_seg,
> +   unsigned long dst_offset,
> +   unsigned int bytes_per_rep,
> +   unsigned long *reps,
> +   struct x86_emulate_ctxt *ctxt)
> +{
> +    return X86EMUL_OKAY;
> +}

... these don't seem to be: I don't think you can just drop the other
half of the operation (i.e. the port or MMIO read).

Jan

* Re: [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state
  2014-08-04 11:30 ` [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-08-04 14:16   ` Jan Beulich
  2014-08-04 14:43     ` Razvan Cojocaru
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 14:16 UTC
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
> Changes since V3:
>  - Changed the name of VCPU variables referring to the current
>    VCPU to "curr".
>  - Renamed "mem_event_regs" to "x86_mem_event_regs" to make it
>    explicit.

That's a step in the right direction, but perhaps not enough. I realize
the whole header doesn't meet the requirements we nowadays put
on public ones, but let's at least not make it worse. I.e. in the case
at hand, either put a xen_ prefix on it as the very first thing (making
this structure's name not match anything else in that header) or, as a
compromise stay at least with the mem_event_ prefix, i.e. name it
mem_event_regs_x86.

Furthermore, a typedef-ed name without a _t suffix is kind of
unusual. I'm not sure if it was in the context of this series that
someone suggested that the _t collides with the C standard. If
that's really the case, and since you don't need a handle for the
type, please just avoid the typedef (and drop the _st suffix from
the structure tag at once).

Jan

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-04 14:09 ` [PATCH RFC V4 1/5] xen: Emulate with no writes Jan Beulich
@ 2014-08-04 14:25   ` Razvan Cojocaru
  2014-08-04 14:42     ` Jan Beulich
  2014-08-05 15:16   ` Razvan Cojocaru
  1 sibling, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 14:25 UTC
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:09 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -688,6 +688,51 @@ static int hvmemul_write(
>>      return X86EMUL_OKAY;
>>  }
>>  
>> +static int hvmemul_write_discard(
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>> +    void *p_data,
>> +    unsigned int bytes,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    /* Discarding the write. */
>> +    return X86EMUL_OKAY;
>> +}
> 
> While this one is okay, ...
> 
>> +static int hvmemul_rep_ins_discard(
>> +    uint16_t src_port,
>> +    enum x86_segment dst_seg,
>> +    unsigned long dst_offset,
>> +    unsigned int bytes_per_rep,
>> +    unsigned long *reps,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    return X86EMUL_OKAY;
>> +}
>> +
>> +static int hvmemul_rep_movs_discard(
>> +   enum x86_segment src_seg,
>> +   unsigned long src_offset,
>> +   enum x86_segment dst_seg,
>> +   unsigned long dst_offset,
>> +   unsigned int bytes_per_rep,
>> +   unsigned long *reps,
>> +   struct x86_emulate_ctxt *ctxt)
>> +{
>> +    return X86EMUL_OKAY;
>> +}
> 
> ... these don't seem to be: I don't think you can just drop the other
> half of the operation (i.e. the port or MMIO read).

It's been suggested here:

http://lists.xen.org/archives/html/xen-devel/2014-07/msg03088.html

that we should use modified versions of the rep_ins, rep_movs and
cmpxchg handlers if we want to make sure absolutely no writes will
happen. Then again, perhaps the modifications were supposed to be more
subtle than just doing nothing in the handler?


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 11:30 ` [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
  2014-08-04 11:51   ` Ian Campbell
@ 2014-08-04 14:26   ` Jan Beulich
  2014-08-04 15:00     ` Razvan Cojocaru
  2014-08-04 15:11     ` Razvan Cojocaru
  1 sibling, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 14:26 UTC
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
> +static bool_t vmx_check_pf_injection(void)
> +{
> +    struct vcpu *curr = current;
> +    struct domain *d = curr->domain;
> +    struct segment_register seg;
> +    unsigned long ev;
> +    uint32_t pending_event = 0;
> +
> +    if ( likely(d->arch.hvm_domain.fault_info.virtual_address == 0 )

Bad space before ). And my question stands: Why is VA zero
special?

> +         || !is_hvm_domain(d) )

Following the majority of other code, the || belongs at the end of the
previous line. Also, even if not strictly required at present, the order
of the two checks would better be swapped.

> +        return 0;
> +
> +    hvm_get_segment_register(curr, x86_seg_ss, &seg);
> +
> +    if ( seg.attr.fields.dpl != 3 ) /* Guest is not in user mode */
> +        return 0;
> +
> +    vmx_vmcs_enter(curr);
> +
> +    if ( curr->arch.hvm_vcpu.guest_cr[3]
> +         != d->arch.hvm_domain.fault_info.address_space )
> +    {
> +        vmx_vmcs_exit(curr);
> +        return 0;
> +    }

I think the vmx_vmcs_enter() could be moved here, simplifying the error
handling above.

> +
> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
> +
> +    if ( (ev & INTR_INFO_VALID_MASK) &&
> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )

Are there no manifest constants for all these plain numbers?

> +        pending_event = ev;
> +
> +    if ( pending_event )
> +    {
> +        vmx_vmcs_exit(curr);
> +        return 0;
> +    }
> +
> +    vmx_vmcs_exit(curr);
> +    return 1;

These two return paths would read better when folded together.

> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs 
> xen_domctl_vcpu_msrs_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
>  #endif
>  
> +/* XEN_DOMCTL_set_pagefault_info requests that a page fault occur at
> + * the next VMENTRY.
> + *  */

Coding style.

> +struct xen_domctl_set_pagefault_info {
> +    uint64_t address_space;
> +    uint64_t virtual_address;
> +    uint32_t write_access;
> +};

Missing padding (structure size currently differs between 32- and 64-bit
callers; this doesn't matter much only because callers normally pass a
full struct xen_domctl anyway).

Jan

* Re: [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-08-04 11:30 ` [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-08-04 14:33   ` Jan Beulich
  2014-08-06 14:00     ` Razvan Cojocaru
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 14:33 UTC
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
> In a scenario where a page fault that triggered a mem_event occured,
> p2m_mem_access_check() will now be able to either 1) emulate the
> current instruction, or 2) emulate it, but don't allow it to perform
> any writes.
> 
> Changes since V1:
>  - Removed the 'skip' code which required computing the current
>    instruction length.
>  - Removed the set_ad_bits() code that attempted to modify the
>    'accessed' and 'dirty' bits for instructions that the emulator
>    can't handle at the moment.
> 
> Changes since V2:
>  - Moved the __vmread(EXIT_QUALIFICATION, &exit_qualification); code
>    in vmx.c, accessible via hvm_funcs.
>  - Incorporated changes by Andrew Cooper ("[PATCH 1/2] Xen/mem_event:
>    Validate the response vcpu_id before acting on it."
> 
> Changes since V3:
>  - Collapsed verbose lines into a single "else if()".
>  - Changed an int to bool_t.
>  - Fixed a minor coding style issue.
>  - Now computing the first parameter to hvm_emulate_one_full()
>    (replaced an "if" with a single call).
>  - Added code comments about eip and gla reset (clarity issue).
>  - Removed duplicate code by Andrew Cooper (introduced in V2,
>    since committed).
> 
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> ---
>  xen/arch/x86/domain.c          |    3 ++
>  xen/arch/x86/hvm/vmx/vmx.c     |   13 ++++++
>  xen/arch/x86/mm/p2m.c          |   85 
> ++++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/domain.h   |    9 +++++
>  xen/include/asm-x86/hvm/hvm.h  |    2 +
>  xen/include/public/mem_event.h |   12 +++---
>  6 files changed, 119 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index e896210..af9b213 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -407,6 +407,9 @@ int vcpu_initialise(struct vcpu *v)
>  
>      v->arch.flags = TF_kernel_mode;
>  
> +    /* By default, do not emulate */
> +    v->arch.mem_event.emulate_flags = 0;
> +
>      rc = mapcache_vcpu_init(v);
>      if ( rc )
>          return rc;
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index c0e3d73..150fe9f 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -1698,6 +1698,18 @@ static void vmx_enable_intro_msr_interception(struct 
> domain *d)
>      }
>  }
>  
> +static bool_t vmx_exited_by_pagefault(void)
> +{
> +    unsigned long exit_qualification;
> +
> +    __vmread(EXIT_QUALIFICATION, &exit_qualification);
> +
> +    if ( (exit_qualification & EPT_GLA_FAULT) == 0 )
> +        return 0;
> +
> +    return 1;
> +}
> +
>  static struct hvm_function_table __initdata vmx_function_table = {
>      .name                 = "VMX",
>      .cpu_up_prepare       = vmx_cpu_up_prepare,
> @@ -1757,6 +1769,7 @@ static struct hvm_function_table __initdata 
> vmx_function_table = {
>      .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
>      .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
>      .enable_intro_msr_interception = vmx_enable_intro_msr_interception,
> +    .exited_by_pagefault  = vmx_exited_by_pagefault,
>  };
>  
>  const struct hvm_function_table * __init start_vmx(void)
> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 069e869..da1bc2d 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1391,6 +1391,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t 
> gla_valid, unsigned long gla,
>      p2m_access_t p2ma;
>      mem_event_request_t *req;
>      int rc;
> +    unsigned long eip = guest_cpu_user_regs()->eip;
>  
>      /* First, handle rx2rw conversion automatically.
>       * These calls to p2m->set_entry() must succeed: we have the gfn
> @@ -1443,6 +1444,36 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t 
> gla_valid, unsigned long gla,
>              return 1;
>          }
>      }
> +    else if ( hvm_funcs.exited_by_pagefault && !hvm_funcs.exited_by_pagefault() ) /* don't send a mem_event */

DYM

    else if ( !hvm_funcs.exited_by_pagefault || !hvm_funcs.exited_by_pagefault() )

Apart from that the line is too long and ...

> +    {
> +        if ( v->arch.mem_event.emulate_flags == 0 )

... the two if()s should again be combined.

> +        {
> +            v->arch.mem_event.emulate_flags = MEM_EVENT_FLAG_EMULATE;
> +            v->arch.mem_event.gpa = gpa;
> +            v->arch.mem_event.eip = eip;
> +        }
> +    }
> +
> +    /* The previous mem_event reply does not match the current state. */
> +    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
> +    {
> +        /* Don't emulate the current instruction, send a new mem_event. */
> +        v->arch.mem_event.emulate_flags = 0;
> +
> +        /* Make sure to mark the current state to match it again against
> +         * the new mem_event about to be sent. */
> +        v->arch.mem_event.gpa = gpa;
> +        v->arch.mem_event.eip = eip;
> +    }
> +
> +    if ( v->arch.mem_event.emulate_flags )
> +    {
> +        hvm_emulate_one_full((v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE) != 0,

Again too long a line.

> @@ -1495,6 +1526,60 @@ void p2m_mem_access_resume(struct domain *d)
>  
>          v = d->vcpu[rsp.vcpu_id];
>  
> +        /* Mark vcpu for skipping one instruction upon rescheduling */
> +        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
> +        {
> +            xenmem_access_t access;
> +            bool_t violation = 1;
> +
> +            v->arch.mem_event.emulate_flags = 0;
> +
> +            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
> +            {
> +                violation = 0;

This belongs ...

> +
> +                switch ( access )
> +                {
> +                case XENMEM_access_n:
> +                case XENMEM_access_n2rwx:
> +                default:
> +                    violation = rsp.access_r || rsp.access_w || rsp.access_x;
> +                    break;
> +
> +                case XENMEM_access_r:
> +                    violation = rsp.access_w || rsp.access_x;
> +                    break;
> +
> +                case XENMEM_access_w:
> +                    violation = rsp.access_r || rsp.access_x;
> +                    break;
> +
> +                case XENMEM_access_x:
> +                    violation = rsp.access_r || rsp.access_w;
> +                    break;
> +
> +                case XENMEM_access_rx:
> +                case XENMEM_access_rx2rw:
> +                    violation = rsp.access_w;
> +                    break;
> +
> +                case XENMEM_access_wx:
> +                    violation = rsp.access_r;
> +                    break;
> +
> +                case XENMEM_access_rw:
> +                    violation = rsp.access_x;
> +                    break;
> +
> +                case XENMEM_access_rwx:

... here, as all other cases set violation anyway.
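
I.e. drop the early assignment and instead have (sketch):

                case XENMEM_access_rwx:
                    violation = 0;
                    break;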

> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -207,6 +207,8 @@ struct hvm_function_table {
>                                    uint32_t *ecx, uint32_t *edx);
>  
>      void (*enable_intro_msr_interception)(struct domain *d);
> +
> +    bool_t (*exited_by_pagefault)(void);

The naming needs improvement: afaiu you don't care about ordinary
page faults, but only about nested ones.
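
E.g. (the name is only a suggestion):

    bool_t (*exited_by_nested_pagefault)(void);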

Jan

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-04 14:25   ` Razvan Cojocaru
@ 2014-08-04 14:42     ` Jan Beulich
  0 siblings, 0 replies; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 14:42 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 16:25, <rcojocaru@bitdefender.com> wrote:
> On 08/04/2014 05:09 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>> --- a/xen/arch/x86/hvm/emulate.c
>>> +++ b/xen/arch/x86/hvm/emulate.c
>>> @@ -688,6 +688,51 @@ static int hvmemul_write(
>>>      return X86EMUL_OKAY;
>>>  }
>>>  
>>> +static int hvmemul_write_discard(
>>> +    enum x86_segment seg,
>>> +    unsigned long offset,
>>> +    void *p_data,
>>> +    unsigned int bytes,
>>> +    struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    /* Discarding the write. */
>>> +    return X86EMUL_OKAY;
>>> +}
>> 
>> While this one is okay, ...
>> 
>>> +static int hvmemul_rep_ins_discard(
>>> +    uint16_t src_port,
>>> +    enum x86_segment dst_seg,
>>> +    unsigned long dst_offset,
>>> +    unsigned int bytes_per_rep,
>>> +    unsigned long *reps,
>>> +    struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    return X86EMUL_OKAY;
>>> +}
>>> +
>>> +static int hvmemul_rep_movs_discard(
>>> +   enum x86_segment src_seg,
>>> +   unsigned long src_offset,
>>> +   enum x86_segment dst_seg,
>>> +   unsigned long dst_offset,
>>> +   unsigned int bytes_per_rep,
>>> +   unsigned long *reps,
>>> +   struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    return X86EMUL_OKAY;
>>> +}
>> 
>> ... these don't seem to be: I don't think you can just drop the other
>> half of the operation (i.e. the port or MMIO read).
> 
> It's been suggested here:
> 
> http://lists.xen.org/archives/html/xen-devel/2014-07/msg03088.html 
> 
> that we should use modified versions of the rep_ins, rep_movs and
> cmpxchg handlers if we want to make sure absolutely no writes will
> happen. Then again, perhaps the modifications were supposed to be more
> subtle than just doing nothing in the handlers?

That's what I'm trying to hint at.

Jan

* Re: [PATCH RFC V4 2/5] xen: Optimize introspection access to guest state
  2014-08-04 14:16   ` Jan Beulich
@ 2014-08-04 14:43     ` Razvan Cojocaru
  0 siblings, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 14:43 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:16 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> Changes since V3:
>>  - Changed the name of VCPU variables referring to the current
>>    VCPU to "curr".
>>  - Renamed "mem_event_regs" to "x86_mem_event_regs" to make it
>>    explicit.
> 
> That's a step in the right direction, but perhaps not enough. I realize
> the whole header doesn't meet the requirements we nowadays put
> on public ones, but let's at least not make it worse. I.e. in the case
> at hand either out a xen_ prefix as the very first thing (making this
> structure's name not match anything else in that header) or as a
> compromise stay at least with the mem_event_ prefix, i.e. name it
> mem_event_regs_x86.
> 
> Furthermore as typedef-ed name with out _t suffix is kind of
> unusual. I'm not sure if it was in the context of this series that
> someone suggested that the _t collides with the C standard. If
> that's really the case, and since you don't need a handle for the
> type, please just avoid the typedef (and drop the _st suffix from
> the structure tag at once).

Indeed, it has been suggested here:

http://lists.xenproject.org/archives/html/xen-devel/2014-07/msg01672.html

I'll do the renaming.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 14:26   ` Jan Beulich
@ 2014-08-04 15:00     ` Razvan Cojocaru
  2014-08-04 15:20       ` Jan Beulich
  2014-08-04 15:11     ` Razvan Cojocaru
  1 sibling, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 15:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
>> +
>> +    if ( (ev & INTR_INFO_VALID_MASK) &&
>> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
> 
> Are there no manifest constants for all these plain numbers?

If there are, vmx_vmcs_save() in vmx.c (line 416) doesn't use them. I've
copied that part verbatim.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 14:26   ` Jan Beulich
  2014-08-04 15:00     ` Razvan Cojocaru
@ 2014-08-04 15:11     ` Razvan Cojocaru
  2014-08-04 15:21       ` Jan Beulich
  1 sibling, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 15:11 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> +static bool_t vmx_check_pf_injection(void)
>> +{
>> +    struct vcpu *curr = current;
>> +    struct domain *d = curr->domain;
>> +    struct segment_register seg;
>> +    unsigned long ev;
>> +    uint32_t pending_event = 0;
>> +
>> +    if ( likely(d->arch.hvm_domain.fault_info.virtual_address == 0 )
> 
> Bad space before ). And my question stands: Why is VA zero
> special?

It's special because for our purposes (mostly Windows HVM guests, but I
think the same applies to Linux), that page is reserved and it's never
swapped out, so there would be no point in asking for a page fault
injection there.

If you think that a code comment is not enough here and that in the
future somebody might legitimately want to use 0 as a proper value, I'll
add a "valid" member and use that explicitly.
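
E.g. something like this (sketch - only the "valid" member is the
point; the other fields are whatever fault_info already carries):

    struct {
        bool_t valid;                 /* explicit, instead of VA == 0 */
        uint64_t virtual_address;
        /* ... remaining fault_info members unchanged ... */
    } fault_info;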


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 15:00     ` Razvan Cojocaru
@ 2014-08-04 15:20       ` Jan Beulich
  2014-08-05  8:09         ` Razvan Cojocaru
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 15:20 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 17:00, <rcojocaru@bitdefender.com> wrote:
> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
>>> +
>>> +    if ( (ev & INTR_INFO_VALID_MASK) &&
>>> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
>> 
>> Are there no manifest constants for all these plain numbers?
> 
> If there are, vmx_vmcs_save() in vmx.c (line 416) doesn't use them. I've
> copied that part verbatim.

And that's precisely the problem: As long as there's exactly one use
site, the need for manifest constants is questionable (i.e. largely
cosmetic). As soon as there are multiple places, connecting them
together is largely impossible without naming these numbers - only
that way you have a reasonable chance to find the clone of the
original should the original be found to need tweaking.

Jan

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 15:11     ` Razvan Cojocaru
@ 2014-08-04 15:21       ` Jan Beulich
       [not found]         ` <53DFA537.70105@bitdefender.com>
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-04 15:21 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 04.08.14 at 17:11, <rcojocaru@bitdefender.com> wrote:
> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>> +static bool_t vmx_check_pf_injection(void)
>>> +{
>>> +    struct vcpu *curr = current;
>>> +    struct domain *d = curr->domain;
>>> +    struct segment_register seg;
>>> +    unsigned long ev;
>>> +    uint32_t pending_event = 0;
>>> +
>>> +    if ( likely(d->arch.hvm_domain.fault_info.virtual_address == 0 )
>> 
>> Bad space before ). And my question stands: Why is VA zero
>> special?
> 
> It's special because for our purposes (mostly Windows HVM guests, but I
> think the same applies to Linux), that page is reserved and it's never
> swapped out, so there would be no point in asking for a page fault
> injection there.
> 
> If you think that a code comment is not enough here and that in the
> future somebody might legitimately want to use 0 as a proper value, I'll
> add a "valid" member and use that explicitly.

Perhaps in another thread I said this earlier today already: In the
HVM code we should not be making assumptions about particular
guest behavior, except when it comes to optimization for certain
special cases.

Jan

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
       [not found]         ` <53DFA537.70105@bitdefender.com>
@ 2014-08-04 15:23           ` Razvan Cojocaru
  0 siblings, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-04 15:23 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 06:22 PM, Razvan Cojocaru wrote:
> On 08/04/2014 06:21 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 17:11, <rcojocaru@bitdefender.com> wrote:
>>> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>>>> +static bool_t vmx_check_pf_injection(void)
>>>>> +{
>>>>> +    struct vcpu *curr = current;
>>>>> +    struct domain *d = curr->domain;
>>>>> +    struct segment_register seg;
>>>>> +    unsigned long ev;
>>>>> +    uint32_t pending_event = 0;
>>>>> +
>>>>> +    if ( likely(d->arch.hvm_domain.fault_info.virtual_address == 0 )
>>>>
>>>> Bad space before ). And my question stands: Why is VA zero
>>>> special?
>>>
>>> It's special because for our purposes (mostly Windows HVM guests, but I
>>> think the same applies to Linux), that page is reserved and it's never
>>> swapped out, so there would be no point in asking for a page fault
>>> injection there.
>>>
>>> If you think that a code comment is not enough here and that in the
>>> future somebody might legitimately want to use 0 as a proper value, I'll
>>> add a "valid" member and use that explicitly.
>>
>> Perhaps in another thread I said this earlier today already: In the
>> HVM code we should not be making assumptions about particular
>> guest behavior, except when it comes to optimization for certain
>> special cases.
> 

OK, I'll address that as well.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-04 15:20       ` Jan Beulich
@ 2014-08-05  8:09         ` Razvan Cojocaru
  2014-08-05  8:39           ` Jan Beulich
  0 siblings, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-05  8:09 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 06:20 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 17:00, <rcojocaru@bitdefender.com> wrote:
>> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>>> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
>>>> +
>>>> +    if ( (ev & INTR_INFO_VALID_MASK) &&
>>>> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
>>>
>>> Are there no manifest constants for all these plain numbers?
>>
>> If there are, vmx_vmcs_save() in vmx.c (line 416) doesn't use them. I've
>> copied that part verbatim.
> 
> And that's precisely the problem: As long as there's exactly one use
> site, the need for manifest constants is questionable (i.e. largely
> cosmetic). As soon as there are multiple places, connecting them
> together is largely impossible without naming these numbers - only
> that way you have a reasonable chance to find the clone of the
> original should the original be found to need tweaking.

I'll gladly add #defines for those magic constants, but could you please
recommend names for them and a header (or at least a category of headers)
to put them in, in the interest of minimizing the number of RFC versions
for this series?


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-05  8:09         ` Razvan Cojocaru
@ 2014-08-05  8:39           ` Jan Beulich
  2014-08-05  8:48             ` Razvan Cojocaru
  2014-08-05  9:59             ` Razvan Cojocaru
  0 siblings, 2 replies; 29+ messages in thread
From: Jan Beulich @ 2014-08-05  8:39 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 05.08.14 at 10:09, <rcojocaru@bitdefender.com> wrote:
> On 08/04/2014 06:20 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 17:00, <rcojocaru@bitdefender.com> wrote:
>>> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>>>> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
>>>>> +
>>>>> +    if ( (ev & INTR_INFO_VALID_MASK) &&
>>>>> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
>>>>
>>>> Are there no manifest constants for all these plain numbers?
>>>
>>> If there are, vmx_vmcs_save() in vmx.c (line 416) doesn't use them. I've
>>> copied that part verbatim.
>> 
>> And that's precisely the problem: As long as there's exactly one use
>> site, the need for manifest constants is questionable (i.e. largely
>> cosmetic). As soon as there are multiple places, connecting them
>> together is largely impossible without naming these numbers - only
>> that way you have a reasonable chance to find the clone of the
>> original should the original be found to need tweaking.
> 
> I'll gladly add #defines for those magic constants, but could you please
> recommend names for them and a header (or at least a category of headers)
> to put them in, in the interest of minimizing the number of RFC versions
> for this series?

Did you even make an attempt to locate a proper place, or
to check whether such constants already exist? Simply looking
for INTR_INFO_VALID_MASK would have turned up

#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */

i.e. all you need is there, you just need to make use of them
(and please also - perhaps in an initial cleanup patch - for the
original you cloned from). And just to avoid a needless further
intermediate round: While there is no #define for the shift count
used, you also don't need one if you make use of MASK_EXTR().
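
I.e. roughly (sketch):

    if ( (ev & INTR_INFO_VALID_MASK) &&
         hvm_event_needs_reinjection(MASK_EXTR(ev, INTR_INFO_INTR_TYPE_MASK),
                                     MASK_EXTR(ev, INTR_INFO_VECTOR_MASK)) )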

Jan

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-05  8:39           ` Jan Beulich
@ 2014-08-05  8:48             ` Razvan Cojocaru
  2014-08-05  9:59             ` Razvan Cojocaru
  1 sibling, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-05  8:48 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/05/2014 11:39 AM, Jan Beulich wrote:
>>>> On 05.08.14 at 10:09, <rcojocaru@bitdefender.com> wrote:
>> On 08/04/2014 06:20 PM, Jan Beulich wrote:
>>>>>> On 04.08.14 at 17:00, <rcojocaru@bitdefender.com> wrote:
>>>> On 08/04/2014 05:26 PM, Jan Beulich wrote:
>>>>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>>>>> +    __vmread(VM_ENTRY_INTR_INFO, &ev);
>>>>>> +
>>>>>> +    if ( (ev & INTR_INFO_VALID_MASK) &&
>>>>>> +         hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
>>>>>
>>>>> Are there no manifest constants for all these plain numbers?
>>>>
>>>> If there are, vmx_vmcs_save() in vmx.c (line 416) doesn't use them. I've
>>>> copied that part verbatim.
>>>
>>> And that's precisely the problem: As long as there's exactly one use
>>> site, the need for manifest constants is questionable (i.e. largely
>>> cosmetic). As soon as there are multiple places, connecting them
>>> together is largely impossible without naming these numbers - only
>>> that way you have a reasonable chance to find the clone of the
>>> original should the original be found to need tweaking.
>>
>> I'll gladly add #defines for those magic constants, but could you please
>> recommend names for them and a header (or at least a category of headers)
>> to put them in, in the interest of minimizing the number of RFC versions
>> for this series?
> 
> Did you even make an attempt to locate a proper place, or
> to check whether such constants already exist? Simply looking
> for INTR_INFO_VALID_MASK would have turned up
> 
> #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
> #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
> 
> i.e. all you need is there, you just need to make use of them
> (and please also - perhaps in an initial cleanup patch - for the
> original you cloned from). And just to avoid a needless further
> intermediate round: While there is no #define for the shift count
> used, you also don't need one if you make use of MASK_EXTR().

I did, obviously, but you can only turn up so much when grepping for "7"
in a large, mostly undocumented codebase. Thanks for your quick answer,
I'll use those and modify the original code as well.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 4/5] xen, libxc: Request page fault injection via libxc
  2014-08-05  8:39           ` Jan Beulich
  2014-08-05  8:48             ` Razvan Cojocaru
@ 2014-08-05  9:59             ` Razvan Cojocaru
  1 sibling, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-05  9:59 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/05/2014 11:39 AM, Jan Beulich wrote:
> #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
> #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
> 
> i.e. all you need is there, you just need to make use of them
> (and please also - perhaps in an initial cleanup patch - for the
> original you cloned from). And just to avoid a needless further
> intermediate round: While there is no #define for the shift count
> used, you also don't need one if you make use of MASK_EXTR().

Cleanup patch submitted:

http://lists.xen.org/archives/html/xen-devel/2014-08/msg00439.html


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-04 14:09 ` [PATCH RFC V4 1/5] xen: Emulate with no writes Jan Beulich
  2014-08-04 14:25   ` Razvan Cojocaru
@ 2014-08-05 15:16   ` Razvan Cojocaru
  2014-08-05 15:27     ` Razvan Cojocaru
  2014-08-05 15:43     ` Jan Beulich
  1 sibling, 2 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-05 15:16 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:09 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> +static int hvmemul_rep_ins_discard(
>> +    uint16_t src_port,
>> +    enum x86_segment dst_seg,
>> +    unsigned long dst_offset,
>> +    unsigned int bytes_per_rep,
>> +    unsigned long *reps,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    return X86EMUL_OKAY;
>> +}
>> +
>> +static int hvmemul_rep_movs_discard(
>> +   enum x86_segment src_seg,
>> +   unsigned long src_offset,
>> +   enum x86_segment dst_seg,
>> +   unsigned long dst_offset,
>> +   unsigned int bytes_per_rep,
>> +   unsigned long *reps,
>> +   struct x86_emulate_ctxt *ctxt)
>> +{
>> +    return X86EMUL_OKAY;
>> +}
> 
> ... these don't seem to be: I don't think you can just drop the other
> half of the operation (i.e. the port or MMIO read).

I've been looking at hvmemul_do_io() (in arch/x86/hvm/emulate.c, line
52), which is what the above functions are reduced to. At line 88 I've
come across the following code:

 /*
  * Weird-sized accesses have undefined behaviour: we discard writes
  * and read all-ones.
  */
 if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
 {
     gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
     ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
     if ( dir == IOREQ_READ )
         memset(p_data, ~0, size);
     if ( ram_page )
         put_page(ram_page);
     return X86EMUL_UNHANDLEABLE;
 }

which does drop the last half of the function (though it does so by
returning X86EMUL_UNHANDLEABLE). hvmemul_rep_ins() looks like this:

 static int hvmemul_rep_ins(
     uint16_t src_port,
     enum x86_segment dst_seg,
     unsigned long dst_offset,
     unsigned int bytes_per_rep,
     unsigned long *reps,
     struct x86_emulate_ctxt *ctxt)
 {
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
     unsigned long addr;
     uint32_t pfec = PFEC_page_present | PFEC_write_access;
     paddr_t gpa;
     p2m_type_t p2mt;
     int rc;

     rc = hvmemul_virtual_to_linear(
         dst_seg, dst_offset, bytes_per_rep, reps, hvm_access_write,
         hvmemul_ctxt, &addr);
     if ( rc != X86EMUL_OKAY )
         return rc;

     if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
         pfec |= PFEC_user_mode;

     rc = hvmemul_linear_to_phys(
         addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);
     if ( rc != X86EMUL_OKAY )
         return rc;

     (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT,
                                   &p2mt);
     if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
         return X86EMUL_UNHANDLEABLE;

     return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
                           !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }

So if I understand this code correctly, hvmemul_rep_ins() performs a few
checks, and then calls hvmemul_do_pio(), which ends up calling
hvmemul_do_io(), which seems to discard the write rather unceremoniously
for weird-sized accesses. This would seem to roughly correspond to just
returning X86EMUL_UNHANDLEABLE from hvmemul_rep_ins() for that special
case (with no MMIO code executed).

Did I misunderstand something?


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-05 15:16   ` Razvan Cojocaru
@ 2014-08-05 15:27     ` Razvan Cojocaru
  2014-08-05 15:43     ` Jan Beulich
  1 sibling, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-05 15:27 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/05/2014 06:16 PM, Razvan Cojocaru wrote:
> On 08/04/2014 05:09 PM, Jan Beulich wrote:
>>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>>> +static int hvmemul_rep_ins_discard(
>>> +    uint16_t src_port,
>>> +    enum x86_segment dst_seg,
>>> +    unsigned long dst_offset,
>>> +    unsigned int bytes_per_rep,
>>> +    unsigned long *reps,
>>> +    struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    return X86EMUL_OKAY;
>>> +}
>>> +
>>> +static int hvmemul_rep_movs_discard(
>>> +   enum x86_segment src_seg,
>>> +   unsigned long src_offset,
>>> +   enum x86_segment dst_seg,
>>> +   unsigned long dst_offset,
>>> +   unsigned int bytes_per_rep,
>>> +   unsigned long *reps,
>>> +   struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    return X86EMUL_OKAY;
>>> +}
>>
>> ... these don't seem to be: I don't think you can just drop the other
>> half of the operation (i.e. the port or MMIO read).
> 
> I've been looking at hvmemul_do_io() (in arch/x86/hvm/emulate.c, line
> 52), which is what the above functions are reduced to. At line 88 I've
> come across the following code:
> 
>  /*
>   * Weird-sized accesses have undefined behaviour: we discard writes
>   * and read all-ones.
>   */
>  if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
>  {
>      gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
>      ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
>      if ( dir == IOREQ_READ )
>          memset(p_data, ~0, size);
>      if ( ram_page )
>          put_page(ram_page);
>      return X86EMUL_UNHANDLEABLE;
>  }
> 
> which does drop the last half of the function (though it does so by
> returning X86EMUL_UNHANDLEABLE). hvmemul_rep_ins() looks like this:
> 
>  static int hvmemul_rep_ins(
>      uint16_t src_port,
>      enum x86_segment dst_seg,
>      unsigned long dst_offset,
>      unsigned int bytes_per_rep,
>      unsigned long *reps,
>      struct x86_emulate_ctxt *ctxt)
>  {
>      struct hvm_emulate_ctxt *hvmemul_ctxt =
>          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>      unsigned long addr;
>      uint32_t pfec = PFEC_page_present | PFEC_write_access;
>      paddr_t gpa;
>      p2m_type_t p2mt;
>      int rc;
> 
>      rc = hvmemul_virtual_to_linear(
>          dst_seg, dst_offset, bytes_per_rep, reps, hvm_access_write,
>          hvmemul_ctxt, &addr);
>      if ( rc != X86EMUL_OKAY )
>          return rc;
> 
>      if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
>      rc = hvmemul_linear_to_phys(
>          addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);
>      if ( rc != X86EMUL_OKAY )
>          return rc;
> 
>      (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT,
>                                    &p2mt);
>      if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
>          return X86EMUL_UNHANDLEABLE;
> 
>      return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
>                            !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
>  }
> 
> So if I understand this code correctly, hvmemul_rep_ins() performs a few
> checks, and then calls hvmemul_do_pio(), which ends up calling
> hvmemul_do_io(), which seems to discard the write rather unceremoniously
> for weird-sized accesses. This would seem to roughly correspond to just
> returning X86EMUL_UNHANDLEABLE from hvmemul_rep_ins() for that special
> case (with no MMIO code executed).

To clarify, I'm aware that the special case should not happen for the
"rep" functions (hence the ASSERT()); I'm just trying to understand
whether there are cases where it is allowed to drop the other half of
the operation, and whether in our case the handlers could just return
X86EMUL_OKAY as originally. If not, I'll continue exploring
hvmemul_do_io() for a way to do this safely.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-05 15:16   ` Razvan Cojocaru
  2014-08-05 15:27     ` Razvan Cojocaru
@ 2014-08-05 15:43     ` Jan Beulich
  2014-08-06  8:42       ` Razvan Cojocaru
  1 sibling, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-05 15:43 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 05.08.14 at 17:16, <rcojocaru@bitdefender.com> wrote:
> So if I understand this code correctly, hvmemul_rep_ins() performs a few
> checks, and then calls hvmemul_do_pio(), which ends up calling
> hvmemul_do_io(), which seems to discard the write rather unceremoniously
> for weird-sized accesses. This would seem to roughly correspond to just
> returning X86EMUL_UNHANDLEABLE from hvmemul_rep_ins() for that special
> case (with no MMIO code executed).
> 
> Did I misunderstand something?

The main issue is that as long as these functions only act as a backend
to the x86 instruction emulator, "weird size accesses" just can't happen.
I.e. the respective check is just a guard against careless future code
additions. And for such bad cases it is of course appropriate to drop
the entire effect of the instruction - this is largely like a fault happening on
it. And quite different from your case - you want the instruction to
behave normally _except_ for the memory write as I understood so far
(albeit it still escapes me how in the end correct behavior can result).

Furthermore, X86EMUL_UNHANDLEABLE generally results in a fault being
injected into the guest, i.e. such paths aren't suitable for your purposes
iiuc.

Jan

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-05 15:43     ` Jan Beulich
@ 2014-08-06  8:42       ` Razvan Cojocaru
  2014-08-06  8:50         ` Jan Beulich
  0 siblings, 1 reply; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-06  8:42 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/05/2014 06:43 PM, Jan Beulich wrote:
> it. And quite different from your case - you want the instruction to
> behave normally _except_ for the memory write as I understood so far
> (albeit it still escapes me how in the end correct behavior can result).

The scenario in such cases is quite special: the decision to emulate
without writing is usually made for things such as rootkits, and
allowing a rogue application to go ahead with a write generally means
allowing it to do bad things - so in that case the alternative is
preferable.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-06  8:42       ` Razvan Cojocaru
@ 2014-08-06  8:50         ` Jan Beulich
  2014-08-28 11:53           ` Tim Deegan
  0 siblings, 1 reply; 29+ messages in thread
From: Jan Beulich @ 2014-08-06  8:50 UTC (permalink / raw)
  To: Razvan Cojocaru
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

>>> On 06.08.14 at 10:42, <rcojocaru@bitdefender.com> wrote:
> On 08/05/2014 06:43 PM, Jan Beulich wrote:
>> it. And quite different from your case - you want the instruction to
>> behave normally _except_ for the memory write as I understood so far
>> (albeit it still escapes me how in the end correct behavior can result).
> 
> The scenario in such cases is quite special: the decision to emulate
> without writing is usually made for things such as rootkits, and
> allowing a rogue application to go ahead with a write generally means
> allowing it to do bad things - so in that case the alternative is
> preferable.

In which case dropping the side effects of such instructions may
well be the right thing too - then the trivial implementations
you provided would be okay, but you'd have to go further and also
disallow port and MMIO reads.
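
For the remaining rep handler the same trivial treatment would apply,
e.g. (sketch, mirroring the hvmemul_rep_ins() signature; the read
paths would need more careful handling, since .read also serves
ordinary RAM operands):

    static int hvmemul_rep_outs_discard(
        enum x86_segment src_seg,
        unsigned long src_offset,
        uint16_t dst_port,
        unsigned int bytes_per_rep,
        unsigned long *reps,
        struct x86_emulate_ctxt *ctxt)
    {
        return X86EMUL_OKAY;
    }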

Jan

* Re: [PATCH RFC V4 5/5] xen: Handle resumed instruction based on previous mem_event reply
  2014-08-04 14:33   ` Jan Beulich
@ 2014-08-06 14:00     ` Razvan Cojocaru
  0 siblings, 0 replies; 29+ messages in thread
From: Razvan Cojocaru @ 2014-08-06 14:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, stefano.stabellini, andrew.cooper3,
	eddie.dong, xen-devel, jun.nakajima, ian.jackson

On 08/04/2014 05:33 PM, Jan Beulich wrote:
>>>> On 04.08.14 at 13:30, <rcojocaru@bitdefender.com> wrote:
>> In a scenario where a page fault that triggered a mem_event occurred,
>> p2m_mem_access_check() will now be able to either 1) emulate the
>> current instruction, or 2) emulate it without allowing it to perform
>> any writes.
>>
>> Changes since V1:
>>  - Removed the 'skip' code which required computing the current
>>    instruction length.
>>  - Removed the set_ad_bits() code that attempted to modify the
>>    'accessed' and 'dirty' bits for instructions that the emulator
>>    can't handle at the moment.
>>
>> Changes since V2:
>>  - Moved the __vmread(EXIT_QUALIFICATION, &exit_qualification); code
>>    in vmx.c, accessible via hvm_funcs.
>>  - Incorporated changes by Andrew Cooper ("[PATCH 1/2] Xen/mem_event:
>>    Validate the response vcpu_id before acting on it."
>>
>> Changes since V3:
>>  - Collapsed verbose lines into a single "else if()".
>>  - Changed an int to bool_t.
>>  - Fixed a minor coding style issue.
>>  - Now computing the first parameter to hvm_emulate_one_full()
>>    (replaced an "if" with a single call).
>>  - Added code comments about eip and gla reset (clarity issue).
>>  - Removed duplicate code by Andrew Cooper (introduced in V2,
>>    since committed).
>>
>> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
>> ---
>>  xen/arch/x86/domain.c          |    3 ++
>>  xen/arch/x86/hvm/vmx/vmx.c     |   13 ++++++
>>  xen/arch/x86/mm/p2m.c          |   85 ++++++++++++++++++++++++++++++++++++++++
>>  xen/include/asm-x86/domain.h   |    9 +++++
>>  xen/include/asm-x86/hvm/hvm.h  |    2 +
>>  xen/include/public/mem_event.h |   12 +++---
>>  6 files changed, 119 insertions(+), 5 deletions(-)
>>
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index e896210..af9b213 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -407,6 +407,9 @@ int vcpu_initialise(struct vcpu *v)
>>  
>>      v->arch.flags = TF_kernel_mode;
>>  
>> +    /* By default, do not emulate */
>> +    v->arch.mem_event.emulate_flags = 0;
>> +
>>      rc = mapcache_vcpu_init(v);
>>      if ( rc )
>>          return rc;
>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>> index c0e3d73..150fe9f 100644
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -1698,6 +1698,18 @@ static void vmx_enable_intro_msr_interception(struct domain *d)
>>      }
>>  }
>>  
>> +static bool_t vmx_exited_by_pagefault(void)
>> +{
>> +    unsigned long exit_qualification;
>> +
>> +    __vmread(EXIT_QUALIFICATION, &exit_qualification);
>> +
>> +    if ( (exit_qualification & EPT_GLA_FAULT) == 0 )
>> +        return 0;
>> +
>> +    return 1;
>> +}
>> +
>>  static struct hvm_function_table __initdata vmx_function_table = {
>>      .name                 = "VMX",
>>      .cpu_up_prepare       = vmx_cpu_up_prepare,
>> @@ -1757,6 +1769,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
>>      .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
>>      .hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
>>      .enable_intro_msr_interception = vmx_enable_intro_msr_interception,
>> +    .exited_by_pagefault  = vmx_exited_by_pagefault,
>>  };
>>  
>>  const struct hvm_function_table * __init start_vmx(void)
>> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
>> index 069e869..da1bc2d 100644
>> --- a/xen/arch/x86/mm/p2m.c
>> +++ b/xen/arch/x86/mm/p2m.c
>> @@ -1391,6 +1391,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
>>      p2m_access_t p2ma;
>>      mem_event_request_t *req;
>>      int rc;
>> +    unsigned long eip = guest_cpu_user_regs()->eip;
>>  
>>      /* First, handle rx2rw conversion automatically.
>>       * These calls to p2m->set_entry() must succeed: we have the gfn
>> @@ -1443,6 +1444,36 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
>>              return 1;
>>          }
>>      }
>> +    else if ( hvm_funcs.exited_by_pagefault && !hvm_funcs.exited_by_pagefault() ) /* don't send a mem_event */
> 
> DYM
> 
>     else if ( !hvm_funcs.exited_by_pagefault || !hvm_funcs.exited_by_pagefault() )

Well, no. With that condition, if hvm_funcs.exited_by_pagefault == 0
(which is the SVM case now), no mem_event would be sent - we'd just
emulate the current instruction. In other words, in the SVM case no
mem_event would ever be sent and everything would be emulated.

With the original code, if hvm_funcs.exited_by_pagefault is not set,
i.e. in the SVM case, _all_ mem_events are being sent out (even those
that happened when exiting by nested pagefault). I'm not sure what the
status of SVM mem_event is at the moment, but it seemed the safer choice.
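
With the line wrapped, the intended behaviour spelled out (sketch):

    /*
     * Hook absent (SVM):            send the mem_event.
     * Hook present, pagefault exit: send the mem_event.
     * Hook present, other exit:     emulate, don't send.
     */
    else if ( hvm_funcs.exited_by_pagefault &&
              !hvm_funcs.exited_by_pagefault() )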

Sorry for the late reply, I've lost track of this question while
answering others.


Thanks,
Razvan Cojocaru

* Re: [PATCH RFC V4 1/5] xen: Emulate with no writes
  2014-08-06  8:50         ` Jan Beulich
@ 2014-08-28 11:53           ` Tim Deegan
  0 siblings, 0 replies; 29+ messages in thread
From: Tim Deegan @ 2014-08-28 11:53 UTC (permalink / raw)
  To: Jan Beulich
  Cc: kevin.tian, ian.campbell, Razvan Cojocaru, stefano.stabellini,
	andrew.cooper3, eddie.dong, xen-devel, jun.nakajima, ian.jackson

At 09:50 +0100 on 06 Aug (1407315030), Jan Beulich wrote:
> >>> On 06.08.14 at 10:42, <rcojocaru@bitdefender.com> wrote:
> > On 08/05/2014 06:43 PM, Jan Beulich wrote:
> >> it. And quite different from your case - you want the instruction to
> >> behave normally _except_ for the memory write as I understood so far
> >> (albeit it still escapes me how in the end correct behavior can result).
> > 
> > The scenario in such cases is quite special: the decision to emulate
> > without writing is usually made for things such as rootkits, and
> > allowing a rogue application to go ahead with a write generally means
> > allowing it to do bad things - so in that case the alternative is
> > preferable.
> 
> In which case dropping the side effects of such instructions may
> well be the right thing too - then the trivial implementations
> you provided would be okay, but you'd have to go further and also
> disallow port and MMIO reads.

+1.

A related point - you might want to emulate the read and test parts
of cmpxchg (but OTOH the sorts of things that CMPXCHG is used for are
likely to misbehave badly with the write discarded anyway).

Tim.
