Subject: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC
  To: xen-devel; +Cc: tim, Razvan Cojocaru

Added support for emulating an instruction with no memory writes and
for retrieving the length of the current instruction. Additionally,
introduced hvm_emulate_one_full(bool_t nowrite), which handles all
possible return values from hvm_emulate_one() and
hvm_emulate_one_no_write() (RETRY, EXCEPTION, UNHANDLEABLE).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
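Usage note (illustration only, not part of the patch): the sketch below
shows how the new entry points might be driven by a caller running in the
context of the trapping vCPU, e.g. an introspection event handler. The
wrapper names are hypothetical; only hvm_emulate_one_full(),
hvm_get_insn_length(), hvm_emulate_prepare() and guest_cpu_user_regs()
are taken from this patch or from existing code.

    /* Hypothetical caller -- for illustration only, not part of this patch. */
    static void example_emulate_current_insn(bool_t deny_write)
    {
        /*
         * Emulate the instruction the trapping vCPU is about to execute.
         * Passing nowrite = 1 selects the no-write emulator, so any memory
         * writes the instruction performs are silently discarded; RETRY,
         * EXCEPTION and UNHANDLEABLE are all handled internally.
         */
        hvm_emulate_one_full(deny_write);
    }

    /* Hypothetical helper: length in bytes of the current instruction. */
    static int example_current_insn_length(void)
    {
        struct hvm_emulate_ctxt ctxt;

        hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
        return hvm_get_insn_length(&ctxt);
    }
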
 xen/arch/x86/Makefile             |    2 +
 xen/arch/x86/hvm/emulate.c        |  192 +++++++
 xen/arch/x86/inat-tables.c        | 1130 +++++++++++++++++++++++++++++++++++++
 xen/arch/x86/inat.c               |   96 ++++
 xen/arch/x86/insn.c               |  576 +++++++++++++++++++
 xen/include/asm-x86/hvm/emulate.h |    5 +
 xen/include/asm-x86/inat.h        |  221 ++++++++
 xen/include/asm-x86/inat_types.h  |   29 +
 xen/include/asm-x86/insn.h        |  199 +++++++
 9 files changed, 2450 insertions(+)
 create mode 100644 xen/arch/x86/inat-tables.c
 create mode 100644 xen/arch/x86/inat.c
 create mode 100644 xen/arch/x86/insn.c
 create mode 100644 xen/include/asm-x86/inat.h
 create mode 100644 xen/include/asm-x86/inat_types.h
 create mode 100644 xen/include/asm-x86/insn.h

diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 6c90b1b..30829a0 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -59,6 +59,8 @@ obj-y += crash.o
 obj-y += tboot.o
 obj-y += hpet.o
 obj-y += xstate.o
+obj-y += insn.o
+obj-y += inat.o
 
 obj-$(crash_debug) += gdbstub.o
 
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index eac159f..1dc8c67 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -21,6 +21,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/trace.h>
 #include <asm/hvm/support.h>
+#include <asm/insn.h>
 
 static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
 {
@@ -688,6 +689,17 @@ static int hvmemul_write(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_write_dummy(
+    enum x86_segment __attribute__((unused)) seg,
+    unsigned long __attribute__((unused)) offset,
+    void __attribute__((unused)) *p_data,
+    unsigned int __attribute__((unused)) bytes,
+    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)
+{
+    /* discarding the write */
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1239,6 +1251,139 @@ int hvm_emulate_one(
     return X86EMUL_OKAY;
 }
 
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t new_intr_shadow, pfec = PFEC_page_present;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    struct x86_emulate_ops local_ops = hvm_emulate_ops;
+    unsigned long addr;
+    int rc;
+
+    if ( hvm_long_mode_enabled(curr) &&
+         hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
+    {
+        hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->ctxt.sp_size = 64;
+    }
+    else
+    {
+        hvmemul_ctxt->ctxt.addr_size =
+            hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.db ? 32 : 16;
+        hvmemul_ctxt->ctxt.sp_size =
+            hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ? 32 : 16;
+    }
+
+    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    hvmemul_ctxt->insn_buf_eip = regs->eip;
+    if ( !vio->mmio_insn_bytes )
+    {
+        hvmemul_ctxt->insn_buf_bytes =
+            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
+            (hvm_virtual_to_linear_addr(x86_seg_cs,
+                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
+                                        regs->eip,
+                                        sizeof(hvmemul_ctxt->insn_buf),
+                                        hvm_access_insn_fetch,
+                                        hvmemul_ctxt->ctxt.addr_size,
+                                        &addr) &&
+             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
+                                               sizeof(hvmemul_ctxt->insn_buf),
+                                               pfec) == HVMCOPY_okay) ?
+            sizeof(hvmemul_ctxt->insn_buf) : 0;
+    }
+    else
+    {
+        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
+    }
+
+    hvmemul_ctxt->exn_pending = 0;
+    vio->mmio_retrying = vio->mmio_retry;
+    vio->mmio_retry = 0;
+
+    local_ops.write = hvmemul_write_dummy;
+    rc = x86_emulate(&hvmemul_ctxt->ctxt, &local_ops);
+
+    if ( rc == X86EMUL_OKAY && vio->mmio_retry )
+        rc = X86EMUL_RETRY;
+    if ( rc != X86EMUL_RETRY )
+    {
+        vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
+        vio->mmio_insn_bytes = 0;
+    }
+    else
+    {
+        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
+        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
+        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+    }
+
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    new_intr_shadow = hvmemul_ctxt->intr_shadow;
+
+    /* MOV-SS instruction toggles MOV-SS shadow, else we just clear it. */
+    if ( hvmemul_ctxt->ctxt.retire.flags.mov_ss )
+        new_intr_shadow ^= HVM_INTR_SHADOW_MOV_SS;
+    else
+        new_intr_shadow &= ~HVM_INTR_SHADOW_MOV_SS;
+
+    /* STI instruction toggles STI shadow, else we just clear it. */
+    if ( hvmemul_ctxt->ctxt.retire.flags.sti )
+        new_intr_shadow ^= HVM_INTR_SHADOW_STI;
+    else
+        new_intr_shadow &= ~HVM_INTR_SHADOW_STI;
+
+    if ( hvmemul_ctxt->intr_shadow != new_intr_shadow )
+    {
+        hvmemul_ctxt->intr_shadow = new_intr_shadow;
+        hvm_funcs.set_interrupt_shadow(curr, new_intr_shadow);
+    }
+
+    if ( hvmemul_ctxt->ctxt.retire.flags.hlt &&
+         !hvm_local_events_need_delivery(curr) )
+    {
+        hvm_hlt(regs->eflags);
+    }
+
+    return X86EMUL_OKAY;
+}
+
+void hvm_emulate_one_full(bool_t nowrite)
+{
+    struct hvm_emulate_ctxt ctx[1] = {};
+    int rc = X86EMUL_RETRY;
+
+    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
+
+    while ( rc == X86EMUL_RETRY )
+    {
+        if ( nowrite )
+            rc = hvm_emulate_one_no_write(ctx);
+        else
+            rc = hvm_emulate_one(ctx);
+    }
+
+    switch ( rc )
+    {
+    case X86EMUL_UNHANDLEABLE:
+        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+        break;
+    case X86EMUL_EXCEPTION:
+        if ( ctx->exn_pending )
+            hvm_inject_hw_exception(ctx->exn_vector, ctx->exn_error_code);
+        /* fall through */
+    default:
+        hvm_emulate_writeback(ctx);
+        break;
+    }
+}
+
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs)
@@ -1278,6 +1423,53 @@ struct segment_register *hvmemul_get_seg_reg(
     return &hvmemul_ctxt->seg_reg[seg];
 }
 
+int hvm_get_insn_length(
+    struct hvm_emulate_ctxt *hvmemul_ctxt)
+{
+    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
+    struct vcpu *curr = current;
+    uint32_t pfec = PFEC_page_present;
+    unsigned long addr;
+    struct x86_emulate_ops local_ops = hvm_emulate_ops;
+    struct insn insn;
+
+    local_ops.write = hvmemul_write_dummy;
+
+    if ( hvm_long_mode_enabled(curr) &&
+        hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
+        hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->ctxt.sp_size = 64;
+    else
+    {
+        hvmemul_ctxt->ctxt.addr_size =
+            hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.db ? 32 : 16;
+        hvmemul_ctxt->ctxt.sp_size =
+            hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ? 32 : 16;
+    }
+
+    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    hvmemul_ctxt->insn_buf_eip = regs->eip;
+    hvmemul_ctxt->insn_buf_bytes =
+        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
+        ? :
+        (hvm_virtual_to_linear_addr(
+            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
+            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
+            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
+        !hvm_fetch_from_guest_virt_nofault(
+            hvmemul_ctxt->insn_buf, addr,
+            sizeof(hvmemul_ctxt->insn_buf), pfec))
+    ? sizeof(hvmemul_ctxt->insn_buf) : 0;
+
+    hvmemul_ctxt->exn_pending = 0;
+
+    insn_init(&insn, hvmemul_ctxt->insn_buf, hvm_long_mode_enabled(curr));
+    insn_get_length(&insn);
+
+    return insn.length;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/inat-tables.c b/xen/arch/x86/inat-tables.c
new file mode 100644
index 0000000..39252c3
--- /dev/null
+++ b/xen/arch/x86/inat-tables.c
@@ -0,0 +1,1130 @@
+/* x86 opcode map generated from x86-opcode-map.txt */
+/* Do not change this code. */
+
+/* Table: one byte opcode */
+const insn_attr_t inat_primary_table[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_MODRM,
+	[0x01] = INAT_MODRM,
+	[0x02] = INAT_MODRM,
+	[0x03] = INAT_MODRM,
+	[0x04] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x05] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x08] = INAT_MODRM,
+	[0x09] = INAT_MODRM,
+	[0x0a] = INAT_MODRM,
+	[0x0b] = INAT_MODRM,
+	[0x0c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x0d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x0f] = INAT_MAKE_ESCAPE(1),
+	[0x10] = INAT_MODRM,
+	[0x11] = INAT_MODRM,
+	[0x12] = INAT_MODRM,
+	[0x13] = INAT_MODRM,
+	[0x14] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x15] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x18] = INAT_MODRM,
+	[0x19] = INAT_MODRM,
+	[0x1a] = INAT_MODRM,
+	[0x1b] = INAT_MODRM,
+	[0x1c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x1d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x20] = INAT_MODRM,
+	[0x21] = INAT_MODRM,
+	[0x22] = INAT_MODRM,
+	[0x23] = INAT_MODRM,
+	[0x24] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x25] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x26] = INAT_MAKE_PREFIX(INAT_PFX_ES),
+	[0x28] = INAT_MODRM,
+	[0x29] = INAT_MODRM,
+	[0x2a] = INAT_MODRM,
+	[0x2b] = INAT_MODRM,
+	[0x2c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x2d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x2e] = INAT_MAKE_PREFIX(INAT_PFX_CS),
+	[0x30] = INAT_MODRM,
+	[0x31] = INAT_MODRM,
+	[0x32] = INAT_MODRM,
+	[0x33] = INAT_MODRM,
+	[0x34] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x35] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x36] = INAT_MAKE_PREFIX(INAT_PFX_SS),
+	[0x38] = INAT_MODRM,
+	[0x39] = INAT_MODRM,
+	[0x3a] = INAT_MODRM,
+	[0x3b] = INAT_MODRM,
+	[0x3c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x3d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0x3e] = INAT_MAKE_PREFIX(INAT_PFX_DS),
+	[0x40] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x41] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x42] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x43] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x44] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x45] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x46] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x47] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x48] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x49] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4a] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4b] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4c] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4d] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4e] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x4f] = INAT_MAKE_PREFIX(INAT_PFX_REX),
+	[0x50] = INAT_FORCE64,
+	[0x51] = INAT_FORCE64,
+	[0x52] = INAT_FORCE64,
+	[0x53] = INAT_FORCE64,
+	[0x54] = INAT_FORCE64,
+	[0x55] = INAT_FORCE64,
+	[0x56] = INAT_FORCE64,
+	[0x57] = INAT_FORCE64,
+	[0x58] = INAT_FORCE64,
+	[0x59] = INAT_FORCE64,
+	[0x5a] = INAT_FORCE64,
+	[0x5b] = INAT_FORCE64,
+	[0x5c] = INAT_FORCE64,
+	[0x5d] = INAT_FORCE64,
+	[0x5e] = INAT_FORCE64,
+	[0x5f] = INAT_FORCE64,
+	[0x62] = INAT_MODRM,
+	[0x63] = INAT_MODRM | INAT_MODRM,
+	[0x64] = INAT_MAKE_PREFIX(INAT_PFX_FS),
+	[0x65] = INAT_MAKE_PREFIX(INAT_PFX_GS),
+	[0x66] = INAT_MAKE_PREFIX(INAT_PFX_OPNDSZ),
+	[0x67] = INAT_MAKE_PREFIX(INAT_PFX_ADDRSZ),
+	[0x68] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x69] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM,
+	[0x6a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0x6b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
+	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x71] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x72] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x73] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x74] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x75] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x76] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x77] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x78] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x79] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7a] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7b] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7d] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7e] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x7f] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0x80] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
+	[0x81] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM | INAT_MAKE_GROUP(1),
+	[0x82] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
+	[0x83] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
+	[0x84] = INAT_MODRM,
+	[0x85] = INAT_MODRM,
+	[0x86] = INAT_MODRM,
+	[0x87] = INAT_MODRM,
+	[0x88] = INAT_MODRM,
+	[0x89] = INAT_MODRM,
+	[0x8a] = INAT_MODRM,
+	[0x8b] = INAT_MODRM,
+	[0x8c] = INAT_MODRM,
+	[0x8d] = INAT_MODRM,
+	[0x8e] = INAT_MODRM,
+	[0x8f] = INAT_MAKE_GROUP(2) | INAT_MODRM | INAT_FORCE64,
+	[0x9a] = INAT_MAKE_IMM(INAT_IMM_PTR),
+	[0x9c] = INAT_FORCE64,
+	[0x9d] = INAT_FORCE64,
+	[0xa0] = INAT_MOFFSET,
+	[0xa1] = INAT_MOFFSET,
+	[0xa2] = INAT_MOFFSET,
+	[0xa3] = INAT_MOFFSET,
+	[0xa8] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xa9] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
+	[0xb0] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb1] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb2] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb3] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb6] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb7] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xb8] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xb9] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xba] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xbb] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xbc] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xbd] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xbe] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xbf] = INAT_MAKE_IMM(INAT_IMM_VWORD),
+	[0xc0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xc1] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xc2] = INAT_MAKE_IMM(INAT_IMM_WORD) | INAT_FORCE64,
+	[0xc4] = INAT_MODRM | INAT_MAKE_PREFIX(INAT_PFX_VEX3),
+	[0xc5] = INAT_MODRM | INAT_MAKE_PREFIX(INAT_PFX_VEX2),
+	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(4),
+	[0xc7] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM | INAT_MAKE_GROUP(4),
+	[0xc8] = INAT_MAKE_IMM(INAT_IMM_WORD) | INAT_SCNDIMM,
+	[0xc9] = INAT_FORCE64,
+	[0xca] = INAT_MAKE_IMM(INAT_IMM_WORD),
+	[0xcd] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xd0] = INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xd1] = INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xd2] = INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xd3] = INAT_MODRM | INAT_MAKE_GROUP(3),
+	[0xd4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xd5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xd8] = INAT_MODRM,
+	[0xd9] = INAT_MODRM,
+	[0xda] = INAT_MODRM,
+	[0xdb] = INAT_MODRM,
+	[0xdc] = INAT_MODRM,
+	[0xdd] = INAT_MODRM,
+	[0xde] = INAT_MODRM,
+	[0xdf] = INAT_MODRM,
+	[0xe0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0xe1] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0xe2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0xe3] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0xe4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xe5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xe6] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xe7] = INAT_MAKE_IMM(INAT_IMM_BYTE),
+	[0xe8] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0xe9] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0xea] = INAT_MAKE_IMM(INAT_IMM_PTR),
+	[0xeb] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
+	[0xf0] = INAT_MAKE_PREFIX(INAT_PFX_LOCK),
+	[0xf2] = INAT_MAKE_PREFIX(INAT_PFX_REPNE),
+	[0xf3] = INAT_MAKE_PREFIX(INAT_PFX_REPE),
+	[0xf6] = INAT_MODRM | INAT_MAKE_GROUP(5),
+	[0xf7] = INAT_MODRM | INAT_MAKE_GROUP(6),
+	[0xfe] = INAT_MAKE_GROUP(7),
+	[0xff] = INAT_MAKE_GROUP(8),
+};
+
+/* Table: 2-byte opcode (0x0f) */
+const insn_attr_t inat_escape_table_1[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_MAKE_GROUP(9),
+	[0x01] = INAT_MAKE_GROUP(10),
+	[0x02] = INAT_MODRM,
+	[0x03] = INAT_MODRM,
+	[0x0d] = INAT_MODRM | INAT_MAKE_GROUP(11),
+	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
+	[0x10] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x11] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x12] = INAT_MODRM | INAT_VEXOK | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x13] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x14] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x15] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x16] = INAT_MODRM | INAT_VEXOK | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x17] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x18] = INAT_MAKE_GROUP(12),
+	[0x1f] = INAT_MODRM,
+	[0x20] = INAT_MODRM,
+	[0x21] = INAT_MODRM,
+	[0x22] = INAT_MODRM,
+	[0x23] = INAT_MODRM,
+	[0x28] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x29] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x2a] = INAT_MODRM | INAT_VARIANT,
+	[0x2b] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x2c] = INAT_MODRM | INAT_VARIANT,
+	[0x2d] = INAT_MODRM | INAT_VARIANT,
+	[0x2e] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x2f] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x38] = INAT_MAKE_ESCAPE(2),
+	[0x3a] = INAT_MAKE_ESCAPE(3),
+	[0x40] = INAT_MODRM,
+	[0x41] = INAT_MODRM,
+	[0x42] = INAT_MODRM,
+	[0x43] = INAT_MODRM,
+	[0x44] = INAT_MODRM,
+	[0x45] = INAT_MODRM,
+	[0x46] = INAT_MODRM,
+	[0x47] = INAT_MODRM,
+	[0x48] = INAT_MODRM,
+	[0x49] = INAT_MODRM,
+	[0x4a] = INAT_MODRM,
+	[0x4b] = INAT_MODRM,
+	[0x4c] = INAT_MODRM,
+	[0x4d] = INAT_MODRM,
+	[0x4e] = INAT_MODRM,
+	[0x4f] = INAT_MODRM,
+	[0x50] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x51] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x52] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x53] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x54] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x55] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x56] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x57] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x58] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x59] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5a] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5b] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5c] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5d] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5e] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x5f] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x60] = INAT_MODRM | INAT_VARIANT,
+	[0x61] = INAT_MODRM | INAT_VARIANT,
+	[0x62] = INAT_MODRM | INAT_VARIANT,
+	[0x63] = INAT_MODRM | INAT_VARIANT,
+	[0x64] = INAT_MODRM | INAT_VARIANT,
+	[0x65] = INAT_MODRM | INAT_VARIANT,
+	[0x66] = INAT_MODRM | INAT_VARIANT,
+	[0x67] = INAT_MODRM | INAT_VARIANT,
+	[0x68] = INAT_MODRM | INAT_VARIANT,
+	[0x69] = INAT_MODRM | INAT_VARIANT,
+	[0x6a] = INAT_MODRM | INAT_VARIANT,
+	[0x6b] = INAT_MODRM | INAT_VARIANT,
+	[0x6c] = INAT_VARIANT,
+	[0x6d] = INAT_VARIANT,
+	[0x6e] = INAT_MODRM | INAT_VARIANT,
+	[0x6f] = INAT_MODRM | INAT_VARIANT,
+	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x71] = INAT_MAKE_GROUP(13),
+	[0x72] = INAT_MAKE_GROUP(14),
+	[0x73] = INAT_MAKE_GROUP(15),
+	[0x74] = INAT_MODRM | INAT_VARIANT,
+	[0x75] = INAT_MODRM | INAT_VARIANT,
+	[0x76] = INAT_MODRM | INAT_VARIANT,
+	[0x77] = INAT_VEXOK | INAT_VEXOK,
+	[0x78] = INAT_MODRM,
+	[0x79] = INAT_MODRM,
+	[0x7c] = INAT_VARIANT,
+	[0x7d] = INAT_VARIANT,
+	[0x7e] = INAT_MODRM | INAT_VARIANT,
+	[0x7f] = INAT_MODRM | INAT_VARIANT,
+	[0x80] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x81] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x82] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x83] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x84] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x85] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x86] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x87] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x88] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x89] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8a] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8b] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8c] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8d] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8e] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x8f] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
+	[0x90] = INAT_MODRM,
+	[0x91] = INAT_MODRM,
+	[0x92] = INAT_MODRM,
+	[0x93] = INAT_MODRM,
+	[0x94] = INAT_MODRM,
+	[0x95] = INAT_MODRM,
+	[0x96] = INAT_MODRM,
+	[0x97] = INAT_MODRM,
+	[0x98] = INAT_MODRM,
+	[0x99] = INAT_MODRM,
+	[0x9a] = INAT_MODRM,
+	[0x9b] = INAT_MODRM,
+	[0x9c] = INAT_MODRM,
+	[0x9d] = INAT_MODRM,
+	[0x9e] = INAT_MODRM,
+	[0x9f] = INAT_MODRM,
+	[0xa0] = INAT_FORCE64,
+	[0xa1] = INAT_FORCE64,
+	[0xa3] = INAT_MODRM,
+	[0xa4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
+	[0xa5] = INAT_MODRM,
+	[0xa6] = INAT_MAKE_GROUP(16),
+	[0xa7] = INAT_MAKE_GROUP(17),
+	[0xa8] = INAT_FORCE64,
+	[0xa9] = INAT_FORCE64,
+	[0xab] = INAT_MODRM,
+	[0xac] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
+	[0xad] = INAT_MODRM,
+	[0xae] = INAT_MAKE_GROUP(18),
+	[0xaf] = INAT_MODRM,
+	[0xb0] = INAT_MODRM,
+	[0xb1] = INAT_MODRM,
+	[0xb2] = INAT_MODRM,
+	[0xb3] = INAT_MODRM,
+	[0xb4] = INAT_MODRM,
+	[0xb5] = INAT_MODRM,
+	[0xb6] = INAT_MODRM,
+	[0xb7] = INAT_MODRM,
+	[0xb8] = INAT_VARIANT,
+	[0xb9] = INAT_MAKE_GROUP(19),
+	[0xba] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(20),
+	[0xbb] = INAT_MODRM,
+	[0xbc] = INAT_MODRM | INAT_VARIANT,
+	[0xbd] = INAT_MODRM | INAT_VARIANT,
+	[0xbe] = INAT_MODRM,
+	[0xbf] = INAT_MODRM,
+	[0xc0] = INAT_MODRM,
+	[0xc1] = INAT_MODRM,
+	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0xc3] = INAT_MODRM,
+	[0xc4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0xc5] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0xc7] = INAT_MAKE_GROUP(21),
+	[0xd0] = INAT_VARIANT,
+	[0xd1] = INAT_MODRM | INAT_VARIANT,
+	[0xd2] = INAT_MODRM | INAT_VARIANT,
+	[0xd3] = INAT_MODRM | INAT_VARIANT,
+	[0xd4] = INAT_MODRM | INAT_VARIANT,
+	[0xd5] = INAT_MODRM | INAT_VARIANT,
+	[0xd6] = INAT_VARIANT,
+	[0xd7] = INAT_MODRM | INAT_VARIANT,
+	[0xd8] = INAT_MODRM | INAT_VARIANT,
+	[0xd9] = INAT_MODRM | INAT_VARIANT,
+	[0xda] = INAT_MODRM | INAT_VARIANT,
+	[0xdb] = INAT_MODRM | INAT_VARIANT,
+	[0xdc] = INAT_MODRM | INAT_VARIANT,
+	[0xdd] = INAT_MODRM | INAT_VARIANT,
+	[0xde] = INAT_MODRM | INAT_VARIANT,
+	[0xdf] = INAT_MODRM | INAT_VARIANT,
+	[0xe0] = INAT_MODRM | INAT_VARIANT,
+	[0xe1] = INAT_MODRM | INAT_VARIANT,
+	[0xe2] = INAT_MODRM | INAT_VARIANT,
+	[0xe3] = INAT_MODRM | INAT_VARIANT,
+	[0xe4] = INAT_MODRM | INAT_VARIANT,
+	[0xe5] = INAT_MODRM | INAT_VARIANT,
+	[0xe6] = INAT_VARIANT,
+	[0xe7] = INAT_MODRM | INAT_VARIANT,
+	[0xe8] = INAT_MODRM | INAT_VARIANT,
+	[0xe9] = INAT_MODRM | INAT_VARIANT,
+	[0xea] = INAT_MODRM | INAT_VARIANT,
+	[0xeb] = INAT_MODRM | INAT_VARIANT,
+	[0xec] = INAT_MODRM | INAT_VARIANT,
+	[0xed] = INAT_MODRM | INAT_VARIANT,
+	[0xee] = INAT_MODRM | INAT_VARIANT,
+	[0xef] = INAT_MODRM | INAT_VARIANT,
+	[0xf0] = INAT_VARIANT,
+	[0xf1] = INAT_MODRM | INAT_VARIANT,
+	[0xf2] = INAT_MODRM | INAT_VARIANT,
+	[0xf3] = INAT_MODRM | INAT_VARIANT,
+	[0xf4] = INAT_MODRM | INAT_VARIANT,
+	[0xf5] = INAT_MODRM | INAT_VARIANT,
+	[0xf6] = INAT_MODRM | INAT_VARIANT,
+	[0xf7] = INAT_MODRM | INAT_VARIANT,
+	[0xf8] = INAT_MODRM | INAT_VARIANT,
+	[0xf9] = INAT_MODRM | INAT_VARIANT,
+	[0xfa] = INAT_MODRM | INAT_VARIANT,
+	[0xfb] = INAT_MODRM | INAT_VARIANT,
+	[0xfc] = INAT_MODRM | INAT_VARIANT,
+	[0xfd] = INAT_MODRM | INAT_VARIANT,
+	[0xfe] = INAT_MODRM | INAT_VARIANT,
+};
+const insn_attr_t inat_escape_table_1_1[INAT_OPCODE_TABLE_SIZE] = {
+	[0x10] = INAT_MODRM | INAT_VEXOK,
+	[0x11] = INAT_MODRM | INAT_VEXOK,
+	[0x12] = INAT_MODRM | INAT_VEXOK,
+	[0x13] = INAT_MODRM | INAT_VEXOK,
+	[0x14] = INAT_MODRM | INAT_VEXOK,
+	[0x15] = INAT_MODRM | INAT_VEXOK,
+	[0x16] = INAT_MODRM | INAT_VEXOK,
+	[0x17] = INAT_MODRM | INAT_VEXOK,
+	[0x28] = INAT_MODRM | INAT_VEXOK,
+	[0x29] = INAT_MODRM | INAT_VEXOK,
+	[0x2a] = INAT_MODRM,
+	[0x2b] = INAT_MODRM | INAT_VEXOK,
+	[0x2c] = INAT_MODRM,
+	[0x2d] = INAT_MODRM,
+	[0x2e] = INAT_MODRM | INAT_VEXOK,
+	[0x2f] = INAT_MODRM | INAT_VEXOK,
+	[0x50] = INAT_MODRM | INAT_VEXOK,
+	[0x51] = INAT_MODRM | INAT_VEXOK,
+	[0x54] = INAT_MODRM | INAT_VEXOK,
+	[0x55] = INAT_MODRM | INAT_VEXOK,
+	[0x56] = INAT_MODRM | INAT_VEXOK,
+	[0x57] = INAT_MODRM | INAT_VEXOK,
+	[0x58] = INAT_MODRM | INAT_VEXOK,
+	[0x59] = INAT_MODRM | INAT_VEXOK,
+	[0x5a] = INAT_MODRM | INAT_VEXOK,
+	[0x5b] = INAT_MODRM | INAT_VEXOK,
+	[0x5c] = INAT_MODRM | INAT_VEXOK,
+	[0x5d] = INAT_MODRM | INAT_VEXOK,
+	[0x5e] = INAT_MODRM | INAT_VEXOK,
+	[0x5f] = INAT_MODRM | INAT_VEXOK,
+	[0x60] = INAT_MODRM | INAT_VEXOK,
+	[0x61] = INAT_MODRM | INAT_VEXOK,
+	[0x62] = INAT_MODRM | INAT_VEXOK,
+	[0x63] = INAT_MODRM | INAT_VEXOK,
+	[0x64] = INAT_MODRM | INAT_VEXOK,
+	[0x65] = INAT_MODRM | INAT_VEXOK,
+	[0x66] = INAT_MODRM | INAT_VEXOK,
+	[0x67] = INAT_MODRM | INAT_VEXOK,
+	[0x68] = INAT_MODRM | INAT_VEXOK,
+	[0x69] = INAT_MODRM | INAT_VEXOK,
+	[0x6a] = INAT_MODRM | INAT_VEXOK,
+	[0x6b] = INAT_MODRM | INAT_VEXOK,
+	[0x6c] = INAT_MODRM | INAT_VEXOK,
+	[0x6d] = INAT_MODRM | INAT_VEXOK,
+	[0x6e] = INAT_MODRM | INAT_VEXOK,
+	[0x6f] = INAT_MODRM | INAT_VEXOK,
+	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x74] = INAT_MODRM | INAT_VEXOK,
+	[0x75] = INAT_MODRM | INAT_VEXOK,
+	[0x76] = INAT_MODRM | INAT_VEXOK,
+	[0x7c] = INAT_MODRM | INAT_VEXOK,
+	[0x7d] = INAT_MODRM | INAT_VEXOK,
+	[0x7e] = INAT_MODRM | INAT_VEXOK,
+	[0x7f] = INAT_MODRM | INAT_VEXOK,
+	[0xbc] = INAT_MODRM,
+	[0xbd] = INAT_MODRM,
+	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xc4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xc5] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xd0] = INAT_MODRM | INAT_VEXOK,
+	[0xd1] = INAT_MODRM | INAT_VEXOK,
+	[0xd2] = INAT_MODRM | INAT_VEXOK,
+	[0xd3] = INAT_MODRM | INAT_VEXOK,
+	[0xd4] = INAT_MODRM | INAT_VEXOK,
+	[0xd5] = INAT_MODRM | INAT_VEXOK,
+	[0xd6] = INAT_MODRM | INAT_VEXOK,
+	[0xd7] = INAT_MODRM | INAT_VEXOK,
+	[0xd8] = INAT_MODRM | INAT_VEXOK,
+	[0xd9] = INAT_MODRM | INAT_VEXOK,
+	[0xda] = INAT_MODRM | INAT_VEXOK,
+	[0xdb] = INAT_MODRM | INAT_VEXOK,
+	[0xdc] = INAT_MODRM | INAT_VEXOK,
+	[0xdd] = INAT_MODRM | INAT_VEXOK,
+	[0xde] = INAT_MODRM | INAT_VEXOK,
+	[0xdf] = INAT_MODRM | INAT_VEXOK,
+	[0xe0] = INAT_MODRM | INAT_VEXOK,
+	[0xe1] = INAT_MODRM | INAT_VEXOK,
+	[0xe2] = INAT_MODRM | INAT_VEXOK,
+	[0xe3] = INAT_MODRM | INAT_VEXOK,
+	[0xe4] = INAT_MODRM | INAT_VEXOK,
+	[0xe5] = INAT_MODRM | INAT_VEXOK,
+	[0xe6] = INAT_MODRM | INAT_VEXOK,
+	[0xe7] = INAT_MODRM | INAT_VEXOK,
+	[0xe8] = INAT_MODRM | INAT_VEXOK,
+	[0xe9] = INAT_MODRM | INAT_VEXOK,
+	[0xea] = INAT_MODRM | INAT_VEXOK,
+	[0xeb] = INAT_MODRM | INAT_VEXOK,
+	[0xec] = INAT_MODRM | INAT_VEXOK,
+	[0xed] = INAT_MODRM | INAT_VEXOK,
+	[0xee] = INAT_MODRM | INAT_VEXOK,
+	[0xef] = INAT_MODRM | INAT_VEXOK,
+	[0xf1] = INAT_MODRM | INAT_VEXOK,
+	[0xf2] = INAT_MODRM | INAT_VEXOK,
+	[0xf3] = INAT_MODRM | INAT_VEXOK,
+	[0xf4] = INAT_MODRM | INAT_VEXOK,
+	[0xf5] = INAT_MODRM | INAT_VEXOK,
+	[0xf6] = INAT_MODRM | INAT_VEXOK,
+	[0xf7] = INAT_MODRM | INAT_VEXOK,
+	[0xf8] = INAT_MODRM | INAT_VEXOK,
+	[0xf9] = INAT_MODRM | INAT_VEXOK,
+	[0xfa] = INAT_MODRM | INAT_VEXOK,
+	[0xfb] = INAT_MODRM | INAT_VEXOK,
+	[0xfc] = INAT_MODRM | INAT_VEXOK,
+	[0xfd] = INAT_MODRM | INAT_VEXOK,
+	[0xfe] = INAT_MODRM | INAT_VEXOK,
+};
+const insn_attr_t inat_escape_table_1_2[INAT_OPCODE_TABLE_SIZE] = {
+	[0x10] = INAT_MODRM | INAT_VEXOK,
+	[0x11] = INAT_MODRM | INAT_VEXOK,
+	[0x12] = INAT_MODRM | INAT_VEXOK,
+	[0x16] = INAT_MODRM | INAT_VEXOK,
+	[0x2a] = INAT_MODRM | INAT_VEXOK,
+	[0x2c] = INAT_MODRM | INAT_VEXOK,
+	[0x2d] = INAT_MODRM | INAT_VEXOK,
+	[0x51] = INAT_MODRM | INAT_VEXOK,
+	[0x52] = INAT_MODRM | INAT_VEXOK,
+	[0x53] = INAT_MODRM | INAT_VEXOK,
+	[0x58] = INAT_MODRM | INAT_VEXOK,
+	[0x59] = INAT_MODRM | INAT_VEXOK,
+	[0x5a] = INAT_MODRM | INAT_VEXOK,
+	[0x5b] = INAT_MODRM | INAT_VEXOK,
+	[0x5c] = INAT_MODRM | INAT_VEXOK,
+	[0x5d] = INAT_MODRM | INAT_VEXOK,
+	[0x5e] = INAT_MODRM | INAT_VEXOK,
+	[0x5f] = INAT_MODRM | INAT_VEXOK,
+	[0x6f] = INAT_MODRM | INAT_VEXOK,
+	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x7e] = INAT_MODRM | INAT_VEXOK,
+	[0x7f] = INAT_MODRM | INAT_VEXOK,
+	[0xb8] = INAT_MODRM,
+	[0xbc] = INAT_MODRM,
+	[0xbd] = INAT_MODRM,
+	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xd6] = INAT_MODRM,
+	[0xe6] = INAT_MODRM | INAT_VEXOK,
+};
+const insn_attr_t inat_escape_table_1_3[INAT_OPCODE_TABLE_SIZE] = {
+	[0x10] = INAT_MODRM | INAT_VEXOK,
+	[0x11] = INAT_MODRM | INAT_VEXOK,
+	[0x12] = INAT_MODRM | INAT_VEXOK,
+	[0x2a] = INAT_MODRM | INAT_VEXOK,
+	[0x2c] = INAT_MODRM | INAT_VEXOK,
+	[0x2d] = INAT_MODRM | INAT_VEXOK,
+	[0x51] = INAT_MODRM | INAT_VEXOK,
+	[0x58] = INAT_MODRM | INAT_VEXOK,
+	[0x59] = INAT_MODRM | INAT_VEXOK,
+	[0x5a] = INAT_MODRM | INAT_VEXOK,
+	[0x5c] = INAT_MODRM | INAT_VEXOK,
+	[0x5d] = INAT_MODRM | INAT_VEXOK,
+	[0x5e] = INAT_MODRM | INAT_VEXOK,
+	[0x5f] = INAT_MODRM | INAT_VEXOK,
+	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x7c] = INAT_MODRM | INAT_VEXOK,
+	[0x7d] = INAT_MODRM | INAT_VEXOK,
+	[0xbc] = INAT_MODRM,
+	[0xbd] = INAT_MODRM,
+	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xd0] = INAT_MODRM | INAT_VEXOK,
+	[0xd6] = INAT_MODRM,
+	[0xe6] = INAT_MODRM | INAT_VEXOK,
+	[0xf0] = INAT_MODRM | INAT_VEXOK,
+};
+
+/* Table: 3-byte opcode 1 (0x0f 0x38) */
+const insn_attr_t inat_escape_table_2[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_MODRM | INAT_VARIANT,
+	[0x01] = INAT_MODRM | INAT_VARIANT,
+	[0x02] = INAT_MODRM | INAT_VARIANT,
+	[0x03] = INAT_MODRM | INAT_VARIANT,
+	[0x04] = INAT_MODRM | INAT_VARIANT,
+	[0x05] = INAT_MODRM | INAT_VARIANT,
+	[0x06] = INAT_MODRM | INAT_VARIANT,
+	[0x07] = INAT_MODRM | INAT_VARIANT,
+	[0x08] = INAT_MODRM | INAT_VARIANT,
+	[0x09] = INAT_MODRM | INAT_VARIANT,
+	[0x0a] = INAT_MODRM | INAT_VARIANT,
+	[0x0b] = INAT_MODRM | INAT_VARIANT,
+	[0x0c] = INAT_VARIANT,
+	[0x0d] = INAT_VARIANT,
+	[0x0e] = INAT_VARIANT,
+	[0x0f] = INAT_VARIANT,
+	[0x10] = INAT_VARIANT,
+	[0x13] = INAT_VARIANT,
+	[0x14] = INAT_VARIANT,
+	[0x15] = INAT_VARIANT,
+	[0x16] = INAT_VARIANT,
+	[0x17] = INAT_VARIANT,
+	[0x18] = INAT_VARIANT,
+	[0x19] = INAT_VARIANT,
+	[0x1a] = INAT_VARIANT,
+	[0x1c] = INAT_MODRM | INAT_VARIANT,
+	[0x1d] = INAT_MODRM | INAT_VARIANT,
+	[0x1e] = INAT_MODRM | INAT_VARIANT,
+	[0x20] = INAT_VARIANT,
+	[0x21] = INAT_VARIANT,
+	[0x22] = INAT_VARIANT,
+	[0x23] = INAT_VARIANT,
+	[0x24] = INAT_VARIANT,
+	[0x25] = INAT_VARIANT,
+	[0x28] = INAT_VARIANT,
+	[0x29] = INAT_VARIANT,
+	[0x2a] = INAT_VARIANT,
+	[0x2b] = INAT_VARIANT,
+	[0x2c] = INAT_VARIANT,
+	[0x2d] = INAT_VARIANT,
+	[0x2e] = INAT_VARIANT,
+	[0x2f] = INAT_VARIANT,
+	[0x30] = INAT_VARIANT,
+	[0x31] = INAT_VARIANT,
+	[0x32] = INAT_VARIANT,
+	[0x33] = INAT_VARIANT,
+	[0x34] = INAT_VARIANT,
+	[0x35] = INAT_VARIANT,
+	[0x36] = INAT_VARIANT,
+	[0x37] = INAT_VARIANT,
+	[0x38] = INAT_VARIANT,
+	[0x39] = INAT_VARIANT,
+	[0x3a] = INAT_VARIANT,
+	[0x3b] = INAT_VARIANT,
+	[0x3c] = INAT_VARIANT,
+	[0x3d] = INAT_VARIANT,
+	[0x3e] = INAT_VARIANT,
+	[0x3f] = INAT_VARIANT,
+	[0x40] = INAT_VARIANT,
+	[0x41] = INAT_VARIANT,
+	[0x45] = INAT_VARIANT,
+	[0x46] = INAT_VARIANT,
+	[0x47] = INAT_VARIANT,
+	[0x58] = INAT_VARIANT,
+	[0x59] = INAT_VARIANT,
+	[0x5a] = INAT_VARIANT,
+	[0x78] = INAT_VARIANT,
+	[0x79] = INAT_VARIANT,
+	[0x80] = INAT_VARIANT,
+	[0x81] = INAT_VARIANT,
+	[0x82] = INAT_VARIANT,
+	[0x8c] = INAT_VARIANT,
+	[0x8e] = INAT_VARIANT,
+	[0x90] = INAT_VARIANT,
+	[0x91] = INAT_VARIANT,
+	[0x92] = INAT_VARIANT,
+	[0x93] = INAT_VARIANT,
+	[0x96] = INAT_VARIANT,
+	[0x97] = INAT_VARIANT,
+	[0x98] = INAT_VARIANT,
+	[0x99] = INAT_VARIANT,
+	[0x9a] = INAT_VARIANT,
+	[0x9b] = INAT_VARIANT,
+	[0x9c] = INAT_VARIANT,
+	[0x9d] = INAT_VARIANT,
+	[0x9e] = INAT_VARIANT,
+	[0x9f] = INAT_VARIANT,
+	[0xa6] = INAT_VARIANT,
+	[0xa7] = INAT_VARIANT,
+	[0xa8] = INAT_VARIANT,
+	[0xa9] = INAT_VARIANT,
+	[0xaa] = INAT_VARIANT,
+	[0xab] = INAT_VARIANT,
+	[0xac] = INAT_VARIANT,
+	[0xad] = INAT_VARIANT,
+	[0xae] = INAT_VARIANT,
+	[0xaf] = INAT_VARIANT,
+	[0xb6] = INAT_VARIANT,
+	[0xb7] = INAT_VARIANT,
+	[0xb8] = INAT_VARIANT,
+	[0xb9] = INAT_VARIANT,
+	[0xba] = INAT_VARIANT,
+	[0xbb] = INAT_VARIANT,
+	[0xbc] = INAT_VARIANT,
+	[0xbd] = INAT_VARIANT,
+	[0xbe] = INAT_VARIANT,
+	[0xbf] = INAT_VARIANT,
+	[0xdb] = INAT_VARIANT,
+	[0xdc] = INAT_VARIANT,
+	[0xdd] = INAT_VARIANT,
+	[0xde] = INAT_VARIANT,
+	[0xdf] = INAT_VARIANT,
+	[0xf0] = INAT_MODRM | INAT_VARIANT,
+	[0xf1] = INAT_MODRM | INAT_VARIANT,
+	[0xf2] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xf3] = INAT_MAKE_GROUP(22),
+	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY | INAT_VARIANT,
+	[0xf6] = INAT_VARIANT,
+	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY | INAT_VARIANT,
+};
+const insn_attr_t inat_escape_table_2_1[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_MODRM | INAT_VEXOK,
+	[0x01] = INAT_MODRM | INAT_VEXOK,
+	[0x02] = INAT_MODRM | INAT_VEXOK,
+	[0x03] = INAT_MODRM | INAT_VEXOK,
+	[0x04] = INAT_MODRM | INAT_VEXOK,
+	[0x05] = INAT_MODRM | INAT_VEXOK,
+	[0x06] = INAT_MODRM | INAT_VEXOK,
+	[0x07] = INAT_MODRM | INAT_VEXOK,
+	[0x08] = INAT_MODRM | INAT_VEXOK,
+	[0x09] = INAT_MODRM | INAT_VEXOK,
+	[0x0a] = INAT_MODRM | INAT_VEXOK,
+	[0x0b] = INAT_MODRM | INAT_VEXOK,
+	[0x0c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x0d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x0e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x0f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x10] = INAT_MODRM,
+	[0x13] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x14] = INAT_MODRM,
+	[0x15] = INAT_MODRM,
+	[0x16] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x17] = INAT_MODRM | INAT_VEXOK,
+	[0x18] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x19] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x1a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x1c] = INAT_MODRM | INAT_VEXOK,
+	[0x1d] = INAT_MODRM | INAT_VEXOK,
+	[0x1e] = INAT_MODRM | INAT_VEXOK,
+	[0x20] = INAT_MODRM | INAT_VEXOK,
+	[0x21] = INAT_MODRM | INAT_VEXOK,
+	[0x22] = INAT_MODRM | INAT_VEXOK,
+	[0x23] = INAT_MODRM | INAT_VEXOK,
+	[0x24] = INAT_MODRM | INAT_VEXOK,
+	[0x25] = INAT_MODRM | INAT_VEXOK,
+	[0x28] = INAT_MODRM | INAT_VEXOK,
+	[0x29] = INAT_MODRM | INAT_VEXOK,
+	[0x2a] = INAT_MODRM | INAT_VEXOK,
+	[0x2b] = INAT_MODRM | INAT_VEXOK,
+	[0x2c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x2d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x2e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x2f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x30] = INAT_MODRM | INAT_VEXOK,
+	[0x31] = INAT_MODRM | INAT_VEXOK,
+	[0x32] = INAT_MODRM | INAT_VEXOK,
+	[0x33] = INAT_MODRM | INAT_VEXOK,
+	[0x34] = INAT_MODRM | INAT_VEXOK,
+	[0x35] = INAT_MODRM | INAT_VEXOK,
+	[0x36] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x37] = INAT_MODRM | INAT_VEXOK,
+	[0x38] = INAT_MODRM | INAT_VEXOK,
+	[0x39] = INAT_MODRM | INAT_VEXOK,
+	[0x3a] = INAT_MODRM | INAT_VEXOK,
+	[0x3b] = INAT_MODRM | INAT_VEXOK,
+	[0x3c] = INAT_MODRM | INAT_VEXOK,
+	[0x3d] = INAT_MODRM | INAT_VEXOK,
+	[0x3e] = INAT_MODRM | INAT_VEXOK,
+	[0x3f] = INAT_MODRM | INAT_VEXOK,
+	[0x40] = INAT_MODRM | INAT_VEXOK,
+	[0x41] = INAT_MODRM | INAT_VEXOK,
+	[0x45] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x46] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x47] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x58] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x59] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x5a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x78] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x79] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x80] = INAT_MODRM,
+	[0x81] = INAT_MODRM,
+	[0x82] = INAT_MODRM,
+	[0x8c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x8e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x90] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x91] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x92] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x93] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x96] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x97] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x98] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x99] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9b] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x9f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xa6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xa7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xa8] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xa9] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xaa] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xab] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xac] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xad] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xae] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xaf] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xb6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xb7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xb8] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xb9] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xba] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xbb] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xbc] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xbd] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xbe] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xbf] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xdb] = INAT_MODRM | INAT_VEXOK,
+	[0xdc] = INAT_MODRM | INAT_VEXOK,
+	[0xdd] = INAT_MODRM | INAT_VEXOK,
+	[0xde] = INAT_MODRM | INAT_VEXOK,
+	[0xdf] = INAT_MODRM | INAT_VEXOK,
+	[0xf0] = INAT_MODRM,
+	[0xf1] = INAT_MODRM,
+	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+};
+const insn_attr_t inat_escape_table_2_2[INAT_OPCODE_TABLE_SIZE] = {
+	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+};
+const insn_attr_t inat_escape_table_2_3[INAT_OPCODE_TABLE_SIZE] = {
+	[0xf0] = INAT_MODRM,
+	[0xf1] = INAT_MODRM,
+	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xf6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+};
+
+/* Table: 3-byte opcode 2 (0x0f 0x3a) */
+const insn_attr_t inat_escape_table_3[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_VARIANT,
+	[0x01] = INAT_VARIANT,
+	[0x02] = INAT_VARIANT,
+	[0x04] = INAT_VARIANT,
+	[0x05] = INAT_VARIANT,
+	[0x06] = INAT_VARIANT,
+	[0x08] = INAT_VARIANT,
+	[0x09] = INAT_VARIANT,
+	[0x0a] = INAT_VARIANT,
+	[0x0b] = INAT_VARIANT,
+	[0x0c] = INAT_VARIANT,
+	[0x0d] = INAT_VARIANT,
+	[0x0e] = INAT_VARIANT,
+	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x14] = INAT_VARIANT,
+	[0x15] = INAT_VARIANT,
+	[0x16] = INAT_VARIANT,
+	[0x17] = INAT_VARIANT,
+	[0x18] = INAT_VARIANT,
+	[0x19] = INAT_VARIANT,
+	[0x1d] = INAT_VARIANT,
+	[0x20] = INAT_VARIANT,
+	[0x21] = INAT_VARIANT,
+	[0x22] = INAT_VARIANT,
+	[0x38] = INAT_VARIANT,
+	[0x39] = INAT_VARIANT,
+	[0x40] = INAT_VARIANT,
+	[0x41] = INAT_VARIANT,
+	[0x42] = INAT_VARIANT,
+	[0x44] = INAT_VARIANT,
+	[0x46] = INAT_VARIANT,
+	[0x4a] = INAT_VARIANT,
+	[0x4b] = INAT_VARIANT,
+	[0x4c] = INAT_VARIANT,
+	[0x60] = INAT_VARIANT,
+	[0x61] = INAT_VARIANT,
+	[0x62] = INAT_VARIANT,
+	[0x63] = INAT_VARIANT,
+	[0xdf] = INAT_VARIANT,
+	[0xf0] = INAT_VARIANT,
+};
+const insn_attr_t inat_escape_table_3_1[INAT_OPCODE_TABLE_SIZE] = {
+	[0x00] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x01] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x02] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x04] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x05] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x06] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x08] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x09] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0c] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0d] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0e] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x14] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x15] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x16] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x17] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x18] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x19] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x1d] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x20] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x21] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x22] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x38] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x39] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x40] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x41] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x42] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x44] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x46] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x4a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x4b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x4c] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x60] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x61] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x62] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x63] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0xdf] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+};
+const insn_attr_t inat_escape_table_3_3[INAT_OPCODE_TABLE_SIZE] = {
+	[0xf0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+};
+
+/* GrpTable: Grp1 */
+
+/* GrpTable: Grp1A */
+
+/* GrpTable: Grp2 */
+
+/* GrpTable: Grp3_1 */
+const insn_attr_t inat_group_table_5[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+	[0x4] = INAT_MODRM,
+	[0x5] = INAT_MODRM,
+	[0x6] = INAT_MODRM,
+	[0x7] = INAT_MODRM,
+};
+
+/* GrpTable: Grp3_2 */
+const insn_attr_t inat_group_table_6[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+	[0x4] = INAT_MODRM,
+	[0x5] = INAT_MODRM,
+	[0x6] = INAT_MODRM,
+	[0x7] = INAT_MODRM,
+};
+
+/* GrpTable: Grp4 */
+const insn_attr_t inat_group_table_7[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+};
+
+/* GrpTable: Grp5 */
+const insn_attr_t inat_group_table_8[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+	[0x2] = INAT_MODRM | INAT_FORCE64,
+	[0x3] = INAT_MODRM,
+	[0x4] = INAT_MODRM | INAT_FORCE64,
+	[0x5] = INAT_MODRM,
+	[0x6] = INAT_MODRM | INAT_FORCE64,
+};
+
+/* GrpTable: Grp6 */
+const insn_attr_t inat_group_table_9[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+	[0x4] = INAT_MODRM,
+	[0x5] = INAT_MODRM,
+};
+
+/* GrpTable: Grp7 */
+const insn_attr_t inat_group_table_10[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+	[0x4] = INAT_MODRM,
+	[0x6] = INAT_MODRM,
+	[0x7] = INAT_MODRM,
+};
+
+/* GrpTable: Grp8 */
+
+/* GrpTable: Grp9 */
+const insn_attr_t inat_group_table_21[INAT_GROUP_TABLE_SIZE] = {
+	[0x1] = INAT_MODRM,
+	[0x6] = INAT_MODRM | INAT_MODRM | INAT_VARIANT,
+	[0x7] = INAT_MODRM | INAT_VARIANT,
+};
+const insn_attr_t inat_group_table_21_1[INAT_GROUP_TABLE_SIZE] = {
+	[0x6] = INAT_MODRM,
+};
+const insn_attr_t inat_group_table_21_2[INAT_GROUP_TABLE_SIZE] = {
+	[0x6] = INAT_MODRM,
+	[0x7] = INAT_MODRM,
+};
+
+/* GrpTable: Grp10 */
+
+/* GrpTable: Grp11 */
+
+/* GrpTable: Grp12 */
+const insn_attr_t inat_group_table_13[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+};
+const insn_attr_t inat_group_table_13_1[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+};
+
+/* GrpTable: Grp13 */
+const insn_attr_t inat_group_table_14[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+};
+const insn_attr_t inat_group_table_14_1[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+};
+
+/* GrpTable: Grp14 */
+const insn_attr_t inat_group_table_15[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x3] = INAT_VARIANT,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
+	[0x7] = INAT_VARIANT,
+};
+const insn_attr_t inat_group_table_15_1[INAT_GROUP_TABLE_SIZE] = {
+	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x3] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+	[0x7] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
+};
+
+/* GrpTable: Grp15 */
+const insn_attr_t inat_group_table_18[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_VARIANT,
+	[0x1] = INAT_VARIANT,
+	[0x2] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+	[0x3] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
+};
+const insn_attr_t inat_group_table_18_2[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+};
+
+/* GrpTable: Grp16 */
+const insn_attr_t inat_group_table_12[INAT_GROUP_TABLE_SIZE] = {
+	[0x0] = INAT_MODRM,
+	[0x1] = INAT_MODRM,
+	[0x2] = INAT_MODRM,
+	[0x3] = INAT_MODRM,
+};
+
+/* GrpTable: Grp17 */
+const insn_attr_t inat_group_table_22[INAT_GROUP_TABLE_SIZE] = {
+	[0x1] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x2] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+	[0x3] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
+};
+
+/* GrpTable: GrpP */
+
+/* GrpTable: GrpPDLK */
+
+/* GrpTable: GrpRNG */
+
+/* Escape opcode map array */
+const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1][INAT_LSTPFX_MAX + 1] = {
+	[1][0] = inat_escape_table_1,
+	[1][1] = inat_escape_table_1_1,
+	[1][2] = inat_escape_table_1_2,
+	[1][3] = inat_escape_table_1_3,
+	[2][0] = inat_escape_table_2,
+	[2][1] = inat_escape_table_2_1,
+	[2][2] = inat_escape_table_2_2,
+	[2][3] = inat_escape_table_2_3,
+	[3][0] = inat_escape_table_3,
+	[3][1] = inat_escape_table_3_1,
+	[3][3] = inat_escape_table_3_3,
+};
+
+/* Group opcode map array */
+const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1][INAT_LSTPFX_MAX + 1] = {
+	[5][0] = inat_group_table_5,
+	[6][0] = inat_group_table_6,
+	[7][0] = inat_group_table_7,
+	[8][0] = inat_group_table_8,
+	[9][0] = inat_group_table_9,
+	[10][0] = inat_group_table_10,
+	[12][0] = inat_group_table_12,
+	[13][0] = inat_group_table_13,
+	[13][1] = inat_group_table_13_1,
+	[14][0] = inat_group_table_14,
+	[14][1] = inat_group_table_14_1,
+	[15][0] = inat_group_table_15,
+	[15][1] = inat_group_table_15_1,
+	[18][0] = inat_group_table_18,
+	[18][2] = inat_group_table_18_2,
+	[21][0] = inat_group_table_21,
+	[21][1] = inat_group_table_21_1,
+	[21][2] = inat_group_table_21_2,
+	[22][0] = inat_group_table_22,
+};
+
+/* AVX opcode map array */
+const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1][INAT_LSTPFX_MAX + 1] = {
+	[1][0] = inat_escape_table_1,
+	[1][1] = inat_escape_table_1_1,
+	[1][2] = inat_escape_table_1_2,
+	[1][3] = inat_escape_table_1_3,
+	[2][0] = inat_escape_table_2,
+	[2][1] = inat_escape_table_2_1,
+	[2][2] = inat_escape_table_2_2,
+	[2][3] = inat_escape_table_2_3,
+	[3][0] = inat_escape_table_3,
+	[3][1] = inat_escape_table_3_1,
+	[3][3] = inat_escape_table_3_3,
+};
diff --git a/xen/arch/x86/inat.c b/xen/arch/x86/inat.c
new file mode 100644
index 0000000..feeaa50
--- /dev/null
+++ b/xen/arch/x86/inat.c
@@ -0,0 +1,96 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+	return inat_primary_table[opcode];
+}
+
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+	insn_attr_t lpfx_attr;
+
+	lpfx_attr = inat_get_opcode_attribute(last_pfx);
+	return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
+				      insn_attr_t esc_attr)
+{
+	const insn_attr_t *table;
+	int n;
+
+	n = inat_escape_id(esc_attr);
+
+	table = inat_escape_tables[n][0];
+	if (!table)
+		return 0;
+	if (inat_has_variant(table[opcode]) && lpfx_id) {
+		table = inat_escape_tables[n][lpfx_id];
+		if (!table)
+			return 0;
+	}
+	return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
+				     insn_attr_t grp_attr)
+{
+	const insn_attr_t *table;
+	int n;
+
+	n = inat_group_id(grp_attr);
+
+	table = inat_group_tables[n][0];
+	if (!table)
+		return inat_group_common_attribute(grp_attr);
+	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+		table = inat_group_tables[n][lpfx_id];
+		if (!table)
+			return inat_group_common_attribute(grp_attr);
+	}
+	return table[X86_MODRM_REG(modrm)] |
+	       inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+				   insn_byte_t vex_p)
+{
+	const insn_attr_t *table;
+	if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+		return 0;
+	/* At first, this checks the master table */
+	table = inat_avx_tables[vex_m][0];
+	if (!table)
+		return 0;
+	if (!inat_is_group(table[opcode]) && vex_p) {
+		/* If this is not a group, get attribute directly */
+		table = inat_avx_tables[vex_m][vex_p];
+		if (!table)
+			return 0;
+	}
+	return table[opcode];
+}
diff --git a/xen/arch/x86/insn.c b/xen/arch/x86/insn.c
new file mode 100644
index 0000000..5aea2c7
--- /dev/null
+++ b/xen/arch/x86/insn.c
@@ -0,0 +1,576 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <xen/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+/* Verify next sizeof(t) bytes can be on the same instruction */
+#define validate_next(t, insn, n)	\
+	((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
+
+#define __get_next(t, insn)	\
+	({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define __peek_nbyte_next(t, insn, n)	\
+	({ t r = *(t*)((insn)->next_byte + n); r; })
+
+#define get_next(t, insn)	\
+	({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
+
+#define peek_nbyte_next(t, insn, n)	\
+	({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
+
+#define peek_next(t, insn)	peek_nbyte_next(t, insn, 0)
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn:	&struct insn to be initialized
+ * @kaddr:	address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64:	!0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+	memset(insn, 0, sizeof(*insn));
+	insn->kaddr = kaddr;
+	insn->next_byte = kaddr;
+	insn->x86_64 = x86_64 ? 1 : 0;
+	insn->opnd_bytes = 4;
+	if (x86_64)
+		insn->addr_bytes = 8;
+	else
+		insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn:	&struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode.  No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+	struct insn_field *prefixes = &insn->prefixes;
+	insn_attr_t attr;
+	insn_byte_t b, lb;
+	int i, nb;
+
+	if (prefixes->got)
+		return;
+
+	nb = 0;
+	lb = 0;
+	b = peek_next(insn_byte_t, insn);
+	attr = inat_get_opcode_attribute(b);
+	while (inat_is_legacy_prefix(attr)) {
+		/* Skip if same prefix */
+		for (i = 0; i < nb; i++)
+			if (prefixes->bytes[i] == b)
+				goto found;
+		if (nb == 4)
+			/* Invalid instruction */
+			break;
+		prefixes->bytes[nb++] = b;
+		if (inat_is_address_size_prefix(attr)) {
+			/* address size switches 2/4 or 4/8 */
+			if (insn->x86_64)
+				insn->addr_bytes ^= 12;
+			else
+				insn->addr_bytes ^= 6;
+		} else if (inat_is_operand_size_prefix(attr)) {
+			/* operand size switches 2/4 */
+			insn->opnd_bytes ^= 6;
+		}
+found:
+		prefixes->nbytes++;
+		insn->next_byte++;
+		lb = b;
+		b = peek_next(insn_byte_t, insn);
+		attr = inat_get_opcode_attribute(b);
+	}
+	/* Set the last prefix */
+	if (lb && lb != insn->prefixes.bytes[3]) {
+		if (unlikely(insn->prefixes.bytes[3])) {
+			/* Swap the last prefix */
+			b = insn->prefixes.bytes[3];
+			for (i = 0; i < nb; i++)
+				if (prefixes->bytes[i] == lb)
+					prefixes->bytes[i] = b;
+		}
+		insn->prefixes.bytes[3] = lb;
+	}
+
+	/* Decode REX prefix */
+	if (insn->x86_64) {
+		b = peek_next(insn_byte_t, insn);
+		attr = inat_get_opcode_attribute(b);
+		if (inat_is_rex_prefix(attr)) {
+			insn->rex_prefix.value = b;
+			insn->rex_prefix.nbytes = 1;
+			insn->next_byte++;
+			if (X86_REX_W(b))
+				/* REX.W overrides opnd_size */
+				insn->opnd_bytes = 8;
+		}
+	}
+	insn->rex_prefix.got = 1;
+
+	/* Decode VEX prefix */
+	b = peek_next(insn_byte_t, insn);
+	attr = inat_get_opcode_attribute(b);
+	if (inat_is_vex_prefix(attr)) {
+		insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+		if (!insn->x86_64) {
+			/*
+			 * In 32-bit mode, if the [7:6] bits (mod bits of
+			 * ModRM) on the second byte are not 11b, it is
+			 * LDS or LES.
+			 */
+			if (X86_MODRM_MOD(b2) != 3)
+				goto vex_end;
+		}
+		insn->vex_prefix.bytes[0] = b;
+		insn->vex_prefix.bytes[1] = b2;
+		if (inat_is_vex3_prefix(attr)) {
+			b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+			insn->vex_prefix.bytes[2] = b2;
+			insn->vex_prefix.nbytes = 3;
+			insn->next_byte += 3;
+			if (insn->x86_64 && X86_VEX_W(b2))
+				/* VEX.W overrides opnd_size */
+				insn->opnd_bytes = 8;
+		} else {
+			insn->vex_prefix.nbytes = 2;
+			insn->next_byte += 2;
+		}
+	}
+vex_end:
+	insn->vex_prefix.got = 1;
+
+	prefixes->got = 1;
+
+err_out:
+	return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn:	&struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and sets @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1.  No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+	struct insn_field *opcode = &insn->opcode;
+	insn_byte_t op;
+	int pfx_id;
+	if (opcode->got)
+		return;
+	if (!insn->prefixes.got)
+		insn_get_prefixes(insn);
+
+	/* Get first opcode */
+	op = get_next(insn_byte_t, insn);
+	opcode->bytes[0] = op;
+	opcode->nbytes = 1;
+
+	/* Check if there is VEX prefix or not */
+	if (insn_is_avx(insn)) {
+		insn_byte_t m, p;
+		m = insn_vex_m_bits(insn);
+		p = insn_vex_p_bits(insn);
+		insn->attr = inat_get_avx_attribute(op, m, p);
+		if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
+			insn->attr = 0;	/* This instruction is bad */
+		goto end;	/* VEX has only 1 byte for opcode */
+	}
+
+	insn->attr = inat_get_opcode_attribute(op);
+	while (inat_is_escape(insn->attr)) {
+		/* Get escaped opcode */
+		op = get_next(insn_byte_t, insn);
+		opcode->bytes[opcode->nbytes++] = op;
+		pfx_id = insn_last_prefix_id(insn);
+		insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
+	}
+	if (inat_must_vex(insn->attr))
+		insn->attr = 0;	/* This instruction is bad */
+end:
+	opcode->got = 1;
+
+err_out:
+	return;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn:	&struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any.  If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)).  No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+	struct insn_field *modrm = &insn->modrm;
+	insn_byte_t pfx_id, mod;
+	if (modrm->got)
+		return;
+	if (!insn->opcode.got)
+		insn_get_opcode(insn);
+
+	if (inat_has_modrm(insn->attr)) {
+		mod = get_next(insn_byte_t, insn);
+		modrm->value = mod;
+		modrm->nbytes = 1;
+		if (inat_is_group(insn->attr)) {
+			pfx_id = insn_last_prefix_id(insn);
+			insn->attr = inat_get_group_attribute(mod, pfx_id,
+							      insn->attr);
+			if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+				insn->attr = 0;	/* This is bad */
+		}
+	}
+
+	if (insn->x86_64 && inat_is_force64(insn->attr))
+		insn->opnd_bytes = 8;
+	modrm->got = 1;
+
+err_out:
+	return;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn:	&struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.  No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+	struct insn_field *modrm = &insn->modrm;
+
+	if (!insn->x86_64)
+		return 0;
+	if (!modrm->got)
+		insn_get_modrm(insn);
+	/*
+	 * For rip-relative instructions, the mod field (top 2 bits)
+	 * is zero and the r/m field (bottom 3 bits) is 0x5.
+	 */
+	return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn:	&struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+	insn_byte_t modrm;
+
+	if (insn->sib.got)
+		return;
+	if (!insn->modrm.got)
+		insn_get_modrm(insn);
+	if (insn->modrm.nbytes) {
+		modrm = (insn_byte_t)insn->modrm.value;
+		if (insn->addr_bytes != 2 &&
+		    X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+			insn->sib.value = get_next(insn_byte_t, insn);
+			insn->sib.nbytes = 1;
+		}
+	}
+	insn->sib.got = 1;
+
+err_out:
+	return;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn:	&struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * The displacement value is sign-extended.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+	insn_byte_t mod, rm, base;
+
+	if (insn->displacement.got)
+		return;
+	if (!insn->sib.got)
+		insn_get_sib(insn);
+	if (insn->modrm.nbytes) {
+		/*
+		 * Interpreting the modrm byte:
+		 * mod = 00 - no displacement fields (exceptions below)
+		 * mod = 01 - 1-byte displacement field
+		 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+		 * 	address size = 2 (0x67 prefix in 32-bit mode)
+		 * mod = 11 - no memory operand
+		 *
+		 * If address size = 2...
+		 * mod = 00, r/m = 110 - displacement field is 2 bytes
+		 *
+		 * If address size != 2...
+		 * mod != 11, r/m = 100 - SIB byte exists
+		 * mod = 00, SIB base = 101 - displacement field is 4 bytes
+		 * mod = 00, r/m = 101 - rip-relative addressing, displacement
+		 * 	field is 4 bytes
+		 */
+		mod = X86_MODRM_MOD(insn->modrm.value);
+		rm = X86_MODRM_RM(insn->modrm.value);
+		base = X86_SIB_BASE(insn->sib.value);
+		if (mod == 3)
+			goto out;
+		if (mod == 1) {
+			insn->displacement.value = get_next(char, insn);
+			insn->displacement.nbytes = 1;
+		} else if (insn->addr_bytes == 2) {
+			if ((mod == 0 && rm == 6) || mod == 2) {
+				insn->displacement.value =
+					 get_next(short, insn);
+				insn->displacement.nbytes = 2;
+			}
+		} else {
+			if ((mod == 0 && rm == 5) || mod == 2 ||
+			    (mod == 0 && base == 5)) {
+				insn->displacement.value = get_next(int, insn);
+				insn->displacement.nbytes = 4;
+			}
+		}
+	}
+out:
+	insn->displacement.got = 1;
+
+err_out:
+	return;
+}
+
+/* Decode moffset16/32/64. Return 0 if failed */
+static int __get_moffset(struct insn *insn)
+{
+	switch (insn->addr_bytes) {
+	case 2:
+		insn->moffset1.value = get_next(short, insn);
+		insn->moffset1.nbytes = 2;
+		break;
+	case 4:
+		insn->moffset1.value = get_next(int, insn);
+		insn->moffset1.nbytes = 4;
+		break;
+	case 8:
+		insn->moffset1.value = get_next(int, insn);
+		insn->moffset1.nbytes = 4;
+		insn->moffset2.value = get_next(int, insn);
+		insn->moffset2.nbytes = 4;
+		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
+	}
+	insn->moffset1.got = insn->moffset2.got = 1;
+
+	return 1;
+
+err_out:
+	return 0;
+}
+
+/* Decode imm v32(Iz). Return 0 if failed */
+static int __get_immv32(struct insn *insn)
+{
+	switch (insn->opnd_bytes) {
+	case 2:
+		insn->immediate.value = get_next(short, insn);
+		insn->immediate.nbytes = 2;
+		break;
+	case 4:
+	case 8:
+		insn->immediate.value = get_next(int, insn);
+		insn->immediate.nbytes = 4;
+		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
+	}
+
+	return 1;
+
+err_out:
+	return 0;
+}
+
+/* Decode imm v64(Iv/Ov). Return 0 if failed */
+static int __get_immv(struct insn *insn)
+{
+	switch (insn->opnd_bytes) {
+	case 2:
+		insn->immediate1.value = get_next(short, insn);
+		insn->immediate1.nbytes = 2;
+		break;
+	case 4:
+		insn->immediate1.value = get_next(int, insn);
+		insn->immediate1.nbytes = 4;
+		break;
+	case 8:
+		insn->immediate1.value = get_next(int, insn);
+		insn->immediate1.nbytes = 4;
+		insn->immediate2.value = get_next(int, insn);
+		insn->immediate2.nbytes = 4;
+		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
+	}
+	insn->immediate1.got = insn->immediate2.got = 1;
+
+	return 1;
+err_out:
+	return 0;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static int __get_immptr(struct insn *insn)
+{
+	switch (insn->opnd_bytes) {
+	case 2:
+		insn->immediate1.value = get_next(short, insn);
+		insn->immediate1.nbytes = 2;
+		break;
+	case 4:
+		insn->immediate1.value = get_next(int, insn);
+		insn->immediate1.nbytes = 4;
+		break;
+	case 8:
+		/* ptr16:64 does not exist (no segment) */
+		return 0;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
+	}
+	insn->immediate2.value = get_next(unsigned short, insn);
+	insn->immediate2.nbytes = 2;
+	insn->immediate1.got = insn->immediate2.got = 1;
+
+	return 1;
+err_out:
+	return 0;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn:	&struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Most immediates are sign-extended. The unsigned value can be obtained
+ * by masking with ((1 << (nbytes * 8)) - 1).
+ */
+void insn_get_immediate(struct insn *insn)
+{
+	if (insn->immediate.got)
+		return;
+	if (!insn->displacement.got)
+		insn_get_displacement(insn);
+
+	if (inat_has_moffset(insn->attr)) {
+		if (!__get_moffset(insn))
+			goto err_out;
+		goto done;
+	}
+
+	if (!inat_has_immediate(insn->attr))
+		/* no immediates */
+		goto done;
+
+	switch (inat_immediate_size(insn->attr)) {
+	case INAT_IMM_BYTE:
+		insn->immediate.value = get_next(char, insn);
+		insn->immediate.nbytes = 1;
+		break;
+	case INAT_IMM_WORD:
+		insn->immediate.value = get_next(short, insn);
+		insn->immediate.nbytes = 2;
+		break;
+	case INAT_IMM_DWORD:
+		insn->immediate.value = get_next(int, insn);
+		insn->immediate.nbytes = 4;
+		break;
+	case INAT_IMM_QWORD:
+		insn->immediate1.value = get_next(int, insn);
+		insn->immediate1.nbytes = 4;
+		insn->immediate2.value = get_next(int, insn);
+		insn->immediate2.nbytes = 4;
+		break;
+	case INAT_IMM_PTR:
+		if (!__get_immptr(insn))
+			goto err_out;
+		break;
+	case INAT_IMM_VWORD32:
+		if (!__get_immv32(insn))
+			goto err_out;
+		break;
+	case INAT_IMM_VWORD:
+		if (!__get_immv(insn))
+			goto err_out;
+		break;
+	default:
+		/* Here, insn must have an immediate, but failed */
+		goto err_out;
+	}
+	if (inat_has_second_immediate(insn->attr)) {
+		insn->immediate2.value = get_next(char, insn);
+		insn->immediate2.nbytes = 1;
+	}
+done:
+	insn->immediate.got = 1;
+
+err_out:
+	return;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn:	&struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediates bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+	if (insn->length)
+		return;
+	if (!insn->immediate.got)
+		insn_get_immediate(insn);
+	insn->length = (unsigned char)((unsigned long)insn->next_byte
+				     - (unsigned long)insn->kaddr);
+}
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 00a06cc..db89184 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -37,6 +37,9 @@ struct hvm_emulate_ctxt {
 
 int hvm_emulate_one(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+int hvm_emulate_one_no_write(
+    struct hvm_emulate_ctxt *hvmemul_ctxt);
+void hvm_emulate_one_full(bool_t nowrite);
 void hvm_emulate_prepare(
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
@@ -45,6 +48,8 @@ void hvm_emulate_writeback(
 struct segment_register *hvmemul_get_seg_reg(
     enum x86_segment seg,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+int hvm_get_insn_length(
+    struct hvm_emulate_ctxt *hvmemul_ctxt);
 
 int hvmemul_do_pio(
     unsigned long port, unsigned long *reps, int size,
diff --git a/xen/include/asm-x86/inat.h b/xen/include/asm-x86/inat.h
new file mode 100644
index 0000000..74a2e31
--- /dev/null
+++ b/xen/include/asm-x86/inat.h
@@ -0,0 +1,221 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ	1	/* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE	2	/* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE	3	/* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK	4	/* 0xF0 */
+#define INAT_PFX_CS	5	/* 0x2E */
+#define INAT_PFX_DS	6	/* 0x3E */
+#define INAT_PFX_ES	7	/* 0x26 */
+#define INAT_PFX_FS	8	/* 0x64 */
+#define INAT_PFX_GS	9	/* 0x65 */
+#define INAT_PFX_SS	10	/* 0x36 */
+#define INAT_PFX_ADDRSZ	11	/* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX	12	/* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2	13	/* 2-byte VEX prefix */
+#define INAT_PFX_VEX3	14	/* 3-byte VEX prefix */
+
+#define INAT_LSTPFX_MAX	3
+#define INAT_LGCPFX_MAX	11
+
+/* Immediate size */
+#define INAT_IMM_BYTE		1
+#define INAT_IMM_WORD		2
+#define INAT_IMM_DWORD		3
+#define INAT_IMM_QWORD		4
+#define INAT_IMM_PTR		5
+#define INAT_IMM_VWORD32	6
+#define INAT_IMM_VWORD		7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS	0
+#define INAT_PFX_BITS	4
+#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK	(INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS	(INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS	2
+#define INAT_ESC_MAX	((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK	(INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS	(INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS	5
+#define INAT_GRP_MAX	((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK	(INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS	(INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS	3
+#define INAT_IMM_MASK	(((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS	(INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM	(1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64	(1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM	(1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET	(1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT	(1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK	(1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY	(1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx)	(pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc)	(esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp)	((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm)	(imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+					     int lpfx_id,
+					     insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+					    int lpfx_id,
+					    insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+					  insn_byte_t vex_m,
+					  insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+	attr &= INAT_PFX_MASK;
+	return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+	return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+	return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+	return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+	if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+		return 0;
+	else
+		return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+	attr &= INAT_PFX_MASK;
+	return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+	return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+	return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+	return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+	return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+	return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+	return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+	return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+	return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+	return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+	return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+	return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+	return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+	return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+	return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+	return attr & INAT_VEXONLY;
+}
+#endif
diff --git a/xen/include/asm-x86/inat_types.h b/xen/include/asm-x86/inat_types.h
new file mode 100644
index 0000000..cb3c20c
--- /dev/null
+++ b/xen/include/asm-x86/inat_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/xen/include/asm-x86/insn.h b/xen/include/asm-x86/insn.h
new file mode 100644
index 0000000..48eb30a
--- /dev/null
+++ b/xen/include/asm-x86/insn.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+	union {
+		insn_value_t value;
+		insn_byte_t bytes[4];
+	};
+	/* !0 if we've run insn_get_xxx() for this field */
+	unsigned char got;
+	unsigned char nbytes;
+};
+
+struct insn {
+	struct insn_field prefixes;	/*
+					 * Prefixes
+					 * prefixes.bytes[3]: last prefix
+					 */
+	struct insn_field rex_prefix;	/* REX prefix */
+	struct insn_field vex_prefix;	/* VEX prefix */
+	struct insn_field opcode;	/*
+					 * opcode.bytes[0]: opcode1
+					 * opcode.bytes[1]: opcode2
+					 * opcode.bytes[2]: opcode3
+					 */
+	struct insn_field modrm;
+	struct insn_field sib;
+	struct insn_field displacement;
+	union {
+		struct insn_field immediate;
+		struct insn_field moffset1;	/* for 64bit MOV */
+		struct insn_field immediate1;	/* for 64bit imm or off16/32 */
+	};
+	union {
+		struct insn_field moffset2;	/* for 64bit MOV */
+		struct insn_field immediate2;	/* for 64bit imm or seg16 */
+	};
+
+	insn_attr_t attr;
+	unsigned char opnd_bytes;
+	unsigned char addr_bytes;
+	unsigned char length;
+	unsigned char x86_64;
+
+	const insn_byte_t *kaddr;	/* kernel address of insn to analyze */
+	const insn_byte_t *next_byte;
+};
+
+#define MAX_INSN_SIZE	16
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags  */
+#define X86_VEX_W(vex)	((vex) & 0x80)	/* VEX3 Byte2 */
+#define X86_VEX_R(vex)	((vex) & 0x80)	/* VEX2/3 Byte1 */
+#define X86_VEX_X(vex)	((vex) & 0x40)	/* VEX3 Byte1 */
+#define X86_VEX_B(vex)	((vex) & 0x20)	/* VEX3 Byte1 */
+#define X86_VEX_L(vex)	((vex) & 0x04)	/* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex)	((vex) & 0x1f)		/* VEX3 Byte1 */
+#define X86_VEX2_M	1			/* VEX2.M always 1 */
+#define X86_VEX_V(vex)	(((vex) & 0x78) >> 3)	/* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex)	((vex) & 0x03)		/* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX	0x1f			/* VEX3.M Maximum value */
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+	insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+	insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+	insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+	if (!insn->prefixes.got)
+		insn_get_prefixes(insn);
+	return (insn->vex_prefix.value != 0);
+}
+
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+	return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+		insn->displacement.got && insn->immediate.got;
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+	if (insn->vex_prefix.nbytes == 2)	/* 2 bytes VEX */
+		return X86_VEX2_M;
+	else
+		return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+	if (insn->vex_prefix.nbytes == 2)	/* 2 bytes VEX */
+		return X86_VEX_P(insn->vex_prefix.bytes[1]);
+	else
+		return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Get the last prefix id from last prefix or VEX prefix */
+static inline int insn_last_prefix_id(struct insn *insn)
+{
+	if (insn_is_avx(insn))
+		return insn_vex_p_bits(insn);	/* VEX_p is a SIMD prefix id */
+
+	if (insn->prefixes.bytes[3])
+		return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
+
+	return 0;
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+	return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+	return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+	return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+	return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+	return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+	return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+	return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
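
For reference (not part of the patch): a minimal sketch of how the decoder
added above is meant to be driven, assuming the caller has already copied up
to MAX_INSN_SIZE bytes of the current instruction into a local buffer; the
helper name is hypothetical.

    #include <asm/insn.h>

    /* Hypothetical helper: decode a copied instruction and return its length,
     * or 0 if the bytes could not be fully decoded. */
    static unsigned int example_insn_length(const unsigned char *buf, int long_mode)
    {
        struct insn insn;

        insn_init(&insn, buf, long_mode);   /* long_mode selects 64-bit decoding rules */
        insn_get_length(&insn);             /* pulls in prefixes/opcode/ModRM/etc. as needed */

        return insn_complete(&insn) ? insn.length : 0;
    }
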
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:31   ` Andrew Cooper
  2014-07-02 15:37   ` Jan Beulich
  2014-07-02 13:33 ` [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

Speed optimization for introspection purposes: a handful of guest
registers are now sent along with each mem_event. This requires
enlarging the mem_event_request / mem_event_response structures and
adding code to fill in the relevant values.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
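Not part of the patch, just an illustration of the intended consumer side:
a sketch of a handler that reads the newly added register block directly
from a mem_event request instead of issuing a separate
xc_vcpu_getcontext()-style round trip. The handler name is an assumption,
and the public mem_event header is assumed to be on the tools include path
as <xen/mem_event.h>.

    #include <stdint.h>
    #include <xen/mem_event.h>

    /* Hypothetical handler: consume one request taken off the mem_event ring. */
    static void example_handle_request(const mem_event_request_t *req)
    {
        /* Guest state now travels with the event itself. */
        uint64_t rip  = req->regs.rip;
        uint64_t cr3  = req->regs.cr3;
        int32_t  mode = req->regs.guest_x86_mode;

        /* ... filter/inspect based on rip, cr3 and mode ... */
        (void)rip; (void)cr3; (void)mode;
    }
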
 xen/arch/x86/hvm/hvm.c                 |   33 +++++++++++++++++
 xen/arch/x86/hvm/vmx/vmx.c             |    1 +
 xen/arch/x86/mm/p2m.c                  |   61 ++++++++++++++++++++++++++++++++
 xen/include/public/arch-x86/hvm/save.h |    4 +++
 xen/include/public/mem_event.h         |   36 +++++++++++++++++++
 5 files changed, 135 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 17ff011..f65a5f5 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -6016,6 +6016,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
     return rc;
 }
 
+static inline void hvm_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct vcpu *v = current;
+
+    req->regs.rax = regs->eax;
+    req->regs.rcx = regs->ecx;
+    req->regs.rdx = regs->edx;
+    req->regs.rbx = regs->ebx;
+    req->regs.rsp = regs->esp;
+    req->regs.rbp = regs->ebp;
+    req->regs.rsi = regs->esi;
+    req->regs.rdi = regs->edi;
+
+    req->regs.r8  = regs->r8;
+    req->regs.r9  = regs->r9;
+    req->regs.r10 = regs->r10;
+    req->regs.r11 = regs->r11;
+    req->regs.r12 = regs->r12;
+    req->regs.r13 = regs->r13;
+    req->regs.r14 = regs->r14;
+    req->regs.r15 = regs->r15;
+
+    req->regs.rflags = regs->eflags;
+    req->regs.rip    = regs->eip;
+
+    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
+    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
+    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
+}
+
 static int hvm_memory_event_traps(long p, uint32_t reason,
                                   unsigned long value, unsigned long old, 
                                   bool_t gla_valid, unsigned long gla) 
@@ -6060,6 +6092,7 @@ static int hvm_memory_event_traps(long p, uint32_t reason,
         req.gla = old;
     }
     
+    hvm_mem_event_fill_regs(&req);
     mem_event_put_request(d, &d->mem_event->access, &req);
     
     return 1;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2caa04a..fed21b6 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
     c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
 
     c->msr_efer = v->arch.hvm_vcpu.guest_efer;
+    c->guest_x86_mode = vmx_guest_x86_mode(v);
 
     __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
     __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 642ec28..93252d9 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1314,6 +1314,64 @@ void p2m_mem_paging_resume(struct domain *d)
     }
 }
 
+static inline void p2m_mem_event_fill_regs(mem_event_request_t *req)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct segment_register seg;
+    struct hvm_hw_cpu ctxt;
+    struct vcpu *v = current;
+
+    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
+
+    /* Architecture-specific vmcs/vmcb bits */
+    hvm_funcs.save_cpu_ctxt(v, &ctxt);
+
+    req->regs.rax = regs->eax;
+    req->regs.rcx = regs->ecx;
+    req->regs.rdx = regs->edx;
+    req->regs.rbx = regs->ebx;
+    req->regs.rsp = regs->esp;
+    req->regs.rbp = regs->ebp;
+    req->regs.rsi = regs->esi;
+    req->regs.rdi = regs->edi;
+
+#ifdef __x86_64__
+    req->regs.r8  = regs->r8;
+    req->regs.r9  = regs->r9;
+    req->regs.r10 = regs->r10;
+    req->regs.r11 = regs->r11;
+    req->regs.r12 = regs->r12;
+    req->regs.r13 = regs->r13;
+    req->regs.r14 = regs->r14;
+    req->regs.r15 = regs->r15;
+#endif
+
+    req->regs.rflags = regs->eflags;
+    req->regs.rip    = regs->eip;
+
+    req->regs.dr7 = v->arch.debugreg[7];
+    req->regs.cr0 = ctxt.cr0;
+    req->regs.cr2 = ctxt.cr2;
+    req->regs.cr3 = ctxt.cr3;
+    req->regs.cr4 = ctxt.cr4;
+
+    req->regs.sysenter_cs = ctxt.sysenter_cs;
+    req->regs.sysenter_esp = ctxt.sysenter_esp;
+    req->regs.sysenter_eip = ctxt.sysenter_eip;
+
+    req->regs.msr_efer = ctxt.msr_efer;
+    req->regs.msr_star = ctxt.msr_star;
+    req->regs.msr_lstar = ctxt.msr_lstar;
+
+    hvm_get_segment_register(v, x86_seg_fs, &seg);
+    req->regs.fs_base = seg.base;
+
+    hvm_get_segment_register(v, x86_seg_gs, &seg);
+    req->regs.gs_base = seg.base;
+
+    req->regs.guest_x86_mode = hvm_guest_x86_mode(current);
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1407,6 +1465,9 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     if ( p2ma != p2m_access_n2rwx )
         vcpu_pause_nosync(v);
 
+    if ( req )
+        p2m_mem_event_fill_regs(req);
+
     /* VCPU may be paused, return whether we promoted automatically */
     return (p2ma == p2m_access_n2rwx);
 }
diff --git a/xen/include/public/arch-x86/hvm/save.h b/xen/include/public/arch-x86/hvm/save.h
index 16d85a3..7b659ba 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -157,6 +157,8 @@ struct hvm_hw_cpu {
     };
     /* error code for pending event */
     uint32_t error_code;
+
+    int32_t guest_x86_mode;
 };
 
 struct hvm_hw_cpu_compat {
@@ -266,6 +268,8 @@ struct hvm_hw_cpu_compat {
     };
     /* error code for pending event */
     uint32_t error_code;
+
+    int32_t guest_x86_mode;
 };
 
 static inline int _hvm_hw_fix_cpu(void *h) {
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 3831b41..24ac67d 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -48,6 +48,41 @@
 #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
                                              does NOT honour HVMPME_onchangeonly */
 
+typedef struct mem_event_regs_st {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+    uint64_t rflags;
+    uint64_t dr7;
+    uint64_t rip;
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+    uint64_t msr_efer;
+    uint64_t msr_star;
+    uint64_t msr_lstar;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    int32_t guest_x86_mode;
+} mem_event_regs_t;
+
 typedef struct mem_event_st {
     uint32_t flags;
     uint32_t vcpu_id;
@@ -65,6 +100,7 @@ typedef struct mem_event_st {
     uint16_t available:12;
 
     uint16_t reason;
+    mem_event_regs_t regs;
 } mem_event_request_t, mem_event_response_t;
 
 DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
  2014-07-02 13:33 ` [PATCH RFC 2/9] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:35   ` Andrew Cooper
  2014-07-02 13:33 ` [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly Razvan Cojocaru
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

vmx_disable_intercept_for_msr() now refuses to disable the interception
of MSRs needed by the memory introspection library.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 xen/arch/x86/hvm/vmx/vmcs.c |   19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8ffc562..eb3f030 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
     if ( msr_bitmap == NULL )
         return;
 
+    /* Filter out MSRs needed by the memory introspection engine */
+    switch ( msr )
+    {
+    case MSR_IA32_SYSENTER_EIP:
+    case MSR_IA32_SYSENTER_ESP:
+    case MSR_IA32_SYSENTER_CS:
+    case MSR_IA32_MC0_CTL:
+    case MSR_STAR:
+    case MSR_LSTAR:
+
+        printk("Warning: cannot disable the interception of MSR "
+            "0x%08x because it is needed by the memory introspection "
+            "engine\n", msr);
+        return;
+
+    default:
+        break;
+    }
+
     /*
      * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
      * have the write-low and read-high bitmap offsets the wrong way round.
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
  2014-07-02 13:33 ` [PATCH RFC 2/9] xen: Optimize introspection access to guest state Razvan Cojocaru
  2014-07-02 13:33 ` [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:37   ` Andrew Cooper
  2014-07-02 13:33 ` [PATCH RFC 5/9] xen: Support for VMCALL mem_events Razvan Cojocaru
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

Moved an enum definition before the typedef that uses it.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 tools/libxc/xenctrl.h |   19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index af6f249..abd8947 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -119,6 +119,16 @@ typedef struct xc_interface_core xc_interface;
 typedef struct xc_interface_core xc_evtchn;
 typedef struct xc_interface_core xc_gnttab;
 typedef struct xc_interface_core xc_gntshr;
+
+enum xc_error_code {
+  XC_ERROR_NONE = 0,
+  XC_INTERNAL_ERROR = 1,
+  XC_INVALID_KERNEL = 2,
+  XC_INVALID_PARAM = 3,
+  XC_OUT_OF_MEMORY = 4,
+  /* new codes need to be added to xc_error_level_to_desc too */
+};
+
 typedef enum xc_error_code xc_error_code;
 
 
@@ -1766,15 +1776,6 @@ int xc_hvm_inject_trap(
  */
 
 
-enum xc_error_code {
-  XC_ERROR_NONE = 0,
-  XC_INTERNAL_ERROR = 1,
-  XC_INVALID_KERNEL = 2,
-  XC_INVALID_PARAM = 3,
-  XC_OUT_OF_MEMORY = 4,
-  /* new codes need to be added to xc_error_level_to_desc too */
-};
-
 #define XC_MAX_ERROR_MSG_LEN 1024
 typedef struct xc_error {
   enum xc_error_code code;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (2 preceding siblings ...)
  2014-07-02 13:33 ` [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:47   ` Jan Beulich
  2014-07-02 15:54   ` Andrew Cooper
  2014-07-02 13:33 ` [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
                   ` (5 subsequent siblings)
  9 siblings, 2 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

Added support for VMCALL events (the memory introspection library
will have the guest trigger VMCALLs, which will then be sent along
via the mem_event mechanism).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
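Not part of the patch, just a sketch of the guest side: how an in-guest
introspection agent might raise one of these events, using the magic EAX
value that vmx_vmexit_handler() checks for below. The function and macro
names are assumptions, and this only covers VMX guests (VMCALL; an SVM
guest would need VMMCALL instead).

    /* "INTR" -- must match the Introcore magic tested in vmx_vmexit_handler(). */
    #define INTROSPECTION_VMCALL_MAGIC 0x494e5452UL

    /* Hypothetical guest-side helper: raise a VMCALL mem_event. */
    static inline void example_introspection_vmcall(void)
    {
        unsigned long magic = INTROSPECTION_VMCALL_MAGIC;

        asm volatile ( "vmcall"
                       : /* no outputs */
                       : "a" (magic)
                       : "memory" );
    }
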
 xen/arch/x86/hvm/hvm.c          |    8 ++++++++
 xen/arch/x86/hvm/vmx/vmx.c      |   15 ++++++++++++++-
 xen/include/asm-x86/hvm/hvm.h   |    1 +
 xen/include/public/hvm/params.h |    4 +++-
 xen/include/public/mem_event.h  |    1 +
 5 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f65a5f5..df696d1 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -6130,6 +6130,14 @@ void hvm_memory_event_msr(unsigned long msr, unsigned long value)
                            value, ~value, 1, msr);
 }
 
+void hvm_memory_event_vmcall(unsigned long rip, unsigned long eax)
+{
+    hvm_memory_event_traps(current->domain->arch.hvm_domain
+                             .params[HVM_PARAM_MEMORY_EVENT_VMCALL],
+                           MEM_EVENT_REASON_VMCALL,
+                           rip, ~rip, 1, eax);
+}
+
 int hvm_memory_event_int3(unsigned long gla) 
 {
     uint32_t pfec = PFEC_page_present;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index fed21b6..b4c12cd 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_VMCALL:
     {
         int rc;
+        unsigned long eax = regs->eax;
+
         HVMTRACE_1D(VMMCALL, regs->eax);
-        rc = hvm_do_hypercall(regs);
+
+        if ( regs->eax != 0x494e5452 ) /* Introcore magic */
+        {
+            rc = hvm_do_hypercall(regs);
+        }
+        else
+        {
+            hvm_memory_event_vmcall(guest_cpu_user_regs()->eip, eax);
+            update_guest_eip();
+            break;
+        }
+
         if ( rc != HVM_HCALL_preempted )
         {
             update_guest_eip(); /* Safe: VMCALL */
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 90e69f5..67e365b 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -532,6 +532,7 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old);
 void hvm_memory_event_cr3(unsigned long value, unsigned long old);
 void hvm_memory_event_cr4(unsigned long value, unsigned long old);
 void hvm_memory_event_msr(unsigned long msr, unsigned long value);
+void hvm_memory_event_vmcall(unsigned long rip, unsigned long eax);
 /* Called for current VCPU on int3: returns -1 if no listener */
 int hvm_memory_event_int3(unsigned long gla);
 
diff --git a/xen/include/public/hvm/params.h b/xen/include/public/hvm/params.h
index f830bdd..ea2eee6 100644
--- a/xen/include/public/hvm/params.h
+++ b/xen/include/public/hvm/params.h
@@ -148,6 +148,8 @@
 #define HVM_PARAM_IOREQ_SERVER_PFN 32
 #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
 
-#define HVM_NR_PARAMS          34
+#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
+
+#define HVM_NR_PARAMS          35
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 24ac67d..5fa2858 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -47,6 +47,7 @@
 #define MEM_EVENT_REASON_SINGLESTEP  6    /* single step was invoked: gla/gfn are RIP */
 #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
                                              does NOT honour HVMPME_onchangeonly */
+#define MEM_EVENT_REASON_VMCALL      8    /* VMCALL: gfn is RIP, gla is EAX */
 
 typedef struct mem_event_regs_st {
     uint64_t rax;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (3 preceding siblings ...)
  2014-07-02 13:33 ` [PATCH RFC 5/9] xen: Support for VMCALL mem_events Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:51   ` Jan Beulich
  2014-07-02 13:33 ` [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
new xc_domain_set_pagefault_info() function to set per-domain page
fault injection information. This information is then used to call
hvm_inject_page_fault() at the first VMENTRY where the guest status
matches and there are no other pending traps.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
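Not part of the patch: a sketch of how an introspection tool might drive
the new libxc call. The wrapper name is an assumption, and the cr3/va
values are placeholders supplied by the caller.

    #include <xenctrl.h>

    /* Hypothetical wrapper: ask Xen to inject a #PF for 'va' in address space 'cr3'. */
    static int example_request_page_fault(xc_interface *xch, uint32_t domid,
                                          uint64_t cr3, uint64_t va, int write)
    {
        xen_domctl_set_pagefault_info_t info = {
            .address_space   = cr3,    /* guest CR3 the fault applies to */
            .virtual_address = va,     /* faulting linear address */
            .write_access    = write ? 1 : 0,
        };

        /* The fault is injected at the first VMENTRY where the guest is in
         * user mode with this CR3 and has no other pending event. */
        return xc_domain_set_pagefault_info(xch, domid, &info);
    }
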
 tools/libxc/xc_domain.c     |   17 +++++++++++++++++
 tools/libxc/xenctrl.h       |    4 ++++
 xen/arch/x86/hvm/vmx/vmx.c  |   39 +++++++++++++++++++++++++++++++++++++++
 xen/common/domain.c         |    5 +++++
 xen/common/domctl.c         |   21 +++++++++++++++++++++
 xen/include/public/domctl.h |   14 ++++++++++++++
 xen/include/xen/sched.h     |    7 +++++++
 7 files changed, 107 insertions(+)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index d5d6d12..701b65f 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -486,6 +486,23 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
     return ret;
 }
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info)
+{
+    DECLARE_DOMCTL;
+
+    if (info == NULL)
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_set_pagefault_info;
+    domctl.domain = (domid_t)domid;
+    domctl.u.set_pagefault_info.address_space = info->address_space;
+    domctl.u.set_pagefault_info.virtual_address = info->virtual_address;
+    domctl.u.set_pagefault_info.write_access = info->write_access;
+    return do_domctl(xch, &domctl);
+}
+
 int xc_vcpu_getcontext(xc_interface *xch,
                        uint32_t domid,
                        uint32_t vcpu,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index abd8947..088f02f 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -752,6 +752,10 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
 const char *xc_domain_get_native_protocol(xc_interface *xch,
                                           uint32_t domid);
 
+int xc_domain_set_pagefault_info(xc_interface *xch,
+                                 uint32_t domid,
+                                 xen_domctl_set_pagefault_info_t *info);
+
 /**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index b4c12cd..4a9a7c8 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -416,6 +416,7 @@ static void vmx_restore_dr(struct vcpu *v)
 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
 {
     unsigned long ev;
+    unsigned long cs_arbytes;
 
     vmx_vmcs_enter(v);
 
@@ -430,6 +431,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
     __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
     __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
     __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
+    __vmread(GUEST_CS_AR_BYTES, &cs_arbytes);
+
+    c->cs_arbytes = (uint32_t)cs_arbytes;
 
     c->pending_event = 0;
     c->error_code = 0;
@@ -3111,6 +3115,39 @@ out:
         nvmx_idtv_handling();
 }
 
+static void check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct hvm_hw_cpu ctxt;
+    uint32_t cs_dpl;
+
+    if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 )
+        return;
+
+    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+
+    cs_dpl = (ctxt.cs_arbytes >> 5) & 3;
+
+    if ( cs_dpl == 3 /* Guest is in user mode */
+         && !ctxt.pending_event
+         && ctxt.cr3 == d->fault_info.address_space )
+    {
+        /* Cache */
+        uint64_t virtual_address = d->fault_info.virtual_address;
+        uint32_t write_access = d->fault_info.write_access;
+
+        /* Reset */
+        d->fault_info.address_space = 0;
+        d->fault_info.virtual_address = 0;
+        d->fault_info.write_access = 0;
+
+        hvm_inject_page_fault((write_access << 1) | PFEC_user_mode,
+            virtual_address);
+    }
+}
+
 void vmx_vmenter_helper(const struct cpu_user_regs *regs)
 {
     struct vcpu *curr = current;
@@ -3151,6 +3188,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(need_flush) )
         vpid_sync_all();
 
+    check_pf_injection();
+
  out:
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
diff --git a/xen/common/domain.c b/xen/common/domain.c
index c3a576e..9402924 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -255,6 +255,11 @@ struct domain *domain_create(
 
     d->domain_id = domid;
 
+    /* Memory introspection page fault variables set-up. */
+    d->fault_info.address_space = 0;
+    d->fault_info.virtual_address = 0;
+    d->fault_info.write_access = 0;
+
     lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
 
     if ( (err = xsm_alloc_security_domain(d)) != 0 )
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 000993f..461e207 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -975,6 +975,27 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_set_pagefault_info:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domain);
+        if ( d != NULL )
+        {
+            d->fault_info.address_space =
+                op->u.set_pagefault_info.address_space;
+            d->fault_info.virtual_address =
+                op->u.set_pagefault_info.virtual_address;
+            d->fault_info.write_access =
+                op->u.set_pagefault_info.write_access;
+
+            rcu_unlock_domain(d);
+            ret = 0;
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..c8bf3f8 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -936,6 +936,18 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
 #endif
 
+/* XEN_DOMCTL_set_pagefault_info requests that a page fault be injected
+ * at the first suitable VMENTRY.
+ */
+struct xen_domctl_set_pagefault_info {
+    uint64_t address_space;
+    uint64_t virtual_address;
+    uint32_t write_access;
+};
+typedef struct xen_domctl_set_pagefault_info xen_domctl_set_pagefault_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_pagefault_info_t);
+
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -1012,6 +1024,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
 #define XEN_DOMCTL_gdbsx_domstatus             1003
+#define XEN_DOMCTL_set_pagefault_info          1004
     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
     domid_t  domain;
     union {
@@ -1068,6 +1081,7 @@ struct xen_domctl {
         struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_set_pagefault_info set_pagefault_info;
         uint8_t                             pad[128];
     } u;
 };
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index f920e1a..fe78a9a 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -447,6 +447,13 @@ struct domain
     nodemask_t node_affinity;
     unsigned int last_alloc_node;
     spinlock_t node_affinity_lock;
+
+    /* Memory introspection page fault injection data. */
+    struct {
+        uint64_t address_space;
+        uint64_t virtual_address;
+        uint32_t write_access;
+    } fault_info;
 };
 
 struct domain_setup_info
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (4 preceding siblings ...)
  2014-07-02 13:33 ` [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-07-02 13:33 ` Razvan Cojocaru
  2014-07-02 15:56   ` Jan Beulich
  2014-07-02 13:34 ` [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults Razvan Cojocaru
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Razvan Cojocaru

When a page fault triggers a mem_event, p2m_mem_access_check() can now,
based on the previous mem_event reply, either 1) emulate the current
instruction, 2) skip the current instruction, or 3) emulate it while
not allowing it to perform any writes. Since some SSE2 instructions are
problematic to emulate (Firefox uses some), support for setting the A
and D (accessed and dirty) bits has been added (please see
p2m_set_ad_bits()).

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
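Not part of the patch: a sketch of the consumer side, showing how a
mem_event reply might request the no-write emulation path. The function
name is an assumption; the flag names are the ones acted on by
p2m_mem_access_resume() below.

    #include <xen/mem_event.h>

    /* Hypothetical helper: build a reply that asks Xen to emulate the
     * faulting instruction while discarding its memory writes. */
    static void example_fill_response(const mem_event_request_t *req,
                                      mem_event_response_t *rsp)
    {
        *rsp = *req;    /* same layout: keeps vcpu_id, gfn, access bits, flags */

        /* Alternatives: MEM_EVENT_FLAG_EMULATE alone for full emulation, or
         * MEM_EVENT_FLAG_EMULATE | MEM_EVENT_FLAG_SKIP_INSTR to step over
         * the instruction without emulating it. */
        rsp->flags |= MEM_EVENT_FLAG_EMULATE | MEM_EVENT_FLAG_EMULATE_NOWRITE;
    }
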
 xen/arch/x86/domain.c          |    5 ++
 xen/arch/x86/hvm/emulate.c     |   10 ++++
 xen/arch/x86/mm/p2m.c          |  123 ++++++++++++++++++++++++++++++++++++++++
 xen/common/domain.c            |    3 +
 xen/include/asm-x86/domain.h   |    9 +++
 xen/include/public/mem_event.h |   14 +++--
 xen/include/xen/sched.h        |    5 ++
 7 files changed, 164 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..5cd283b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -407,6 +407,11 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
+    /* By default, do not emulate */
+    v->arch.mem_event.emulate_flags = 0;
+    v->arch.mem_event.gpa = 0;
+    v->arch.mem_event.eip = 0;
+
     rc = mapcache_vcpu_init(v);
     if ( rc )
         return rc;
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 1dc8c67..5e7113d 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1372,6 +1372,16 @@ void hvm_emulate_one_full(bool_t nowrite)
     switch ( rc )
     {
     case X86EMUL_UNHANDLEABLE:
+        printk("Emulation failed @ %04x:%lx: "
+               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
+               hvmemul_get_seg_reg(x86_seg_cs, ctx)->sel,
+               ctx->insn_buf_eip,
+               ctx->insn_buf[0], ctx->insn_buf[1],
+               ctx->insn_buf[2], ctx->insn_buf[3],
+               ctx->insn_buf[4], ctx->insn_buf[5],
+               ctx->insn_buf[6], ctx->insn_buf[7],
+               ctx->insn_buf[8], ctx->insn_buf[9]);
+
         hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
         break;
     case X86EMUL_EXCEPTION:
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 93252d9..4dd3f1b 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1372,12 +1372,35 @@ static inline void p2m_mem_event_fill_regs(mem_event_request_t *req)
     req->regs.guest_x86_mode = hvm_guest_x86_mode(current);
 }
 
+static void p2m_set_ad_bits(struct vcpu *v, struct p2m_domain *p2m,
+                            paddr_t ga)
+{
+    struct hvm_hw_cpu ctxt;
+    uint32_t pfec = 0;
+    const struct paging_mode *pg_mode = v->arch.paging.mode;
+
+    hvm_funcs.save_cpu_ctxt(v, &ctxt);
+
+    if ( guest_cpu_user_regs()->eip == v->sse_pg_dirty.eip
+         && ga == v->sse_pg_dirty.gla )
+    {
+        pfec = 2;
+        pg_mode->p2m_ga_to_gfn(v, p2m, ctxt.cr3, ga, &pfec, NULL);
+    }
+    else
+        pg_mode->p2m_ga_to_gfn(v, p2m, ctxt.cr3, ga, &pfec, NULL);
+
+    v->sse_pg_dirty.eip = guest_cpu_user_regs()->eip;
+    v->sse_pg_dirty.gla = ga;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
 {
     struct vcpu *v = current;
     unsigned long gfn = gpa >> PAGE_SHIFT;
+    unsigned long exit_qualification;
     struct domain *d = v->domain;    
     struct p2m_domain* p2m = p2m_get_hostp2m(d);
     mfn_t mfn;
@@ -1385,6 +1408,9 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     p2m_access_t p2ma;
     mem_event_request_t *req;
     int rc;
+    unsigned long eip = 0;
+
+    __vmread(EXIT_QUALIFICATION, &exit_qualification);
 
     /* First, handle rx2rw conversion automatically.
      * These calls to p2m->set_entry() must succeed: we have the gfn
@@ -1437,6 +1463,48 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
             return 1;
         }
     }
+    else
+    {
+        /* There's a mem_event listener */
+        if ( (exit_qualification & EPT_GLA_FAULT) == 0 ) /* don't send a mem_event */
+        {
+            v->arch.mem_event.emulate_flags = 0;
+            v->arch.mem_event.gpa = 0;
+            v->arch.mem_event.eip = 0;
+
+            p2m_set_ad_bits(v, p2m, gla);
+            return 1;
+        }
+    }
+
+    eip = guest_cpu_user_regs()->eip;
+
+    if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
+    {
+        v->arch.mem_event.emulate_flags = 0;
+        v->arch.mem_event.gpa = gpa;
+        v->arch.mem_event.eip = eip;
+    }
+
+    if ( v->arch.mem_event.emulate_flags )
+    {
+        if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_SKIP_INSTR )
+        {
+            struct hvm_emulate_ctxt ctx[1] = {};
+            int length;
+
+            hvm_emulate_prepare(ctx, guest_cpu_user_regs());
+            length = hvm_get_insn_length(ctx);
+            guest_cpu_user_regs()->eip += length;
+        }
+        else if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE )
+            hvm_emulate_one_full(1);
+        else
+            hvm_emulate_one_full(0);
+
+        v->arch.mem_event.emulate_flags = 0;
+        return 1;
+    }
 
     *req_ptr = NULL;
     req = xzalloc(mem_event_request_t);
@@ -1481,6 +1549,61 @@ void p2m_mem_access_resume(struct domain *d)
     {
         if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
             continue;
+
+        /* Mark vcpu for skipping one instruction upon rescheduling */
+        if ( rsp.flags & MEM_EVENT_FLAG_EMULATE )
+        {
+            xenmem_access_t access;
+            int violation = 1;
+
+            d->vcpu[rsp.vcpu_id]->arch.mem_event.emulate_flags = 0;
+
+            if ( p2m_get_mem_access(d, rsp.gfn, &access) == 0 )
+            {
+                violation = 0;
+
+                switch (access)
+                {
+                case XENMEM_access_n:
+                case XENMEM_access_n2rwx:
+                default:
+                    violation = rsp.access_r || rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_r:
+                    violation = rsp.access_w || rsp.access_x;
+                    break;
+
+                case XENMEM_access_w:
+                    violation = rsp.access_r || rsp.access_x;
+                    break;
+
+                case XENMEM_access_x:
+                    violation = rsp.access_r || rsp.access_w;
+                    break;
+
+                case XENMEM_access_rx:
+                case XENMEM_access_rx2rw:
+                    violation = rsp.access_w;
+                    break;
+
+                case XENMEM_access_wx:
+                    violation = rsp.access_r;
+                    break;
+
+                case XENMEM_access_rw:
+                    violation = rsp.access_x;
+                    break;
+
+                case XENMEM_access_rwx:
+                    break;
+                }
+            }
+
+            if ( violation )
+                d->vcpu[rsp.vcpu_id]->arch.mem_event.emulate_flags = rsp.flags;
+        }
+
         /* Unpause domain */
         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
             vcpu_unpause(d->vcpu[rsp.vcpu_id]);
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 9402924..44d2919 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -121,6 +121,9 @@ struct vcpu *alloc_vcpu(
     v->domain = d;
     v->vcpu_id = vcpu_id;
 
+    v->sse_pg_dirty.eip = 0;
+    v->sse_pg_dirty.gla = 0;
+
     spin_lock_init(&v->virq_lock);
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index abf55fb..0fa4d3d 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -446,6 +446,15 @@ struct arch_vcpu
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
+
+    /* Should we emulate the next matching instruction on VCPU resume
+     * after a mem_event? */
+    struct {
+        uint32_t emulate_flags;
+        unsigned long gpa;
+        unsigned long eip;
+    } mem_event;
+
 } __cacheline_aligned;
 
 /* Shorthands to improve code legibility. */
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
index 5fa2858..8880a74 100644
--- a/xen/include/public/mem_event.h
+++ b/xen/include/public/mem_event.h
@@ -31,11 +31,15 @@
 #include "io/ring.h"
 
 /* Memory event flags */
-#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
-#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
-#define MEM_EVENT_FLAG_EVICT_FAIL   (1 << 2)
-#define MEM_EVENT_FLAG_FOREIGN      (1 << 3)
-#define MEM_EVENT_FLAG_DUMMY        (1 << 4)
+#define MEM_EVENT_FLAG_VCPU_PAUSED     (1 << 0)
+#define MEM_EVENT_FLAG_DROP_PAGE       (1 << 1)
+#define MEM_EVENT_FLAG_EVICT_FAIL      (1 << 2)
+#define MEM_EVENT_FLAG_FOREIGN         (1 << 3)
+#define MEM_EVENT_FLAG_DUMMY           (1 << 4)
+#define MEM_EVENT_FLAG_EMULATE         (1 << 5)
+#define MEM_EVENT_FLAG_EMULATE_NOWRITE (1 << 6)
+#define MEM_EVENT_FLAG_SKIP_INSTR      (1 << 7)
+
 
 /* Reasons for the memory event request */
 #define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index fe78a9a..567a124 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -245,6 +245,11 @@ struct vcpu
     struct evtchn_fifo_vcpu *evtchn_fifo;
 
     struct arch_vcpu arch;
+
+    struct {
+           unsigned long eip;
+           unsigned long gla;
+    } sse_pg_dirty;
 };
 
 /* Per-domain lock can be recursively acquired in fault handlers. */
-- 
1.7.9.5


* [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (5 preceding siblings ...)
  2014-07-02 13:33 ` [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-07-02 13:34 ` Razvan Cojocaru
  2014-07-02 16:04   ` Andrew Cooper
  2014-07-02 13:34 ` [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain Razvan Cojocaru
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:34 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrei Lutas, tim, Razvan Cojocaru

The Xen emulator is incapable of handling some instructions, which
leads to the injection of an Invalid Opcode exception (#UD) inside
the guest once an unsupported instruction is encountered.
A new mechanism has been added which is able to generically re-execute
instructions, by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and what instructions
must be re-executed this way (instructions that lie in non-executable
pages and instructions that cause the setting of Accessed and/or Dirty
flags inside page tables are two examples).
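
For reference, the sequence implemented by vmx_start_reexecute_instruction()
and vmx_stop_reexecute_instruction() below can be summarized as follows. This
is only a simplified sketch, not code from the patch: reexec_sketch() is a
made-up name, and the locking, nesting-level handling and in_host
synchronisation are deliberately left out.

    static void reexec_sketch(struct vcpu *v, unsigned long gpa,
                              xenmem_access_t required_access)
    {
        struct vcpu *a;
        xenmem_access_t old_access;

        /* 1. Pause every other vcpu and (not shown here) wait for it to
         *    reach VMX root mode. */
        for_each_vcpu ( v->domain, a )
            if ( a != v )
                vcpu_pause_nosync(a);

        /* 2. Remember the current EPT access rights for this gpa, then
         *    relax them so the faulting instruction can complete. */
        p2m_get_mem_access(v->domain, gpa >> PAGE_SHIFT, &old_access);
        v->rexec_context[v->rexec_level].gpa = gpa;
        v->rexec_context[v->rexec_level].old_access = old_access;
        p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT, 1, 0, 0xff,
                           required_access);

        /* 3. Arm the monitor trap flag so a VM-exit follows the
         *    re-executed instruction. */
        v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
        vmx_update_cpu_exec_control(v);

        /* 4. The MONITOR_TRAP_FLAG exit handler then restores old_access
         *    and unpauses the other vcpus; see
         *    vmx_stop_reexecute_instruction() below. */
    }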

Signed-off-by: Andrei Lutas <vlutas@bitdefender.com>
---
 xen/arch/x86/hvm/vmx/vmx.c |   51 ++++++++++++
 xen/arch/x86/mm/p2m.c      |  188 +++++++++++++++++++++++++++++++++++++++++++-
 xen/common/domain.c        |    6 ++
 xen/include/xen/sched.h    |   17 ++++
 4 files changed, 260 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 4a9a7c8..4976215 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2568,12 +2568,60 @@ void vmx_handle_EOI_induced_exit(struct vlapic *vlapic, int vector)
     vlapic_handle_EOI_induced_exit(vlapic, vector);
 }
 
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+
+    if ( 0 == v->rexec_level )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA. */
+    for ( i = v->rexec_level - 1; i >= 0; i-- )
+    {
+        if ( 0 != p2m_set_mem_access(v->domain, v->rexec_context[i].gpa >> PAGE_SHIFT,
+                                     1, 0, 0xff, v->rexec_context[i].old_access) )
+        {
+            ret = -1;
+            return ret;
+        }
+
+        v->rexec_context[i].gpa = 0;
+    }
+
+    spin_lock(&v->domain->rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /* Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit. */
+
+    /* Step 5: We're done! */
+
+    spin_unlock(&v->domain->rexec_lock);
+
+    return ret;
+}
+
 void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
     unsigned int vector = 0;
     struct vcpu *v = current;
 
+    v->in_host = 1;
+
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
     __vmread(GUEST_RFLAGS, &regs->rflags);
@@ -3074,6 +3122,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
+        vmx_stop_reexecute_instruction(v);
         if ( v->arch.hvm_vcpu.single_step ) {
           hvm_memory_event_single_step(regs->eip);
           if ( v->domain->debugger_attached )
@@ -3191,6 +3240,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     check_pf_injection();
 
  out:
+    curr->in_host = 0;
+
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
     __vmwrite(GUEST_RIP,    regs->rip);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 4dd3f1b..ff67b09 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
 #include <public/mem_event.h>
 #include <asm/mem_sharing.h>
 #include <xen/event.h>
+#include <xen/hypercall.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
 #include <xsm/xsm.h>
@@ -1394,6 +1395,181 @@ static void p2m_set_ad_bits(struct vcpu *v, struct p2m_domain *p2m,
     v->sse_pg_dirty.gla = ga;
 }
 
+static int vmx_start_reexecute_instruction(struct vcpu *v,
+                                           unsigned long gpa,
+                                           xenmem_access_t required_access)
+{
+    /* NOTE: Some required_accesses may be invalid. For example, one
+     * cannot grant only write access on a given page; read/write
+     * access must be granted instead. These inconsistencies are NOT
+     * checked here. The caller must ensure that "required_access" is
+     * an allowed combination. */
+
+    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;
+    xenmem_access_t old_access, new_access;
+    struct vcpu *a;
+
+    spin_lock(&v->domain->rexec_lock);
+
+    level = v->rexec_level;
+
+    /* Step 1: Make sure someone else didn't get to start an
+     * instruction re-execution */
+    for_each_vcpu ( v->domain, a )
+    {
+        /* We're interested in pausing all the VCPUs except self/v. */
+        if ( a == v )
+            continue;
+
+        /* Check if "a" started an instruction re-execution. If so,
+         * return success, as we'll re-execute our instruction later. */
+        if ( 0 != a->rexec_level )
+        {
+            /* We should be paused. */
+            ret = 0;
+            leave = 1;
+            goto release_and_exit;
+        }
+    }
+
+    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+    if ( level >= REEXECUTION_MAX_DEPTH )
+    {
+        ret = -1;
+        leave = 1;
+        goto release_and_exit;
+    }
+
+    /* Step 2: Pause all the VCPUs, except self. Note that we have to do
+     * this only if we're at nesting level 0; if we're at a higher level
+     * of nested re-exec, the vcpus are already paused. */
+    if ( 0 == level )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, NO SYNC! We're gonna do our own syncing. */
+            vcpu_pause_nosync(a);
+        }
+
+        /* Step 3: Wait for all the paused VCPUs to actually leave the VMX
+         * non-root realm and enter VMX root. */
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, synced. */
+            while ( !a->in_host )
+                cpu_relax();
+        }
+    }
+
+    /* Update the re-execution nesting level. */
+    v->rexec_level++;
+
+release_and_exit:
+    spin_unlock(&v->domain->rexec_lock);
+
+    /* If we've got errors so far, return. */
+    if ( leave )
+        return ret;
+
+    /* Step 4: Save the current gpa & old access rights. Also, check if this
+     * is a "double-fault" on the exact same GPA, in which case, we will
+     * promote the rights of this particular GPA, and try again. */
+    for ( i = 0; i < level; i++ )
+    {
+        if (v->rexec_context[i].gpa == gpa)
+        {
+            /* This GPA is already in the queue. */
+
+            found = 1;
+
+            switch (v->rexec_context[i].cur_access) {
+                case XENMEM_access_r: r = 1; break;
+                case XENMEM_access_w: w = 1; break;
+                case XENMEM_access_x: x = 1; break;
+                case XENMEM_access_rx: r = x = 1; break;
+                case XENMEM_access_wx: w = x = 1;  break;
+                case XENMEM_access_rw: r = w = 1; break;
+                case XENMEM_access_rwx: r = w = x = 1; break;
+                default: break; // We don't care about any other case.
+            }
+        }
+    }
+
+    /* Get the current EPT access rights. They will be restored when we're done.
+     * Note that the restoration is done in reverse-order, in order to ensure
+ * that the original access rights are restored correctly. Otherwise, we may
+     * restore whatever access rights were modified by another re-execution
+     * request, and that would be bad. */
+    if ( 0 != p2m_get_mem_access(v->domain, gpa >> PAGE_SHIFT, &old_access) )
+        return -1;
+
+    v->rexec_context[level].gpa = gpa;
+    v->rexec_context[level].old_access = old_access;
+
+    /* Step 5: Mark the GPA with the required access, so we can re-execute
+     * the instruction. */
+    switch ( required_access )
+    {
+        case XENMEM_access_r: r = 1; break;
+        case XENMEM_access_w: w = 1; break;
+        case XENMEM_access_x: x = 1; break;
+        case XENMEM_access_rx: r = x = 1; break;
+        case XENMEM_access_wx: w = x = 1;  break;
+        case XENMEM_access_rw: r = w = 1; break;
+        case XENMEM_access_rwx: r = w = x = 1; break;
+        default: break; // We don't care about any other case.
+    }
+
+    /* Now transform our RWX values in a XENMEM_access_* constant. */
+    if ( 0 == r && 0 == w && 0 == x )
+        new_access = XENMEM_access_n;
+    else if ( 0 == r && 0 == w && 1 == x )
+        new_access = XENMEM_access_x;
+    else if ( 0 == r && 1 == w && 0 == x )
+        new_access = XENMEM_access_w;
+    else if ( 0 == r && 1 == w && 1 == x )
+        new_access = XENMEM_access_wx;
+    else if ( 1 == r && 0 == w && 0 == x )
+        new_access = XENMEM_access_r;
+    else if ( 1 == r && 0 == w && 1 == x )
+        new_access = XENMEM_access_rx;
+    else if ( 1 == r && 1 == w && 0 == x )
+        new_access = XENMEM_access_rw;
+    else if ( 1 == r && 1 == w && 1 == x )
+        new_access = XENMEM_access_rwx;
+    else
+        new_access = required_access; /* Should never get here. */
+
+    /* And save the current access rights. */
+    v->rexec_context[level].cur_access = new_access;
+
+    /* Apply the changes inside the EPT. */
+    if ( 0 != p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT,
+                                 1, 0, 0xff, new_access) )
+        return -1;
+
+    /* Step 6: Reconfigure the VMCS, so it suits our needs. We want a
+     * VM-exit to be generated after the instruction has been
+     * successfully re-executed. */
+    if ( 0 == level )
+    {
+        v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
+        vmx_update_cpu_exec_control(v);
+    }
+
+    /* Step 8: We should be done! */
+
+    return ret;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1472,7 +1648,10 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
             v->arch.mem_event.gpa = 0;
             v->arch.mem_event.eip = 0;
 
-            p2m_set_ad_bits(v, p2m, gla);
+            if ( 0 == gpa )
+                p2m_set_ad_bits(v, p2m, gla);
+            else
+                vmx_start_reexecute_instruction(v, gpa, XENMEM_access_rw);
             return 1;
         }
     }
@@ -1500,7 +1679,12 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
         else if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE )
             hvm_emulate_one_full(1);
         else
-            hvm_emulate_one_full(0);
+        {
+            if ( access_x )
+                vmx_start_reexecute_instruction(v, gpa, XENMEM_access_x);
+            else
+                hvm_emulate_one_full(0);
+        }
 
         v->arch.mem_event.emulate_flags = 0;
         return 1;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 44d2919..175c898 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -124,6 +124,10 @@ struct vcpu *alloc_vcpu(
     v->sse_pg_dirty.eip = 0;
     v->sse_pg_dirty.gla = 0;
 
+    v->rexec_level = 0;
+    memset(v->rexec_context, 0, sizeof(v->rexec_context));
+    v->in_host = 0;
+
     spin_lock_init(&v->virq_lock);
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
@@ -263,6 +267,8 @@ struct domain *domain_create(
     d->fault_info.virtual_address = 0;
     d->fault_info.write_access = 0;
 
+    spin_lock_init(&d->rexec_lock);
+
     lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
 
     if ( (err = xsm_alloc_security_domain(d)) != 0 )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 567a124..07ee19f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -24,6 +24,7 @@
 #include <public/sysctl.h>
 #include <public/vcpu.h>
 #include <public/mem_event.h>
+#include <public/memory.h>
 #include <public/event_channel.h>
 
 #ifdef CONFIG_COMPAT
@@ -250,6 +251,20 @@ struct vcpu
            unsigned long eip;
            unsigned long gla;
     } sse_pg_dirty;
+
+#define REEXECUTION_MAX_DEPTH 8
+
+    struct rexec_context_t {
+        unsigned long gpa;
+        xenmem_access_t old_access;
+        xenmem_access_t cur_access;
+    } rexec_context[REEXECUTION_MAX_DEPTH];
+
+    int rexec_level;
+
+    /* Will be true when the vcpu is in VMX root,
+     * false when it is not. */
+    bool_t in_host;
 };
 
 /* Per-domain lock can be recursively acquired in fault handlers. */
@@ -459,6 +474,8 @@ struct domain
         uint64_t virtual_address;
         uint32_t write_access;
     } fault_info;
+
+    spinlock_t rexec_lock;
 };
 
 struct domain_setup_info
-- 
1.7.9.5


* [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (6 preceding siblings ...)
  2014-07-02 13:34 ` [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults Razvan Cojocaru
@ 2014-07-02 13:34 ` Razvan Cojocaru
  2014-07-03 10:19   ` Jan Beulich
  2014-07-02 15:20 ` [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Andrew Cooper
  2014-07-02 15:21 ` Jan Beulich
  9 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 13:34 UTC (permalink / raw)
  To: xen-devel; +Cc: Mihai Dontu, tim, Razvan Cojocaru

This goes together with the mem-event API changes and marks certain
pages as being controlled from outside the HV (a user domain in our
case). This prevents Xen from resetting the permissions in certain
cases, enforcing the previously expressed intention of receiving a
memory event every time the owning domain triggers a fault.

This enhancement makes use of an unused bit in the EPT-PTE entry (vmx)
and adjusts the definitions of get_entry() and set_entry() to carry a
variable controlling this bit. It would probably have been better to add
a new access type to p2m_access_t (which I tried), but in testing it
caused subtle failures in the application using the mem-event API (the
domains themselves seemed to work just fine though).
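
As an illustration of the intended use (a hedged sketch only, not code from
this patch: mem_ev_usage_sketch() is a made-up name, and the calls simply use
the signatures as modified below), a mem-event listener sets the new bit when
restricting a page, and the nested page fault handler consults it before
undoing the restriction:

    static void mem_ev_usage_sketch(struct domain *d, unsigned long gfn,
                                    bool_t access_w)
    {
        struct p2m_domain *p2m = p2m_get_hostp2m(d);
        p2m_type_t t;
        p2m_access_t a;
        bool_t mem_ev = 0;

        /* The listener restricts the page and marks it as controlled via
         * the mem-event API (new last argument). */
        p2m_set_mem_access(d, gfn, 1, 0, MEMOP_CMD_MASK, XENMEM_access_r, 1);

        /* The nested page fault path reads the flag back ... */
        get_gfn_type_access(p2m, gfn, &t, &a, P2M_ALLOC, NULL, &mem_ev);

        /* ... and, for such pages, skips the log-dirty reset that would
         * otherwise replace the restricted permissions. */
        if ( access_w && !mem_ev )
            p2m_change_type_one(d, gfn, p2m_ram_logdirty, p2m_ram_rw);

        put_gfn(d, gfn);
    }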

Signed-off-by: Mihai Dontu <mdontu@bitdefender.com>
---
 xen/arch/x86/hvm/hvm.c            |    5 +--
 xen/arch/x86/hvm/svm/svm.c        |    4 +--
 xen/arch/x86/hvm/vmx/vmx.c        |    2 +-
 xen/arch/x86/mm/hap/nested_hap.c  |    2 +-
 xen/arch/x86/mm/mem_access.c      |    2 +-
 xen/arch/x86/mm/mem_sharing.c     |    4 +--
 xen/arch/x86/mm/p2m-ept.c         |   13 +++++---
 xen/arch/x86/mm/p2m-pod.c         |   12 +++----
 xen/arch/x86/mm/p2m-pt.c          |    7 +++--
 xen/arch/x86/mm/p2m.c             |   63 +++++++++++++++++++------------------
 xen/include/asm-x86/hvm/vmx/vmx.h |    5 ++-
 xen/include/asm-x86/p2m.h         |   21 +++++++------
 12 files changed, 77 insertions(+), 63 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index df696d1..952aa06 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2628,6 +2628,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa,
     int rc, fall_through = 0, paged = 0;
     int sharing_enomem = 0;
     mem_event_request_t *req_ptr = NULL;
+    bool_t mem_ev = 0;
 
     /* On Nested Virtualization, walk the guest page table.
      * If this succeeds, all is fine.
@@ -2683,7 +2684,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa,
 
     p2m = p2m_get_hostp2m(v->domain);
     mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 
-                              P2M_ALLOC | (access_w ? P2M_UNSHARE : 0), NULL);
+                              P2M_ALLOC | (access_w ? P2M_UNSHARE : 0), NULL, &mem_ev);
 
     /* Check access permissions first, then handle faults */
     if ( mfn_x(mfn) != INVALID_MFN )
@@ -2775,7 +2776,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa,
          * a large page, we do not change other pages type within that large
          * page.
          */
-        if ( access_w )
+        if ( access_w && !mem_ev )
         {
             paging_mark_dirty(v->domain, mfn_x(mfn));
             p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 76616ac..55b2000 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1420,7 +1420,7 @@ static void svm_do_nested_pgfault(struct vcpu *v,
         p2m = p2m_get_p2m(v);
         _d.gpa = gpa;
         _d.qualification = 0;
-        mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0);
+        mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0, NULL);
         _d.mfn = mfn_x(mfn);
         
         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
@@ -1441,7 +1441,7 @@ static void svm_do_nested_pgfault(struct vcpu *v,
     if ( p2m == NULL )
         p2m = p2m_get_p2m(v);
     /* Everything else is an error. */
-    mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0);
+    mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0, NULL);
     gdprintk(XENLOG_ERR,
          "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
          gpa, mfn_x(mfn), p2mt);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 4976215..3ed5ab3 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2580,7 +2580,7 @@ static int vmx_stop_reexecute_instruction(struct vcpu *v)
     for ( i = v->rexec_level - 1; i >= 0; i-- )
     {
         if ( 0 != p2m_set_mem_access(v->domain, v->rexec_context[i].gpa >> PAGE_SHIFT,
-                                     1, 0, 0xff, v->rexec_context[i].old_access) )
+                                     1, 0, 0xff, v->rexec_context[i].old_access, 0) )
         {
             ret = -1;
             return ret;
diff --git a/xen/arch/x86/mm/hap/nested_hap.c b/xen/arch/x86/mm/hap/nested_hap.c
index 137a87c..9b1b973 100644
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -169,7 +169,7 @@ nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa,
 
     /* walk L0 P2M table */
     mfn = get_gfn_type_access(p2m, L1_gpa >> PAGE_SHIFT, p2mt, p2ma,
-                              0, page_order);
+                              0, page_order, NULL);
 
     rc = NESTEDHVM_PAGEFAULT_DIRECT_MMIO;
     if ( *p2mt == p2m_mmio_direct )
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index e8465a5..4f4eaed 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -74,7 +74,7 @@ int mem_access_memop(unsigned long cmd,
             break;
 
         rc = p2m_set_mem_access(d, mao.pfn, mao.nr, start_iter,
-                                MEMOP_CMD_MASK, mao.access);
+                                MEMOP_CMD_MASK, mao.access, 1);
         if ( rc > 0 )
         {
             ASSERT(!(rc & MEMOP_CMD_MASK));
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7293f31..a0757ec 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1274,7 +1274,7 @@ int relinquish_shared_pages(struct domain *d)
 
         if ( atomic_read(&d->shr_pages) == 0 )
             break;
-        mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
+        mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
         if ( mfn_valid(mfn) && (t == p2m_ram_shared) )
         {
             /* Does not fail with ENOMEM given the DESTROY flag */
@@ -1284,7 +1284,7 @@ int relinquish_shared_pages(struct domain *d)
              * unshare.  Must succeed: we just read the old entry and
              * we hold the p2m lock. */
             set_rc = p2m->set_entry(p2m, gfn, _mfn(0), PAGE_ORDER_4K,
-                                    p2m_invalid, p2m_access_rwx);
+                                    p2m_invalid, p2m_access_rwx, 0);
             ASSERT(set_rc == 0);
             count += 0x10;
         }
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 15c6e83..03c90e7 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -244,7 +244,7 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
         epte->mfn += i * trunk;
         epte->snp = (iommu_enabled && iommu_snoop);
         ASSERT(!epte->rsvd1);
-        ASSERT(!epte->avail3);
+        /* ASSERT(!epte->avail3); */
 
         ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
 
@@ -608,7 +608,7 @@ bool_t ept_handle_misconfig(uint64_t gpa)
  */
 static int
 ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
+              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma, bool_t mem_ev)
 {
     ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
@@ -743,6 +743,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         new_entry.sa_p2mt = p2mt;
         new_entry.access = p2ma;
         new_entry.snp = (iommu_enabled && iommu_snoop);
+        new_entry.mem_ev = mem_ev;
 
         /* the caller should take care of the previous page */
         new_entry.mfn = mfn_x(mfn);
@@ -799,8 +800,8 @@ out:
 
 /* Read ept p2m entries */
 static mfn_t ept_get_entry(struct p2m_domain *p2m,
-                           unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
-                           p2m_query_t q, unsigned int *page_order)
+                           unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+                           p2m_query_t q, unsigned int *page_order, bool_t *mem_ev)
 {
     ept_entry_t *table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
     unsigned long gfn_remainder = gfn;
@@ -814,6 +815,8 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
 
     *t = p2m_mmio_dm;
     *a = p2m_access_n;
+    if ( mem_ev )
+        *mem_ev = 0;
 
     /* This pfn is higher than the highest the p2m map currently holds */
     if ( gfn > p2m->max_mapped_pfn )
@@ -879,6 +882,8 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
         else
             *t = ept_entry->sa_p2mt;
         *a = ept_entry->access;
+        if ( mem_ev )
+            *mem_ev = ept_entry->mem_ev;
 
         mfn = _mfn(ept_entry->mfn);
         if ( i )
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index bd4c7c8..289a377 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -536,7 +536,7 @@ recount:
         p2m_access_t a;
         p2m_type_t t;
 
-        (void)p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);
+        (void)p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL, NULL);
 
         if ( t == p2m_populate_on_demand )
             pod++;
@@ -587,7 +587,7 @@ recount:
         p2m_type_t t;
         p2m_access_t a;
 
-        mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);
+        mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL, NULL);
         if ( t == p2m_populate_on_demand )
         {
             p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid,
@@ -676,7 +676,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
     for ( i=0; i<SUPERPAGE_PAGES; i++ )
     {
         p2m_access_t a; 
-        mfn = p2m->get_entry(p2m, gfn + i, &type, &a, 0, NULL);
+        mfn = p2m->get_entry(p2m, gfn + i, &type, &a, 0, NULL, NULL);
 
         if ( i == 0 )
         {
@@ -808,7 +808,7 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
     for ( i=0; i<count; i++ )
     {
         p2m_access_t a;
-        mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, 0, NULL);
+        mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, 0, NULL, NULL);
         /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
            elsewhere, map it; otherwise, skip. */
         if ( p2m_is_ram(types[i])
@@ -947,7 +947,7 @@ p2m_pod_emergency_sweep(struct p2m_domain *p2m)
     for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
     {
         p2m_access_t a;
-        (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL);
+        (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL, NULL);
         if ( p2m_is_ram(t) )
         {
             gfns[j] = i;
@@ -1135,7 +1135,7 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
     for ( i = 0; i < (1UL << order); i++ )
     {
         p2m_access_t a;
-        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
         if ( p2m_is_ram(ot) )
         {
             P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index 085ab6f..4242b3b 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -481,7 +481,8 @@ int p2m_pt_handle_deferred_changes(uint64_t gpa)
 /* Returns: 0 for success, -errno for failure */
 static int
 p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
-                 unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
+                 unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma,
+                 bool_t mem_ev)
 {
     /* XXX -- this might be able to be faster iff current->domain == d */
     void *table;
@@ -688,7 +689,7 @@ static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
 static mfn_t
 p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
                  p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
-                 unsigned int *page_order)
+                 unsigned int *page_order, bool_t* mem_ev)
 {
     mfn_t mfn;
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
@@ -707,6 +708,8 @@ p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
     *t = p2m_mmio_dm;
     /* Not implemented except with EPT */
     *a = p2m_access_rwx; 
+    if ( mem_ev )
+        *mem_ev = 0;
 
     if ( gfn > p2m->max_mapped_pfn )
         /* This pfn is higher than the highest the p2m map currently holds */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index ff67b09..41fd120 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -240,7 +240,7 @@ void p2m_memory_type_changed(struct domain *d)
 
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
-                    unsigned int *page_order, bool_t locked)
+                    unsigned int *page_order, bool_t locked, bool_t *mem_ev)
 {
     mfn_t mfn;
 
@@ -260,7 +260,7 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
         /* Grab the lock here, don't release until put_gfn */
         gfn_lock(p2m, gfn, 0);
 
-    mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
+    mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, mem_ev);
 
     if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
     {
@@ -269,7 +269,7 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
          * sleeping. */
         if ( mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0 )
             (void)mem_sharing_notify_enomem(p2m->domain, gfn, 0);
-        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
+        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, mem_ev);
     }
 
     if (unlikely((p2m_is_broken(*t))))
@@ -312,7 +312,7 @@ struct page_info *get_page_from_gfn_p2m(
     {
         /* Fast path: look up and get out */
         p2m_read_lock(p2m);
-        mfn = __get_gfn_type_access(p2m, gfn, t, a, 0, NULL, 0);
+        mfn = __get_gfn_type_access(p2m, gfn, t, a, 0, NULL, 0, NULL);
         if ( p2m_is_any_ram(*t) && mfn_valid(mfn)
              && !((q & P2M_UNSHARE) && p2m_is_shared(*t)) )
         {
@@ -340,7 +340,7 @@ struct page_info *get_page_from_gfn_p2m(
     }
 
     /* Slow path: take the write lock and do fixups */
-    mfn = get_gfn_type_access(p2m, gfn, t, a, q, NULL);
+    mfn = get_gfn_type_access(p2m, gfn, t, a, q, NULL, NULL);
     if ( p2m_is_ram(*t) && mfn_valid(mfn) )
     {
         page = mfn_to_page(mfn);
@@ -373,7 +373,7 @@ int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         else
             order = 0;
 
-        set_rc = p2m->set_entry(p2m, gfn, mfn, order, p2mt, p2ma);
+        set_rc = p2m->set_entry(p2m, gfn, mfn, order, p2mt, p2ma, 0);
         if ( set_rc )
             rc = set_rc;
 
@@ -537,7 +537,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
     {
         for ( i = 0; i < (1UL << page_order); i++ )
         {
-            mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL);
+            mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL, NULL);
             if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) )
                 set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
             ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) );
@@ -600,7 +600,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
     /* First, remove m->p mappings for existing p->m mappings */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
         if ( p2m_is_shared(ot) )
         {
             /* Do an unshare to cleanly take care of all corner 
@@ -624,7 +624,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                 (void)mem_sharing_notify_enomem(p2m->domain, gfn + i, 0);
                 return rc;
             }
-            omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+            omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
             ASSERT(!p2m_is_shared(ot));
         }
         if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
@@ -672,7 +672,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
              * address */
             P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
                       mfn + i, ogfn, gfn + i);
-            omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL);
+            omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL, NULL);
             if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
             {
                 ASSERT(mfn_valid(omfn));
@@ -739,7 +739,7 @@ int p2m_change_type_one(struct domain *d, unsigned long gfn,
 
     gfn_lock(p2m, gfn, 0);
 
-    mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL, NULL);
     rc = likely(pt == ot)
          ? p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt,
                          p2m->default_access)
@@ -823,7 +823,7 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
         return -EIO;
 
     gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
         p2m_unlock(p2m);
@@ -872,7 +872,7 @@ int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn)
         return -EIO;
 
     gfn_lock(p2m, gfn, 0);
-    mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(mfn)) || (t != p2m_mmio_direct) )
@@ -904,7 +904,7 @@ int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
         return -EIO;
 
     gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
     /* At the moment we only allow p2m change if gfn has already been made
      * sharable first */
     ASSERT(p2m_is_shared(ot));
@@ -956,7 +956,7 @@ int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn)
 
     gfn_lock(p2m, gfn, 0);
 
-    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
 
     /* Check if mfn is valid */
     if ( !mfn_valid(mfn) )
@@ -1018,7 +1018,7 @@ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn)
     gfn_lock(p2m, gfn, 0);
 
     /* Get mfn */
-    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
     if ( unlikely(!mfn_valid(mfn)) )
         goto out;
 
@@ -1144,7 +1144,7 @@ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
 
     /* Fix p2m mapping */
     gfn_lock(p2m, gfn, 0);
-    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
     /* Allow only nominated or evicted pages to enter page-in path */
     if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged )
     {
@@ -1206,7 +1206,7 @@ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer)
 
     gfn_lock(p2m, gfn, 0);
 
-    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
 
     ret = -ENOENT;
     /* Allow missing pages */
@@ -1297,7 +1297,7 @@ void p2m_mem_paging_resume(struct domain *d)
         if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) )
         {
             gfn_lock(p2m, rsp.gfn, 0);
-            mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, 0, NULL);
+            mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, 0, NULL, NULL);
             /* Allow only pages which were prepared properly, or pages which
              * were nominated but not evicted */
             if ( mfn_valid(mfn) && (p2mt == p2m_ram_paging_in) )
@@ -1553,7 +1553,7 @@ release_and_exit:
 
     /* Apply the changes inside the EPT. */
     if ( 0 != p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT,
-                                 1, 0, 0xff, new_access) )
+                                 1, 0, 0xff, new_access, 0) )
         return -1;
 
     /* Step 6: Reconfigure the VMCS, so it suits our needs. We want a
@@ -1592,11 +1592,11 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
      * These calls to p2m->set_entry() must succeed: we have the gfn
      * locked and just did a successful get_entry(). */
     gfn_lock(p2m, gfn, 0);
-    mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+    mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
 
     if ( access_w && p2ma == p2m_access_rx2rw ) 
     {
-        rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw);
+        rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw, 0);
         ASSERT(rc == 0);
         gfn_unlock(p2m, gfn, 0);
         return 1;
@@ -1605,7 +1605,7 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
     {
         ASSERT(access_w || access_r || access_x);
         rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
-                            p2mt, p2m_access_rwx);
+                            p2mt, p2m_access_rwx, 0);
         ASSERT(rc == 0);
     }
     gfn_unlock(p2m, gfn, 0);
@@ -1625,14 +1625,14 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
         else
         {
             gfn_lock(p2m, gfn, 0);
-            mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+            mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
             if ( p2ma != p2m_access_n2rwx )
             {
                 /* A listener is not required, so clear the access
                  * restrictions.  This set must succeed: we have the
                  * gfn locked and just did a successful get_entry(). */
                 rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
-                                    p2mt, p2m_access_rwx);
+                                    p2mt, p2m_access_rwx, 0);
                 ASSERT(rc == 0);
             }
             gfn_unlock(p2m, gfn, 0);
@@ -1797,7 +1797,8 @@ void p2m_mem_access_resume(struct domain *d)
 /* Set access type for a region of pfns.
  * If start_pfn == -1ul, sets the default access type */
 long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
-                        uint32_t start, uint32_t mask, xenmem_access_t access)
+                        uint32_t start, uint32_t mask, xenmem_access_t access,
+                        bool_t mem_ev)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     p2m_access_t a, _a;
@@ -1842,8 +1843,8 @@ long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
     p2m_lock(p2m);
     for ( pfn += start; nr > start; ++pfn )
     {
-        mfn = p2m->get_entry(p2m, pfn, &t, &_a, 0, NULL);
-        rc = p2m->set_entry(p2m, pfn, mfn, PAGE_ORDER_4K, t, a);
+        mfn = p2m->get_entry(p2m, pfn, &t, &_a, 0, NULL, NULL);
+        rc = p2m->set_entry(p2m, pfn, mfn, PAGE_ORDER_4K, t, a, mem_ev);
         if ( rc )
             break;
 
@@ -1891,12 +1892,12 @@ int p2m_get_mem_access(struct domain *d, unsigned long pfn,
     }
 
     gfn_lock(p2m, gfn, 0);
-    mfn = p2m->get_entry(p2m, pfn, &t, &a, 0, NULL);
+    mfn = p2m->get_entry(p2m, pfn, &t, &a, 0, NULL, NULL);
     gfn_unlock(p2m, gfn, 0);
 
     if ( mfn_x(mfn) == INVALID_MFN )
         return -ESRCH;
-    
+
     if ( (unsigned) a >= ARRAY_SIZE(memaccess) )
         return -ERANGE;
 
@@ -2130,7 +2131,7 @@ void audit_p2m(struct domain *d,
             continue;
         }
 
-        p2mfn = get_gfn_type_access(p2m, gfn, &type, &p2ma, 0, NULL);
+        p2mfn = get_gfn_type_access(p2m, gfn, &type, &p2ma, 0, NULL, NULL);
         if ( mfn_x(p2mfn) != mfn )
         {
             mpbad++;
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index c8bb548..20270e8 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -46,7 +46,10 @@ typedef union {
         access      :   4,  /* bits 61:58 - p2m_access_t */
         tm          :   1,  /* bit 62 - VT-d transient-mapping hint in
                                shared EPT/VT-d usage */
-        avail3      :   1;  /* bit 63 - Software available 3 */
+        mem_ev      :   1;  /* bit 63 - Used to determine if the permissions
+                               on this page have been set via the mem-events
+                               API. If yes, then we should not ever reset it
+                               when handling a page fault */
     };
     u64 epte;
 } ept_entry_t;
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 0ddbadb..86614d3 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -239,13 +239,13 @@ struct p2m_domain {
                                        unsigned long gfn,
                                        mfn_t mfn, unsigned int page_order,
                                        p2m_type_t p2mt,
-                                       p2m_access_t p2ma);
+                                       p2m_access_t p2ma, bool_t mem_ev);
     mfn_t              (*get_entry   )(struct p2m_domain *p2m,
                                        unsigned long gfn,
                                        p2m_type_t *p2mt,
                                        p2m_access_t *p2ma,
                                        p2m_query_t q,
-                                       unsigned int *page_order);
+                                       unsigned int *page_order, bool_t *mem_ev);
     void               (*change_entry_type_global)(struct p2m_domain *p2m,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
@@ -328,7 +328,7 @@ struct p2m_domain *p2m_get_p2m(struct vcpu *v);
 
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
-                    unsigned int *page_order, bool_t locked);
+                    unsigned int *page_order, bool_t locked, bool_t *mem_ev);
 
 /* Read a particular P2M table, mapping pages as we go.  Most callers
  * should _not_ call this directly; use the other get_gfn* functions
@@ -337,8 +337,8 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
  * If the lookup succeeds, the return value is != INVALID_MFN and 
  * *page_order is filled in with the order of the superpage (if any) that
  * the entry was found in.  */
-#define get_gfn_type_access(p, g, t, a, q, o)   \
-        __get_gfn_type_access((p), (g), (t), (a), (q), (o), 1)
+#define get_gfn_type_access(p, g, t, a, q, o, m)   \
+        __get_gfn_type_access((p), (g), (t), (a), (q), (o), 1, m)
 
 /* General conversion function from gfn to mfn */
 static inline mfn_t get_gfn_type(struct domain *d,
@@ -346,7 +346,7 @@ static inline mfn_t get_gfn_type(struct domain *d,
                                     p2m_query_t q)
 {
     p2m_access_t a;
-    return get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, q, NULL);
+    return get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, q, NULL, NULL);
 }
 
 /* Syntactic sugar: most callers will use one of these. 
@@ -382,7 +382,7 @@ static inline mfn_t get_gfn_query_unlocked(struct domain *d,
                                            p2m_type_t *t)
 {
     p2m_access_t a;
-    return __get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, 0, NULL, 0);
+    return __get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, 0, NULL, 0, NULL);
 }
 
 /* Atomically look up a GFN and take a reference count on the backing page.
@@ -464,9 +464,9 @@ do {                                                    \
 
     /* Now do the gets */
     *first_mfn  = get_gfn_type_access(p2m_get_hostp2m(rval->first_domain), 
-                                      rval->first_gfn, first_t, first_a, q, NULL);
+                                      rval->first_gfn, first_t, first_a, q, NULL, NULL);
     *second_mfn = get_gfn_type_access(p2m_get_hostp2m(rval->second_domain), 
-                                      rval->second_gfn, second_t, second_a, q, NULL);
+                                      rval->second_gfn, second_t, second_a, q, NULL, NULL);
 }
 
 static inline void put_two_gfns(struct two_gfns *arg)
@@ -606,7 +606,8 @@ void p2m_mem_access_resume(struct domain *d);
 /* Set access type for a region of pfns.
  * If start_pfn == -1ul, sets the default access type */
 long p2m_set_mem_access(struct domain *d, unsigned long start_pfn, uint32_t nr,
-                        uint32_t start, uint32_t mask, xenmem_access_t access);
+                        uint32_t start, uint32_t mask, xenmem_access_t access,
+                        bool_t mem_ev);
 
 /* Get access type for a pfn
  * If pfn == -1ul, gets the default access type */
-- 
1.7.9.5


* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (7 preceding siblings ...)
  2014-07-02 13:34 ` [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain Razvan Cojocaru
@ 2014-07-02 15:20 ` Andrew Cooper
  2014-07-03  7:42   ` Razvan Cojocaru
  2014-07-02 15:21 ` Jan Beulich
  9 siblings, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 15:20 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 02/07/14 14:33, Razvan Cojocaru wrote:
> Added support for emulating an instruction with no memory writes and
> for retrieving the length of the next instruction. Additionally,
> introduced hvm_emulate_one_full(bool_t nowrite), which acts upon all
> possible return values from the hvm_emulate_one() functions (RETRY,
> EXCEPTION, UNHANDLEABLE).
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

xen/arch/x86/x86_emulate/ is the core of the emulation in Xen, and this
looks very much as if it should be part of that, rather than wedged on the
side of the hypervisor.

Amongst other things, x86_emulate already contains instruction decode
tables, so it would appear that extending x86_emulate would result in
less code duplication.
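
(To sketch that direction using only the interfaces already visible in this
patch: the length could be derived from how far x86_emulate() advances eip
when the write handler is replaced with a discarding one. This is
illustrative only; hvm_insn_length_sketch() is a made-up helper, it assumes a
context already filled in by hvm_emulate_prepare(), it actually emulates the
instruction rather than merely decoding it, and the eip delta is only
meaningful for instructions that fall through to the next one.)

    static int hvm_insn_length_sketch(struct hvm_emulate_ctxt *hvmemul_ctxt)
    {
        struct x86_emulate_ops local_ops = hvm_emulate_ops;
        unsigned long start_eip = hvmemul_ctxt->ctxt.regs->eip;

        local_ops.write = hvmemul_write_dummy;   /* discard memory writes */

        if ( x86_emulate(&hvmemul_ctxt->ctxt, &local_ops) != X86EMUL_OKAY )
            return -1;

        return hvmemul_ctxt->ctxt.regs->eip - start_eip;
    }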

~Andrew

> ---
>  xen/arch/x86/Makefile             |    2 +
>  xen/arch/x86/hvm/emulate.c        |  192 +++++++
>  xen/arch/x86/inat-tables.c        | 1130 +++++++++++++++++++++++++++++++++++++
>  xen/arch/x86/inat.c               |   96 ++++
>  xen/arch/x86/insn.c               |  576 +++++++++++++++++++
>  xen/include/asm-x86/hvm/emulate.h |    5 +
>  xen/include/asm-x86/inat.h        |  221 ++++++++
>  xen/include/asm-x86/inat_types.h  |   29 +
>  xen/include/asm-x86/insn.h        |  199 +++++++
>  9 files changed, 2450 insertions(+)
>  create mode 100644 xen/arch/x86/inat-tables.c
>  create mode 100644 xen/arch/x86/inat.c
>  create mode 100644 xen/arch/x86/insn.c
>  create mode 100644 xen/include/asm-x86/inat.h
>  create mode 100644 xen/include/asm-x86/inat_types.h
>  create mode 100644 xen/include/asm-x86/insn.h
>
> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
> index 6c90b1b..30829a0 100644
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -59,6 +59,8 @@ obj-y += crash.o
>  obj-y += tboot.o
>  obj-y += hpet.o
>  obj-y += xstate.o
> +obj-y += insn.o
> +obj-y += inat.o
>  
>  obj-$(crash_debug) += gdbstub.o
>  
> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
> index eac159f..1dc8c67 100644
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -21,6 +21,7 @@
>  #include <asm/hvm/hvm.h>
>  #include <asm/hvm/trace.h>
>  #include <asm/hvm/support.h>
> +#include <asm/insn.h>
>  
>  static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
>  {
> @@ -688,6 +689,17 @@ static int hvmemul_write(
>      return X86EMUL_OKAY;
>  }
>  
> +static int hvmemul_write_dummy(
> +    enum x86_segment __attribute__((unused)) seg,
> +    unsigned long __attribute__((unused)) offset,
> +    void __attribute__((unused)) *p_data,
> +    unsigned int __attribute__((unused)) bytes,
> +    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)
> +{
> +    /* discarding the write */
> +    return X86EMUL_OKAY;
> +}
> +
>  static int hvmemul_cmpxchg(
>      enum x86_segment seg,
>      unsigned long offset,
> @@ -1239,6 +1251,139 @@ int hvm_emulate_one(
>      return X86EMUL_OKAY;
>  }
>  
> +int hvm_emulate_one_no_write(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +{
> +    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
> +    struct vcpu *curr = current;
> +    uint32_t new_intr_shadow, pfec = PFEC_page_present;
> +    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
> +    struct x86_emulate_ops local_ops = hvm_emulate_ops;
> +    unsigned long addr;
> +    int rc;
> +
> +    if ( hvm_long_mode_enabled(curr) &&
> +         hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
> +    {
> +        hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->ctxt.sp_size = 64;
> +    }
> +    else
> +    {
> +        hvmemul_ctxt->ctxt.addr_size =
> +            hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.db ? 32 : 16;
> +        hvmemul_ctxt->ctxt.sp_size =
> +            hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ? 32 : 16;
> +    }
> +
> +    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    hvmemul_ctxt->insn_buf_eip = regs->eip;
> +    if ( !vio->mmio_insn_bytes )
> +    {
> +        hvmemul_ctxt->insn_buf_bytes =
> +            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
> +            (hvm_virtual_to_linear_addr(x86_seg_cs,
> +                                        &hvmemul_ctxt->seg_reg[x86_seg_cs],
> +                                        regs->eip,
> +                                        sizeof(hvmemul_ctxt->insn_buf),
> +                                        hvm_access_insn_fetch,
> +                                        hvmemul_ctxt->ctxt.addr_size,
> +                                        &addr) &&
> +             hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr,
> +                                               sizeof(hvmemul_ctxt->insn_buf),
> +                                               pfec) == HVMCOPY_okay) ?
> +            sizeof(hvmemul_ctxt->insn_buf) : 0;
> +    }
> +    else
> +    {
> +        hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes;
> +        memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes);
> +    }
> +
> +    hvmemul_ctxt->exn_pending = 0;
> +    vio->mmio_retrying = vio->mmio_retry;
> +    vio->mmio_retry = 0;
> +
> +    local_ops.write = hvmemul_write_dummy;
> +    rc = x86_emulate(&hvmemul_ctxt->ctxt, &local_ops);
> +
> +    if ( rc == X86EMUL_OKAY && vio->mmio_retry )
> +        rc = X86EMUL_RETRY;
> +    if ( rc != X86EMUL_RETRY )
> +    {
> +        vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
> +        vio->mmio_insn_bytes = 0;
> +    }
> +    else
> +    {
> +        BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
> +        vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
> +        memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
> +    }
> +
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    new_intr_shadow = hvmemul_ctxt->intr_shadow;
> +
> +    /* MOV-SS instruction toggles MOV-SS shadow, else we just clear it. */
> +    if ( hvmemul_ctxt->ctxt.retire.flags.mov_ss )
> +        new_intr_shadow ^= HVM_INTR_SHADOW_MOV_SS;
> +    else
> +        new_intr_shadow &= ~HVM_INTR_SHADOW_MOV_SS;
> +
> +    /* STI instruction toggles STI shadow, else we just clear it. */
> +    if ( hvmemul_ctxt->ctxt.retire.flags.sti )
> +        new_intr_shadow ^= HVM_INTR_SHADOW_STI;
> +    else
> +        new_intr_shadow &= ~HVM_INTR_SHADOW_STI;
> +
> +    if ( hvmemul_ctxt->intr_shadow != new_intr_shadow )
> +    {
> +        hvmemul_ctxt->intr_shadow = new_intr_shadow;
> +        hvm_funcs.set_interrupt_shadow(curr, new_intr_shadow);
> +    }
> +
> +    if ( hvmemul_ctxt->ctxt.retire.flags.hlt &&
> +         !hvm_local_events_need_delivery(curr) )
> +    {
> +        hvm_hlt(regs->eflags);
> +    }
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +void hvm_emulate_one_full(bool_t nowrite)
> +{
> +    struct hvm_emulate_ctxt ctx[1] = {};
> +    int rc = X86EMUL_RETRY;
> +
> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
> +
> +    while ( rc == X86EMUL_RETRY )
> +    {
> +        if ( nowrite )
> +            rc = hvm_emulate_one_no_write(ctx);
> +        else
> +            rc = hvm_emulate_one(ctx);
> +    }
> +
> +    switch ( rc )
> +    {
> +    case X86EMUL_UNHANDLEABLE:
> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +        break;
> +    case X86EMUL_EXCEPTION:
> +        if ( ctx->exn_pending )
> +            hvm_inject_hw_exception(ctx->exn_vector, ctx->exn_error_code);
> +        /* fall through */
> +    default:
> +        hvm_emulate_writeback(ctx);
> +        break;
> +    }
> +}
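
For reference, hvm_emulate_one_full() is meant to be invoked from an
event-handling path once the decision about the faulting access has been
made; it loops on RETRY and injects #UD or the pending exception itself,
so the caller has nothing further to check. A minimal sketch of such a
caller follows (the function name and the deny_write flag are made up for
illustration only, they are not part of this patch):

static void example_handle_mem_access(bool_t deny_write)
{
    /*
     * nowrite == 1 selects hvm_emulate_one_no_write(), i.e. the dummy
     * write handler above, so the guest's write never reaches memory.
     */
    hvm_emulate_one_full(deny_write);
}
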
> +
>  void hvm_emulate_prepare(
>      struct hvm_emulate_ctxt *hvmemul_ctxt,
>      struct cpu_user_regs *regs)
> @@ -1278,6 +1423,53 @@ struct segment_register *hvmemul_get_seg_reg(
>      return &hvmemul_ctxt->seg_reg[seg];
>  }
>  
> +int hvm_get_insn_length(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +{
> +    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
> +    struct vcpu *curr = current;
> +    uint32_t pfec = PFEC_page_present;
> +    unsigned long addr;
> +    struct x86_emulate_ops local_ops = hvm_emulate_ops;
> +    struct insn insn;
> +
> +    local_ops.write = hvmemul_write_dummy;
> +
> +    if ( hvm_long_mode_enabled(curr) &&
> +        hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l )
> +        hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->ctxt.sp_size = 64;
> +    else
> +    {
> +        hvmemul_ctxt->ctxt.addr_size =
> +            hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.db ? 32 : 16;
> +        hvmemul_ctxt->ctxt.sp_size =
> +            hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ? 32 : 16;
> +    }
> +
> +    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    hvmemul_ctxt->insn_buf_eip = regs->eip;
> +    hvmemul_ctxt->insn_buf_bytes =
> +        hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)
> +        ? :
> +        (hvm_virtual_to_linear_addr(
> +            x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs],
> +            regs->eip, sizeof(hvmemul_ctxt->insn_buf),
> +            hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) &&
> +        !hvm_fetch_from_guest_virt_nofault(
> +            hvmemul_ctxt->insn_buf, addr,
> +            sizeof(hvmemul_ctxt->insn_buf), pfec))
> +    ? sizeof(hvmemul_ctxt->insn_buf) : 0;
> +
> +    hvmemul_ctxt->exn_pending = 0;
> +
> +    insn_init(&insn, hvmemul_ctxt->insn_buf, hvm_long_mode_enabled(curr));
> +    insn_get_length(&insn);
> +
> +    return insn.length;
> +}
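
As an illustration only (this wrapper is not part of the patch), the new
helper can be combined with hvm_emulate_prepare() and
guest_cpu_user_regs(), used exactly as in hvm_emulate_one_full() above,
to compute where the next instruction starts:

static unsigned long example_next_rip(void)
{
    struct hvm_emulate_ctxt ctxt;
    int len;

    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
    len = hvm_get_insn_length(&ctxt);

    /* Address of the instruction following the current one. */
    return ctxt.ctxt.regs->eip + len;
}
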
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/x86/inat-tables.c b/xen/arch/x86/inat-tables.c
> new file mode 100644
> index 0000000..39252c3
> --- /dev/null
> +++ b/xen/arch/x86/inat-tables.c
> @@ -0,0 +1,1130 @@
> +/* x86 opcode map generated from x86-opcode-map.txt */
> +/* Do not change this code. */
> +
> +/* Table: one byte opcode */
> +const insn_attr_t inat_primary_table[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_MODRM,
> +	[0x01] = INAT_MODRM,
> +	[0x02] = INAT_MODRM,
> +	[0x03] = INAT_MODRM,
> +	[0x04] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x05] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x08] = INAT_MODRM,
> +	[0x09] = INAT_MODRM,
> +	[0x0a] = INAT_MODRM,
> +	[0x0b] = INAT_MODRM,
> +	[0x0c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x0d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x0f] = INAT_MAKE_ESCAPE(1),
> +	[0x10] = INAT_MODRM,
> +	[0x11] = INAT_MODRM,
> +	[0x12] = INAT_MODRM,
> +	[0x13] = INAT_MODRM,
> +	[0x14] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x15] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x18] = INAT_MODRM,
> +	[0x19] = INAT_MODRM,
> +	[0x1a] = INAT_MODRM,
> +	[0x1b] = INAT_MODRM,
> +	[0x1c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x1d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x20] = INAT_MODRM,
> +	[0x21] = INAT_MODRM,
> +	[0x22] = INAT_MODRM,
> +	[0x23] = INAT_MODRM,
> +	[0x24] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x25] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x26] = INAT_MAKE_PREFIX(INAT_PFX_ES),
> +	[0x28] = INAT_MODRM,
> +	[0x29] = INAT_MODRM,
> +	[0x2a] = INAT_MODRM,
> +	[0x2b] = INAT_MODRM,
> +	[0x2c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x2d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x2e] = INAT_MAKE_PREFIX(INAT_PFX_CS),
> +	[0x30] = INAT_MODRM,
> +	[0x31] = INAT_MODRM,
> +	[0x32] = INAT_MODRM,
> +	[0x33] = INAT_MODRM,
> +	[0x34] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x35] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x36] = INAT_MAKE_PREFIX(INAT_PFX_SS),
> +	[0x38] = INAT_MODRM,
> +	[0x39] = INAT_MODRM,
> +	[0x3a] = INAT_MODRM,
> +	[0x3b] = INAT_MODRM,
> +	[0x3c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x3d] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0x3e] = INAT_MAKE_PREFIX(INAT_PFX_DS),
> +	[0x40] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x41] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x42] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x43] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x44] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x45] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x46] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x47] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x48] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x49] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4a] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4b] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4c] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4d] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4e] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x4f] = INAT_MAKE_PREFIX(INAT_PFX_REX),
> +	[0x50] = INAT_FORCE64,
> +	[0x51] = INAT_FORCE64,
> +	[0x52] = INAT_FORCE64,
> +	[0x53] = INAT_FORCE64,
> +	[0x54] = INAT_FORCE64,
> +	[0x55] = INAT_FORCE64,
> +	[0x56] = INAT_FORCE64,
> +	[0x57] = INAT_FORCE64,
> +	[0x58] = INAT_FORCE64,
> +	[0x59] = INAT_FORCE64,
> +	[0x5a] = INAT_FORCE64,
> +	[0x5b] = INAT_FORCE64,
> +	[0x5c] = INAT_FORCE64,
> +	[0x5d] = INAT_FORCE64,
> +	[0x5e] = INAT_FORCE64,
> +	[0x5f] = INAT_FORCE64,
> +	[0x62] = INAT_MODRM,
> +	[0x63] = INAT_MODRM | INAT_MODRM,
> +	[0x64] = INAT_MAKE_PREFIX(INAT_PFX_FS),
> +	[0x65] = INAT_MAKE_PREFIX(INAT_PFX_GS),
> +	[0x66] = INAT_MAKE_PREFIX(INAT_PFX_OPNDSZ),
> +	[0x67] = INAT_MAKE_PREFIX(INAT_PFX_ADDRSZ),
> +	[0x68] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x69] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM,
> +	[0x6a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0x6b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
> +	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x71] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x72] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x73] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x74] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x75] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x76] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x77] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x78] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x79] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7a] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7b] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7c] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7d] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7e] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x7f] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0x80] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
> +	[0x81] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM | INAT_MAKE_GROUP(1),
> +	[0x82] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
> +	[0x83] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(1),
> +	[0x84] = INAT_MODRM,
> +	[0x85] = INAT_MODRM,
> +	[0x86] = INAT_MODRM,
> +	[0x87] = INAT_MODRM,
> +	[0x88] = INAT_MODRM,
> +	[0x89] = INAT_MODRM,
> +	[0x8a] = INAT_MODRM,
> +	[0x8b] = INAT_MODRM,
> +	[0x8c] = INAT_MODRM,
> +	[0x8d] = INAT_MODRM,
> +	[0x8e] = INAT_MODRM,
> +	[0x8f] = INAT_MAKE_GROUP(2) | INAT_MODRM | INAT_FORCE64,
> +	[0x9a] = INAT_MAKE_IMM(INAT_IMM_PTR),
> +	[0x9c] = INAT_FORCE64,
> +	[0x9d] = INAT_FORCE64,
> +	[0xa0] = INAT_MOFFSET,
> +	[0xa1] = INAT_MOFFSET,
> +	[0xa2] = INAT_MOFFSET,
> +	[0xa3] = INAT_MOFFSET,
> +	[0xa8] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xa9] = INAT_MAKE_IMM(INAT_IMM_VWORD32),
> +	[0xb0] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb1] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb2] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb3] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb6] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb7] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xb8] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xb9] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xba] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xbb] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xbc] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xbd] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xbe] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xbf] = INAT_MAKE_IMM(INAT_IMM_VWORD),
> +	[0xc0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xc1] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xc2] = INAT_MAKE_IMM(INAT_IMM_WORD) | INAT_FORCE64,
> +	[0xc4] = INAT_MODRM | INAT_MAKE_PREFIX(INAT_PFX_VEX3),
> +	[0xc5] = INAT_MODRM | INAT_MAKE_PREFIX(INAT_PFX_VEX2),
> +	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(4),
> +	[0xc7] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM | INAT_MAKE_GROUP(4),
> +	[0xc8] = INAT_MAKE_IMM(INAT_IMM_WORD) | INAT_SCNDIMM,
> +	[0xc9] = INAT_FORCE64,
> +	[0xca] = INAT_MAKE_IMM(INAT_IMM_WORD),
> +	[0xcd] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xd0] = INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xd1] = INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xd2] = INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xd3] = INAT_MODRM | INAT_MAKE_GROUP(3),
> +	[0xd4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xd5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xd8] = INAT_MODRM,
> +	[0xd9] = INAT_MODRM,
> +	[0xda] = INAT_MODRM,
> +	[0xdb] = INAT_MODRM,
> +	[0xdc] = INAT_MODRM,
> +	[0xdd] = INAT_MODRM,
> +	[0xde] = INAT_MODRM,
> +	[0xdf] = INAT_MODRM,
> +	[0xe0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0xe1] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0xe2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0xe3] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0xe4] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xe5] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xe6] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xe7] = INAT_MAKE_IMM(INAT_IMM_BYTE),
> +	[0xe8] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0xe9] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0xea] = INAT_MAKE_IMM(INAT_IMM_PTR),
> +	[0xeb] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_FORCE64,
> +	[0xf0] = INAT_MAKE_PREFIX(INAT_PFX_LOCK),
> +	[0xf2] = INAT_MAKE_PREFIX(INAT_PFX_REPNE),
> +	[0xf3] = INAT_MAKE_PREFIX(INAT_PFX_REPE),
> +	[0xf6] = INAT_MODRM | INAT_MAKE_GROUP(5),
> +	[0xf7] = INAT_MODRM | INAT_MAKE_GROUP(6),
> +	[0xfe] = INAT_MAKE_GROUP(7),
> +	[0xff] = INAT_MAKE_GROUP(8),
> +};
> +
> +/* Table: 2-byte opcode (0x0f) */
> +const insn_attr_t inat_escape_table_1[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_MAKE_GROUP(9),
> +	[0x01] = INAT_MAKE_GROUP(10),
> +	[0x02] = INAT_MODRM,
> +	[0x03] = INAT_MODRM,
> +	[0x0d] = INAT_MODRM | INAT_MAKE_GROUP(11),
> +	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
> +	[0x10] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x11] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x12] = INAT_MODRM | INAT_VEXOK | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x13] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x14] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x15] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x16] = INAT_MODRM | INAT_VEXOK | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x17] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x18] = INAT_MAKE_GROUP(12),
> +	[0x1f] = INAT_MODRM,
> +	[0x20] = INAT_MODRM,
> +	[0x21] = INAT_MODRM,
> +	[0x22] = INAT_MODRM,
> +	[0x23] = INAT_MODRM,
> +	[0x28] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x29] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x2a] = INAT_MODRM | INAT_VARIANT,
> +	[0x2b] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x2c] = INAT_MODRM | INAT_VARIANT,
> +	[0x2d] = INAT_MODRM | INAT_VARIANT,
> +	[0x2e] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x2f] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x38] = INAT_MAKE_ESCAPE(2),
> +	[0x3a] = INAT_MAKE_ESCAPE(3),
> +	[0x40] = INAT_MODRM,
> +	[0x41] = INAT_MODRM,
> +	[0x42] = INAT_MODRM,
> +	[0x43] = INAT_MODRM,
> +	[0x44] = INAT_MODRM,
> +	[0x45] = INAT_MODRM,
> +	[0x46] = INAT_MODRM,
> +	[0x47] = INAT_MODRM,
> +	[0x48] = INAT_MODRM,
> +	[0x49] = INAT_MODRM,
> +	[0x4a] = INAT_MODRM,
> +	[0x4b] = INAT_MODRM,
> +	[0x4c] = INAT_MODRM,
> +	[0x4d] = INAT_MODRM,
> +	[0x4e] = INAT_MODRM,
> +	[0x4f] = INAT_MODRM,
> +	[0x50] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x51] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x52] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x53] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x54] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x55] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x56] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x57] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x58] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x59] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5a] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5b] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5c] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5d] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5e] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x5f] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x60] = INAT_MODRM | INAT_VARIANT,
> +	[0x61] = INAT_MODRM | INAT_VARIANT,
> +	[0x62] = INAT_MODRM | INAT_VARIANT,
> +	[0x63] = INAT_MODRM | INAT_VARIANT,
> +	[0x64] = INAT_MODRM | INAT_VARIANT,
> +	[0x65] = INAT_MODRM | INAT_VARIANT,
> +	[0x66] = INAT_MODRM | INAT_VARIANT,
> +	[0x67] = INAT_MODRM | INAT_VARIANT,
> +	[0x68] = INAT_MODRM | INAT_VARIANT,
> +	[0x69] = INAT_MODRM | INAT_VARIANT,
> +	[0x6a] = INAT_MODRM | INAT_VARIANT,
> +	[0x6b] = INAT_MODRM | INAT_VARIANT,
> +	[0x6c] = INAT_VARIANT,
> +	[0x6d] = INAT_VARIANT,
> +	[0x6e] = INAT_MODRM | INAT_VARIANT,
> +	[0x6f] = INAT_MODRM | INAT_VARIANT,
> +	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x71] = INAT_MAKE_GROUP(13),
> +	[0x72] = INAT_MAKE_GROUP(14),
> +	[0x73] = INAT_MAKE_GROUP(15),
> +	[0x74] = INAT_MODRM | INAT_VARIANT,
> +	[0x75] = INAT_MODRM | INAT_VARIANT,
> +	[0x76] = INAT_MODRM | INAT_VARIANT,
> +	[0x77] = INAT_VEXOK | INAT_VEXOK,
> +	[0x78] = INAT_MODRM,
> +	[0x79] = INAT_MODRM,
> +	[0x7c] = INAT_VARIANT,
> +	[0x7d] = INAT_VARIANT,
> +	[0x7e] = INAT_MODRM | INAT_VARIANT,
> +	[0x7f] = INAT_MODRM | INAT_VARIANT,
> +	[0x80] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x81] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x82] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x83] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x84] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x85] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x86] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x87] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x88] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x89] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8a] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8b] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8c] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8d] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8e] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x8f] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_FORCE64,
> +	[0x90] = INAT_MODRM,
> +	[0x91] = INAT_MODRM,
> +	[0x92] = INAT_MODRM,
> +	[0x93] = INAT_MODRM,
> +	[0x94] = INAT_MODRM,
> +	[0x95] = INAT_MODRM,
> +	[0x96] = INAT_MODRM,
> +	[0x97] = INAT_MODRM,
> +	[0x98] = INAT_MODRM,
> +	[0x99] = INAT_MODRM,
> +	[0x9a] = INAT_MODRM,
> +	[0x9b] = INAT_MODRM,
> +	[0x9c] = INAT_MODRM,
> +	[0x9d] = INAT_MODRM,
> +	[0x9e] = INAT_MODRM,
> +	[0x9f] = INAT_MODRM,
> +	[0xa0] = INAT_FORCE64,
> +	[0xa1] = INAT_FORCE64,
> +	[0xa3] = INAT_MODRM,
> +	[0xa4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
> +	[0xa5] = INAT_MODRM,
> +	[0xa6] = INAT_MAKE_GROUP(16),
> +	[0xa7] = INAT_MAKE_GROUP(17),
> +	[0xa8] = INAT_FORCE64,
> +	[0xa9] = INAT_FORCE64,
> +	[0xab] = INAT_MODRM,
> +	[0xac] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
> +	[0xad] = INAT_MODRM,
> +	[0xae] = INAT_MAKE_GROUP(18),
> +	[0xaf] = INAT_MODRM,
> +	[0xb0] = INAT_MODRM,
> +	[0xb1] = INAT_MODRM,
> +	[0xb2] = INAT_MODRM,
> +	[0xb3] = INAT_MODRM,
> +	[0xb4] = INAT_MODRM,
> +	[0xb5] = INAT_MODRM,
> +	[0xb6] = INAT_MODRM,
> +	[0xb7] = INAT_MODRM,
> +	[0xb8] = INAT_VARIANT,
> +	[0xb9] = INAT_MAKE_GROUP(19),
> +	[0xba] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_MAKE_GROUP(20),
> +	[0xbb] = INAT_MODRM,
> +	[0xbc] = INAT_MODRM | INAT_VARIANT,
> +	[0xbd] = INAT_MODRM | INAT_VARIANT,
> +	[0xbe] = INAT_MODRM,
> +	[0xbf] = INAT_MODRM,
> +	[0xc0] = INAT_MODRM,
> +	[0xc1] = INAT_MODRM,
> +	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0xc3] = INAT_MODRM,
> +	[0xc4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0xc5] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0xc7] = INAT_MAKE_GROUP(21),
> +	[0xd0] = INAT_VARIANT,
> +	[0xd1] = INAT_MODRM | INAT_VARIANT,
> +	[0xd2] = INAT_MODRM | INAT_VARIANT,
> +	[0xd3] = INAT_MODRM | INAT_VARIANT,
> +	[0xd4] = INAT_MODRM | INAT_VARIANT,
> +	[0xd5] = INAT_MODRM | INAT_VARIANT,
> +	[0xd6] = INAT_VARIANT,
> +	[0xd7] = INAT_MODRM | INAT_VARIANT,
> +	[0xd8] = INAT_MODRM | INAT_VARIANT,
> +	[0xd9] = INAT_MODRM | INAT_VARIANT,
> +	[0xda] = INAT_MODRM | INAT_VARIANT,
> +	[0xdb] = INAT_MODRM | INAT_VARIANT,
> +	[0xdc] = INAT_MODRM | INAT_VARIANT,
> +	[0xdd] = INAT_MODRM | INAT_VARIANT,
> +	[0xde] = INAT_MODRM | INAT_VARIANT,
> +	[0xdf] = INAT_MODRM | INAT_VARIANT,
> +	[0xe0] = INAT_MODRM | INAT_VARIANT,
> +	[0xe1] = INAT_MODRM | INAT_VARIANT,
> +	[0xe2] = INAT_MODRM | INAT_VARIANT,
> +	[0xe3] = INAT_MODRM | INAT_VARIANT,
> +	[0xe4] = INAT_MODRM | INAT_VARIANT,
> +	[0xe5] = INAT_MODRM | INAT_VARIANT,
> +	[0xe6] = INAT_VARIANT,
> +	[0xe7] = INAT_MODRM | INAT_VARIANT,
> +	[0xe8] = INAT_MODRM | INAT_VARIANT,
> +	[0xe9] = INAT_MODRM | INAT_VARIANT,
> +	[0xea] = INAT_MODRM | INAT_VARIANT,
> +	[0xeb] = INAT_MODRM | INAT_VARIANT,
> +	[0xec] = INAT_MODRM | INAT_VARIANT,
> +	[0xed] = INAT_MODRM | INAT_VARIANT,
> +	[0xee] = INAT_MODRM | INAT_VARIANT,
> +	[0xef] = INAT_MODRM | INAT_VARIANT,
> +	[0xf0] = INAT_VARIANT,
> +	[0xf1] = INAT_MODRM | INAT_VARIANT,
> +	[0xf2] = INAT_MODRM | INAT_VARIANT,
> +	[0xf3] = INAT_MODRM | INAT_VARIANT,
> +	[0xf4] = INAT_MODRM | INAT_VARIANT,
> +	[0xf5] = INAT_MODRM | INAT_VARIANT,
> +	[0xf6] = INAT_MODRM | INAT_VARIANT,
> +	[0xf7] = INAT_MODRM | INAT_VARIANT,
> +	[0xf8] = INAT_MODRM | INAT_VARIANT,
> +	[0xf9] = INAT_MODRM | INAT_VARIANT,
> +	[0xfa] = INAT_MODRM | INAT_VARIANT,
> +	[0xfb] = INAT_MODRM | INAT_VARIANT,
> +	[0xfc] = INAT_MODRM | INAT_VARIANT,
> +	[0xfd] = INAT_MODRM | INAT_VARIANT,
> +	[0xfe] = INAT_MODRM | INAT_VARIANT,
> +};
> +const insn_attr_t inat_escape_table_1_1[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x10] = INAT_MODRM | INAT_VEXOK,
> +	[0x11] = INAT_MODRM | INAT_VEXOK,
> +	[0x12] = INAT_MODRM | INAT_VEXOK,
> +	[0x13] = INAT_MODRM | INAT_VEXOK,
> +	[0x14] = INAT_MODRM | INAT_VEXOK,
> +	[0x15] = INAT_MODRM | INAT_VEXOK,
> +	[0x16] = INAT_MODRM | INAT_VEXOK,
> +	[0x17] = INAT_MODRM | INAT_VEXOK,
> +	[0x28] = INAT_MODRM | INAT_VEXOK,
> +	[0x29] = INAT_MODRM | INAT_VEXOK,
> +	[0x2a] = INAT_MODRM,
> +	[0x2b] = INAT_MODRM | INAT_VEXOK,
> +	[0x2c] = INAT_MODRM,
> +	[0x2d] = INAT_MODRM,
> +	[0x2e] = INAT_MODRM | INAT_VEXOK,
> +	[0x2f] = INAT_MODRM | INAT_VEXOK,
> +	[0x50] = INAT_MODRM | INAT_VEXOK,
> +	[0x51] = INAT_MODRM | INAT_VEXOK,
> +	[0x54] = INAT_MODRM | INAT_VEXOK,
> +	[0x55] = INAT_MODRM | INAT_VEXOK,
> +	[0x56] = INAT_MODRM | INAT_VEXOK,
> +	[0x57] = INAT_MODRM | INAT_VEXOK,
> +	[0x58] = INAT_MODRM | INAT_VEXOK,
> +	[0x59] = INAT_MODRM | INAT_VEXOK,
> +	[0x5a] = INAT_MODRM | INAT_VEXOK,
> +	[0x5b] = INAT_MODRM | INAT_VEXOK,
> +	[0x5c] = INAT_MODRM | INAT_VEXOK,
> +	[0x5d] = INAT_MODRM | INAT_VEXOK,
> +	[0x5e] = INAT_MODRM | INAT_VEXOK,
> +	[0x5f] = INAT_MODRM | INAT_VEXOK,
> +	[0x60] = INAT_MODRM | INAT_VEXOK,
> +	[0x61] = INAT_MODRM | INAT_VEXOK,
> +	[0x62] = INAT_MODRM | INAT_VEXOK,
> +	[0x63] = INAT_MODRM | INAT_VEXOK,
> +	[0x64] = INAT_MODRM | INAT_VEXOK,
> +	[0x65] = INAT_MODRM | INAT_VEXOK,
> +	[0x66] = INAT_MODRM | INAT_VEXOK,
> +	[0x67] = INAT_MODRM | INAT_VEXOK,
> +	[0x68] = INAT_MODRM | INAT_VEXOK,
> +	[0x69] = INAT_MODRM | INAT_VEXOK,
> +	[0x6a] = INAT_MODRM | INAT_VEXOK,
> +	[0x6b] = INAT_MODRM | INAT_VEXOK,
> +	[0x6c] = INAT_MODRM | INAT_VEXOK,
> +	[0x6d] = INAT_MODRM | INAT_VEXOK,
> +	[0x6e] = INAT_MODRM | INAT_VEXOK,
> +	[0x6f] = INAT_MODRM | INAT_VEXOK,
> +	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x74] = INAT_MODRM | INAT_VEXOK,
> +	[0x75] = INAT_MODRM | INAT_VEXOK,
> +	[0x76] = INAT_MODRM | INAT_VEXOK,
> +	[0x7c] = INAT_MODRM | INAT_VEXOK,
> +	[0x7d] = INAT_MODRM | INAT_VEXOK,
> +	[0x7e] = INAT_MODRM | INAT_VEXOK,
> +	[0x7f] = INAT_MODRM | INAT_VEXOK,
> +	[0xbc] = INAT_MODRM,
> +	[0xbd] = INAT_MODRM,
> +	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xc4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xc5] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xc6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xd0] = INAT_MODRM | INAT_VEXOK,
> +	[0xd1] = INAT_MODRM | INAT_VEXOK,
> +	[0xd2] = INAT_MODRM | INAT_VEXOK,
> +	[0xd3] = INAT_MODRM | INAT_VEXOK,
> +	[0xd4] = INAT_MODRM | INAT_VEXOK,
> +	[0xd5] = INAT_MODRM | INAT_VEXOK,
> +	[0xd6] = INAT_MODRM | INAT_VEXOK,
> +	[0xd7] = INAT_MODRM | INAT_VEXOK,
> +	[0xd8] = INAT_MODRM | INAT_VEXOK,
> +	[0xd9] = INAT_MODRM | INAT_VEXOK,
> +	[0xda] = INAT_MODRM | INAT_VEXOK,
> +	[0xdb] = INAT_MODRM | INAT_VEXOK,
> +	[0xdc] = INAT_MODRM | INAT_VEXOK,
> +	[0xdd] = INAT_MODRM | INAT_VEXOK,
> +	[0xde] = INAT_MODRM | INAT_VEXOK,
> +	[0xdf] = INAT_MODRM | INAT_VEXOK,
> +	[0xe0] = INAT_MODRM | INAT_VEXOK,
> +	[0xe1] = INAT_MODRM | INAT_VEXOK,
> +	[0xe2] = INAT_MODRM | INAT_VEXOK,
> +	[0xe3] = INAT_MODRM | INAT_VEXOK,
> +	[0xe4] = INAT_MODRM | INAT_VEXOK,
> +	[0xe5] = INAT_MODRM | INAT_VEXOK,
> +	[0xe6] = INAT_MODRM | INAT_VEXOK,
> +	[0xe7] = INAT_MODRM | INAT_VEXOK,
> +	[0xe8] = INAT_MODRM | INAT_VEXOK,
> +	[0xe9] = INAT_MODRM | INAT_VEXOK,
> +	[0xea] = INAT_MODRM | INAT_VEXOK,
> +	[0xeb] = INAT_MODRM | INAT_VEXOK,
> +	[0xec] = INAT_MODRM | INAT_VEXOK,
> +	[0xed] = INAT_MODRM | INAT_VEXOK,
> +	[0xee] = INAT_MODRM | INAT_VEXOK,
> +	[0xef] = INAT_MODRM | INAT_VEXOK,
> +	[0xf1] = INAT_MODRM | INAT_VEXOK,
> +	[0xf2] = INAT_MODRM | INAT_VEXOK,
> +	[0xf3] = INAT_MODRM | INAT_VEXOK,
> +	[0xf4] = INAT_MODRM | INAT_VEXOK,
> +	[0xf5] = INAT_MODRM | INAT_VEXOK,
> +	[0xf6] = INAT_MODRM | INAT_VEXOK,
> +	[0xf7] = INAT_MODRM | INAT_VEXOK,
> +	[0xf8] = INAT_MODRM | INAT_VEXOK,
> +	[0xf9] = INAT_MODRM | INAT_VEXOK,
> +	[0xfa] = INAT_MODRM | INAT_VEXOK,
> +	[0xfb] = INAT_MODRM | INAT_VEXOK,
> +	[0xfc] = INAT_MODRM | INAT_VEXOK,
> +	[0xfd] = INAT_MODRM | INAT_VEXOK,
> +	[0xfe] = INAT_MODRM | INAT_VEXOK,
> +};
> +const insn_attr_t inat_escape_table_1_2[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x10] = INAT_MODRM | INAT_VEXOK,
> +	[0x11] = INAT_MODRM | INAT_VEXOK,
> +	[0x12] = INAT_MODRM | INAT_VEXOK,
> +	[0x16] = INAT_MODRM | INAT_VEXOK,
> +	[0x2a] = INAT_MODRM | INAT_VEXOK,
> +	[0x2c] = INAT_MODRM | INAT_VEXOK,
> +	[0x2d] = INAT_MODRM | INAT_VEXOK,
> +	[0x51] = INAT_MODRM | INAT_VEXOK,
> +	[0x52] = INAT_MODRM | INAT_VEXOK,
> +	[0x53] = INAT_MODRM | INAT_VEXOK,
> +	[0x58] = INAT_MODRM | INAT_VEXOK,
> +	[0x59] = INAT_MODRM | INAT_VEXOK,
> +	[0x5a] = INAT_MODRM | INAT_VEXOK,
> +	[0x5b] = INAT_MODRM | INAT_VEXOK,
> +	[0x5c] = INAT_MODRM | INAT_VEXOK,
> +	[0x5d] = INAT_MODRM | INAT_VEXOK,
> +	[0x5e] = INAT_MODRM | INAT_VEXOK,
> +	[0x5f] = INAT_MODRM | INAT_VEXOK,
> +	[0x6f] = INAT_MODRM | INAT_VEXOK,
> +	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x7e] = INAT_MODRM | INAT_VEXOK,
> +	[0x7f] = INAT_MODRM | INAT_VEXOK,
> +	[0xb8] = INAT_MODRM,
> +	[0xbc] = INAT_MODRM,
> +	[0xbd] = INAT_MODRM,
> +	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xd6] = INAT_MODRM,
> +	[0xe6] = INAT_MODRM | INAT_VEXOK,
> +};
> +const insn_attr_t inat_escape_table_1_3[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x10] = INAT_MODRM | INAT_VEXOK,
> +	[0x11] = INAT_MODRM | INAT_VEXOK,
> +	[0x12] = INAT_MODRM | INAT_VEXOK,
> +	[0x2a] = INAT_MODRM | INAT_VEXOK,
> +	[0x2c] = INAT_MODRM | INAT_VEXOK,
> +	[0x2d] = INAT_MODRM | INAT_VEXOK,
> +	[0x51] = INAT_MODRM | INAT_VEXOK,
> +	[0x58] = INAT_MODRM | INAT_VEXOK,
> +	[0x59] = INAT_MODRM | INAT_VEXOK,
> +	[0x5a] = INAT_MODRM | INAT_VEXOK,
> +	[0x5c] = INAT_MODRM | INAT_VEXOK,
> +	[0x5d] = INAT_MODRM | INAT_VEXOK,
> +	[0x5e] = INAT_MODRM | INAT_VEXOK,
> +	[0x5f] = INAT_MODRM | INAT_VEXOK,
> +	[0x70] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x7c] = INAT_MODRM | INAT_VEXOK,
> +	[0x7d] = INAT_MODRM | INAT_VEXOK,
> +	[0xbc] = INAT_MODRM,
> +	[0xbd] = INAT_MODRM,
> +	[0xc2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xd0] = INAT_MODRM | INAT_VEXOK,
> +	[0xd6] = INAT_MODRM,
> +	[0xe6] = INAT_MODRM | INAT_VEXOK,
> +	[0xf0] = INAT_MODRM | INAT_VEXOK,
> +};
> +
> +/* Table: 3-byte opcode 1 (0x0f 0x38) */
> +const insn_attr_t inat_escape_table_2[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_MODRM | INAT_VARIANT,
> +	[0x01] = INAT_MODRM | INAT_VARIANT,
> +	[0x02] = INAT_MODRM | INAT_VARIANT,
> +	[0x03] = INAT_MODRM | INAT_VARIANT,
> +	[0x04] = INAT_MODRM | INAT_VARIANT,
> +	[0x05] = INAT_MODRM | INAT_VARIANT,
> +	[0x06] = INAT_MODRM | INAT_VARIANT,
> +	[0x07] = INAT_MODRM | INAT_VARIANT,
> +	[0x08] = INAT_MODRM | INAT_VARIANT,
> +	[0x09] = INAT_MODRM | INAT_VARIANT,
> +	[0x0a] = INAT_MODRM | INAT_VARIANT,
> +	[0x0b] = INAT_MODRM | INAT_VARIANT,
> +	[0x0c] = INAT_VARIANT,
> +	[0x0d] = INAT_VARIANT,
> +	[0x0e] = INAT_VARIANT,
> +	[0x0f] = INAT_VARIANT,
> +	[0x10] = INAT_VARIANT,
> +	[0x13] = INAT_VARIANT,
> +	[0x14] = INAT_VARIANT,
> +	[0x15] = INAT_VARIANT,
> +	[0x16] = INAT_VARIANT,
> +	[0x17] = INAT_VARIANT,
> +	[0x18] = INAT_VARIANT,
> +	[0x19] = INAT_VARIANT,
> +	[0x1a] = INAT_VARIANT,
> +	[0x1c] = INAT_MODRM | INAT_VARIANT,
> +	[0x1d] = INAT_MODRM | INAT_VARIANT,
> +	[0x1e] = INAT_MODRM | INAT_VARIANT,
> +	[0x20] = INAT_VARIANT,
> +	[0x21] = INAT_VARIANT,
> +	[0x22] = INAT_VARIANT,
> +	[0x23] = INAT_VARIANT,
> +	[0x24] = INAT_VARIANT,
> +	[0x25] = INAT_VARIANT,
> +	[0x28] = INAT_VARIANT,
> +	[0x29] = INAT_VARIANT,
> +	[0x2a] = INAT_VARIANT,
> +	[0x2b] = INAT_VARIANT,
> +	[0x2c] = INAT_VARIANT,
> +	[0x2d] = INAT_VARIANT,
> +	[0x2e] = INAT_VARIANT,
> +	[0x2f] = INAT_VARIANT,
> +	[0x30] = INAT_VARIANT,
> +	[0x31] = INAT_VARIANT,
> +	[0x32] = INAT_VARIANT,
> +	[0x33] = INAT_VARIANT,
> +	[0x34] = INAT_VARIANT,
> +	[0x35] = INAT_VARIANT,
> +	[0x36] = INAT_VARIANT,
> +	[0x37] = INAT_VARIANT,
> +	[0x38] = INAT_VARIANT,
> +	[0x39] = INAT_VARIANT,
> +	[0x3a] = INAT_VARIANT,
> +	[0x3b] = INAT_VARIANT,
> +	[0x3c] = INAT_VARIANT,
> +	[0x3d] = INAT_VARIANT,
> +	[0x3e] = INAT_VARIANT,
> +	[0x3f] = INAT_VARIANT,
> +	[0x40] = INAT_VARIANT,
> +	[0x41] = INAT_VARIANT,
> +	[0x45] = INAT_VARIANT,
> +	[0x46] = INAT_VARIANT,
> +	[0x47] = INAT_VARIANT,
> +	[0x58] = INAT_VARIANT,
> +	[0x59] = INAT_VARIANT,
> +	[0x5a] = INAT_VARIANT,
> +	[0x78] = INAT_VARIANT,
> +	[0x79] = INAT_VARIANT,
> +	[0x80] = INAT_VARIANT,
> +	[0x81] = INAT_VARIANT,
> +	[0x82] = INAT_VARIANT,
> +	[0x8c] = INAT_VARIANT,
> +	[0x8e] = INAT_VARIANT,
> +	[0x90] = INAT_VARIANT,
> +	[0x91] = INAT_VARIANT,
> +	[0x92] = INAT_VARIANT,
> +	[0x93] = INAT_VARIANT,
> +	[0x96] = INAT_VARIANT,
> +	[0x97] = INAT_VARIANT,
> +	[0x98] = INAT_VARIANT,
> +	[0x99] = INAT_VARIANT,
> +	[0x9a] = INAT_VARIANT,
> +	[0x9b] = INAT_VARIANT,
> +	[0x9c] = INAT_VARIANT,
> +	[0x9d] = INAT_VARIANT,
> +	[0x9e] = INAT_VARIANT,
> +	[0x9f] = INAT_VARIANT,
> +	[0xa6] = INAT_VARIANT,
> +	[0xa7] = INAT_VARIANT,
> +	[0xa8] = INAT_VARIANT,
> +	[0xa9] = INAT_VARIANT,
> +	[0xaa] = INAT_VARIANT,
> +	[0xab] = INAT_VARIANT,
> +	[0xac] = INAT_VARIANT,
> +	[0xad] = INAT_VARIANT,
> +	[0xae] = INAT_VARIANT,
> +	[0xaf] = INAT_VARIANT,
> +	[0xb6] = INAT_VARIANT,
> +	[0xb7] = INAT_VARIANT,
> +	[0xb8] = INAT_VARIANT,
> +	[0xb9] = INAT_VARIANT,
> +	[0xba] = INAT_VARIANT,
> +	[0xbb] = INAT_VARIANT,
> +	[0xbc] = INAT_VARIANT,
> +	[0xbd] = INAT_VARIANT,
> +	[0xbe] = INAT_VARIANT,
> +	[0xbf] = INAT_VARIANT,
> +	[0xdb] = INAT_VARIANT,
> +	[0xdc] = INAT_VARIANT,
> +	[0xdd] = INAT_VARIANT,
> +	[0xde] = INAT_VARIANT,
> +	[0xdf] = INAT_VARIANT,
> +	[0xf0] = INAT_MODRM | INAT_VARIANT,
> +	[0xf1] = INAT_MODRM | INAT_VARIANT,
> +	[0xf2] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xf3] = INAT_MAKE_GROUP(22),
> +	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY | INAT_VARIANT,
> +	[0xf6] = INAT_VARIANT,
> +	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY | INAT_VARIANT,
> +};
> +const insn_attr_t inat_escape_table_2_1[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_MODRM | INAT_VEXOK,
> +	[0x01] = INAT_MODRM | INAT_VEXOK,
> +	[0x02] = INAT_MODRM | INAT_VEXOK,
> +	[0x03] = INAT_MODRM | INAT_VEXOK,
> +	[0x04] = INAT_MODRM | INAT_VEXOK,
> +	[0x05] = INAT_MODRM | INAT_VEXOK,
> +	[0x06] = INAT_MODRM | INAT_VEXOK,
> +	[0x07] = INAT_MODRM | INAT_VEXOK,
> +	[0x08] = INAT_MODRM | INAT_VEXOK,
> +	[0x09] = INAT_MODRM | INAT_VEXOK,
> +	[0x0a] = INAT_MODRM | INAT_VEXOK,
> +	[0x0b] = INAT_MODRM | INAT_VEXOK,
> +	[0x0c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x0d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x0e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x0f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x10] = INAT_MODRM,
> +	[0x13] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x14] = INAT_MODRM,
> +	[0x15] = INAT_MODRM,
> +	[0x16] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x17] = INAT_MODRM | INAT_VEXOK,
> +	[0x18] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x19] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x1a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x1c] = INAT_MODRM | INAT_VEXOK,
> +	[0x1d] = INAT_MODRM | INAT_VEXOK,
> +	[0x1e] = INAT_MODRM | INAT_VEXOK,
> +	[0x20] = INAT_MODRM | INAT_VEXOK,
> +	[0x21] = INAT_MODRM | INAT_VEXOK,
> +	[0x22] = INAT_MODRM | INAT_VEXOK,
> +	[0x23] = INAT_MODRM | INAT_VEXOK,
> +	[0x24] = INAT_MODRM | INAT_VEXOK,
> +	[0x25] = INAT_MODRM | INAT_VEXOK,
> +	[0x28] = INAT_MODRM | INAT_VEXOK,
> +	[0x29] = INAT_MODRM | INAT_VEXOK,
> +	[0x2a] = INAT_MODRM | INAT_VEXOK,
> +	[0x2b] = INAT_MODRM | INAT_VEXOK,
> +	[0x2c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x2d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x2e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x2f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x30] = INAT_MODRM | INAT_VEXOK,
> +	[0x31] = INAT_MODRM | INAT_VEXOK,
> +	[0x32] = INAT_MODRM | INAT_VEXOK,
> +	[0x33] = INAT_MODRM | INAT_VEXOK,
> +	[0x34] = INAT_MODRM | INAT_VEXOK,
> +	[0x35] = INAT_MODRM | INAT_VEXOK,
> +	[0x36] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x37] = INAT_MODRM | INAT_VEXOK,
> +	[0x38] = INAT_MODRM | INAT_VEXOK,
> +	[0x39] = INAT_MODRM | INAT_VEXOK,
> +	[0x3a] = INAT_MODRM | INAT_VEXOK,
> +	[0x3b] = INAT_MODRM | INAT_VEXOK,
> +	[0x3c] = INAT_MODRM | INAT_VEXOK,
> +	[0x3d] = INAT_MODRM | INAT_VEXOK,
> +	[0x3e] = INAT_MODRM | INAT_VEXOK,
> +	[0x3f] = INAT_MODRM | INAT_VEXOK,
> +	[0x40] = INAT_MODRM | INAT_VEXOK,
> +	[0x41] = INAT_MODRM | INAT_VEXOK,
> +	[0x45] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x46] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x47] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x58] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x59] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x5a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x78] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x79] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x80] = INAT_MODRM,
> +	[0x81] = INAT_MODRM,
> +	[0x82] = INAT_MODRM,
> +	[0x8c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x8e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x90] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x91] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x92] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x93] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x96] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x97] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x98] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x99] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9a] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9b] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9c] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9d] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9e] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x9f] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xa6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xa7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xa8] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xa9] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xaa] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xab] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xac] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xad] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xae] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xaf] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xb6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xb7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xb8] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xb9] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xba] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xbb] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xbc] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xbd] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xbe] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xbf] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xdb] = INAT_MODRM | INAT_VEXOK,
> +	[0xdc] = INAT_MODRM | INAT_VEXOK,
> +	[0xdd] = INAT_MODRM | INAT_VEXOK,
> +	[0xde] = INAT_MODRM | INAT_VEXOK,
> +	[0xdf] = INAT_MODRM | INAT_VEXOK,
> +	[0xf0] = INAT_MODRM,
> +	[0xf1] = INAT_MODRM,
> +	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +};
> +const insn_attr_t inat_escape_table_2_2[INAT_OPCODE_TABLE_SIZE] = {
> +	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +};
> +const insn_attr_t inat_escape_table_2_3[INAT_OPCODE_TABLE_SIZE] = {
> +	[0xf0] = INAT_MODRM,
> +	[0xf1] = INAT_MODRM,
> +	[0xf5] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xf6] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0xf7] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +};
> +
> +/* Table: 3-byte opcode 2 (0x0f 0x3a) */
> +const insn_attr_t inat_escape_table_3[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_VARIANT,
> +	[0x01] = INAT_VARIANT,
> +	[0x02] = INAT_VARIANT,
> +	[0x04] = INAT_VARIANT,
> +	[0x05] = INAT_VARIANT,
> +	[0x06] = INAT_VARIANT,
> +	[0x08] = INAT_VARIANT,
> +	[0x09] = INAT_VARIANT,
> +	[0x0a] = INAT_VARIANT,
> +	[0x0b] = INAT_VARIANT,
> +	[0x0c] = INAT_VARIANT,
> +	[0x0d] = INAT_VARIANT,
> +	[0x0e] = INAT_VARIANT,
> +	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x14] = INAT_VARIANT,
> +	[0x15] = INAT_VARIANT,
> +	[0x16] = INAT_VARIANT,
> +	[0x17] = INAT_VARIANT,
> +	[0x18] = INAT_VARIANT,
> +	[0x19] = INAT_VARIANT,
> +	[0x1d] = INAT_VARIANT,
> +	[0x20] = INAT_VARIANT,
> +	[0x21] = INAT_VARIANT,
> +	[0x22] = INAT_VARIANT,
> +	[0x38] = INAT_VARIANT,
> +	[0x39] = INAT_VARIANT,
> +	[0x40] = INAT_VARIANT,
> +	[0x41] = INAT_VARIANT,
> +	[0x42] = INAT_VARIANT,
> +	[0x44] = INAT_VARIANT,
> +	[0x46] = INAT_VARIANT,
> +	[0x4a] = INAT_VARIANT,
> +	[0x4b] = INAT_VARIANT,
> +	[0x4c] = INAT_VARIANT,
> +	[0x60] = INAT_VARIANT,
> +	[0x61] = INAT_VARIANT,
> +	[0x62] = INAT_VARIANT,
> +	[0x63] = INAT_VARIANT,
> +	[0xdf] = INAT_VARIANT,
> +	[0xf0] = INAT_VARIANT,
> +};
> +const insn_attr_t inat_escape_table_3_1[INAT_OPCODE_TABLE_SIZE] = {
> +	[0x00] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x01] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x02] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x04] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x05] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x06] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x08] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x09] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0c] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0d] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0e] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x0f] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x14] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x15] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x16] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x17] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x18] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x19] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x1d] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x20] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x21] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x22] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x38] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x39] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x40] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x41] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x42] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x44] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x46] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x4a] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x4b] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x4c] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x60] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x61] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x62] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x63] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0xdf] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +};
> +const insn_attr_t inat_escape_table_3_3[INAT_OPCODE_TABLE_SIZE] = {
> +	[0xf0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +};
> +
> +/* GrpTable: Grp1 */
> +
> +/* GrpTable: Grp1A */
> +
> +/* GrpTable: Grp2 */
> +
> +/* GrpTable: Grp3_1 */
> +const insn_attr_t inat_group_table_5[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +	[0x4] = INAT_MODRM,
> +	[0x5] = INAT_MODRM,
> +	[0x6] = INAT_MODRM,
> +	[0x7] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp3_2 */
> +const insn_attr_t inat_group_table_6[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MAKE_IMM(INAT_IMM_VWORD32) | INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +	[0x4] = INAT_MODRM,
> +	[0x5] = INAT_MODRM,
> +	[0x6] = INAT_MODRM,
> +	[0x7] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp4 */
> +const insn_attr_t inat_group_table_7[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp5 */
> +const insn_attr_t inat_group_table_8[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +	[0x2] = INAT_MODRM | INAT_FORCE64,
> +	[0x3] = INAT_MODRM,
> +	[0x4] = INAT_MODRM | INAT_FORCE64,
> +	[0x5] = INAT_MODRM,
> +	[0x6] = INAT_MODRM | INAT_FORCE64,
> +};
> +
> +/* GrpTable: Grp6 */
> +const insn_attr_t inat_group_table_9[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +	[0x4] = INAT_MODRM,
> +	[0x5] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp7 */
> +const insn_attr_t inat_group_table_10[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +	[0x4] = INAT_MODRM,
> +	[0x6] = INAT_MODRM,
> +	[0x7] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp8 */
> +
> +/* GrpTable: Grp9 */
> +const insn_attr_t inat_group_table_21[INAT_GROUP_TABLE_SIZE] = {
> +	[0x1] = INAT_MODRM,
> +	[0x6] = INAT_MODRM | INAT_MODRM | INAT_VARIANT,
> +	[0x7] = INAT_MODRM | INAT_VARIANT,
> +};
> +const insn_attr_t inat_group_table_21_1[INAT_GROUP_TABLE_SIZE] = {
> +	[0x6] = INAT_MODRM,
> +};
> +const insn_attr_t inat_group_table_21_2[INAT_GROUP_TABLE_SIZE] = {
> +	[0x6] = INAT_MODRM,
> +	[0x7] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp10 */
> +
> +/* GrpTable: Grp11 */
> +
> +/* GrpTable: Grp12 */
> +const insn_attr_t inat_group_table_13[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +};
> +const insn_attr_t inat_group_table_13_1[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +};
> +
> +/* GrpTable: Grp13 */
> +const insn_attr_t inat_group_table_14[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +};
> +const insn_attr_t inat_group_table_14_1[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x4] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +};
> +
> +/* GrpTable: Grp14 */
> +const insn_attr_t inat_group_table_15[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x3] = INAT_VARIANT,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VARIANT,
> +	[0x7] = INAT_VARIANT,
> +};
> +const insn_attr_t inat_group_table_15_1[INAT_GROUP_TABLE_SIZE] = {
> +	[0x2] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x3] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x6] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +	[0x7] = INAT_MAKE_IMM(INAT_IMM_BYTE) | INAT_MODRM | INAT_VEXOK,
> +};
> +
> +/* GrpTable: Grp15 */
> +const insn_attr_t inat_group_table_18[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_VARIANT,
> +	[0x1] = INAT_VARIANT,
> +	[0x2] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +	[0x3] = INAT_MODRM | INAT_VEXOK | INAT_VARIANT,
> +};
> +const insn_attr_t inat_group_table_18_2[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp16 */
> +const insn_attr_t inat_group_table_12[INAT_GROUP_TABLE_SIZE] = {
> +	[0x0] = INAT_MODRM,
> +	[0x1] = INAT_MODRM,
> +	[0x2] = INAT_MODRM,
> +	[0x3] = INAT_MODRM,
> +};
> +
> +/* GrpTable: Grp17 */
> +const insn_attr_t inat_group_table_22[INAT_GROUP_TABLE_SIZE] = {
> +	[0x1] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x2] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +	[0x3] = INAT_MODRM | INAT_VEXOK | INAT_VEXONLY,
> +};
> +
> +/* GrpTable: GrpP */
> +
> +/* GrpTable: GrpPDLK */
> +
> +/* GrpTable: GrpRNG */
> +
> +/* Escape opcode map array */
> +const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1][INAT_LSTPFX_MAX + 1] = {
> +	[1][0] = inat_escape_table_1,
> +	[1][1] = inat_escape_table_1_1,
> +	[1][2] = inat_escape_table_1_2,
> +	[1][3] = inat_escape_table_1_3,
> +	[2][0] = inat_escape_table_2,
> +	[2][1] = inat_escape_table_2_1,
> +	[2][2] = inat_escape_table_2_2,
> +	[2][3] = inat_escape_table_2_3,
> +	[3][0] = inat_escape_table_3,
> +	[3][1] = inat_escape_table_3_1,
> +	[3][3] = inat_escape_table_3_3,
> +};
> +
> +/* Group opcode map array */
> +const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1][INAT_LSTPFX_MAX + 1] = {
> +	[5][0] = inat_group_table_5,
> +	[6][0] = inat_group_table_6,
> +	[7][0] = inat_group_table_7,
> +	[8][0] = inat_group_table_8,
> +	[9][0] = inat_group_table_9,
> +	[10][0] = inat_group_table_10,
> +	[12][0] = inat_group_table_12,
> +	[13][0] = inat_group_table_13,
> +	[13][1] = inat_group_table_13_1,
> +	[14][0] = inat_group_table_14,
> +	[14][1] = inat_group_table_14_1,
> +	[15][0] = inat_group_table_15,
> +	[15][1] = inat_group_table_15_1,
> +	[18][0] = inat_group_table_18,
> +	[18][2] = inat_group_table_18_2,
> +	[21][0] = inat_group_table_21,
> +	[21][1] = inat_group_table_21_1,
> +	[21][2] = inat_group_table_21_2,
> +	[22][0] = inat_group_table_22,
> +};
> +
> +/* AVX opcode map array */
> +const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1][INAT_LSTPFX_MAX + 1] = {
> +	[1][0] = inat_escape_table_1,
> +	[1][1] = inat_escape_table_1_1,
> +	[1][2] = inat_escape_table_1_2,
> +	[1][3] = inat_escape_table_1_3,
> +	[2][0] = inat_escape_table_2,
> +	[2][1] = inat_escape_table_2_1,
> +	[2][2] = inat_escape_table_2_2,
> +	[2][3] = inat_escape_table_2_3,
> +	[3][0] = inat_escape_table_3,
> +	[3][1] = inat_escape_table_3_1,
> +	[3][3] = inat_escape_table_3_3,
> +};
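
To make the generated layout concrete, here is a hand-worked trace (an
annotation, not generated output) of how a short instruction resolves
against the tables above:

/*
 * Decoding "0f b6 c3" (movzx eax, bl):
 *
 *   byte 0x0f: inat_primary_table[0x0f]  == INAT_MAKE_ESCAPE(1)
 *              -> continue in the two-byte map, inat_escape_table_1
 *   byte 0xb6: inat_escape_table_1[0xb6] == INAT_MODRM
 *              -> a ModRM byte follows
 *   byte 0xc3: ModRM (mod=11, reg=000/eax, rm=011/bl); no SIB,
 *              displacement or immediate bytes
 *
 * Total instruction length: 3 bytes.
 */
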
> diff --git a/xen/arch/x86/inat.c b/xen/arch/x86/inat.c
> new file mode 100644
> index 0000000..feeaa50
> --- /dev/null
> +++ b/xen/arch/x86/inat.c
> @@ -0,0 +1,96 @@
> +/*
> + * x86 instruction attribute tables
> + *
> + * Written by Masami Hiramatsu <mhiramat@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + */
> +#include <asm/insn.h>
> +
> +/* Attribute tables are generated from opcode map */
> +#include "inat-tables.c"
> +
> +/* Attribute search APIs */
> +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
> +{
> +	return inat_primary_table[opcode];
> +}
> +
> +int inat_get_last_prefix_id(insn_byte_t last_pfx)
> +{
> +	insn_attr_t lpfx_attr;
> +
> +	lpfx_attr = inat_get_opcode_attribute(last_pfx);
> +	return inat_last_prefix_id(lpfx_attr);
> +}
> +
> +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
> +				      insn_attr_t esc_attr)
> +{
> +	const insn_attr_t *table;
> +	int n;
> +
> +	n = inat_escape_id(esc_attr);
> +
> +	table = inat_escape_tables[n][0];
> +	if (!table)
> +		return 0;
> +	if (inat_has_variant(table[opcode]) && lpfx_id) {
> +		table = inat_escape_tables[n][lpfx_id];
> +		if (!table)
> +			return 0;
> +	}
> +	return table[opcode];
> +}
> +
> +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
> +				     insn_attr_t grp_attr)
> +{
> +	const insn_attr_t *table;
> +	int n;
> +
> +	n = inat_group_id(grp_attr);
> +
> +	table = inat_group_tables[n][0];
> +	if (!table)
> +		return inat_group_common_attribute(grp_attr);
> +	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
> +		table = inat_group_tables[n][lpfx_id];
> +		if (!table)
> +			return inat_group_common_attribute(grp_attr);
> +	}
> +	return table[X86_MODRM_REG(modrm)] |
> +	       inat_group_common_attribute(grp_attr);
> +}
> +
> +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
> +				   insn_byte_t vex_p)
> +{
> +	const insn_attr_t *table;
> +	if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
> +		return 0;
> +	/* First, check the master table */
> +	table = inat_avx_tables[vex_m][0];
> +	if (!table)
> +		return 0;
> +	if (!inat_is_group(table[opcode]) && vex_p) {
> +		/* If this is not a group, get attribute directly */
> +		table = inat_avx_tables[vex_m][vex_p];
> +		if (!table)
> +			return 0;
> +	}
> +	return table[opcode];
> +}
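
The variant handling is easiest to see with a concrete lookup. The sketch
below (illustrative only, not part of the file) resolves the attribute
for "66 0f 6f /r" (movdqa): the base entry inat_escape_table_1[0x6f] has
INAT_VARIANT set, so the trailing 0x66 prefix redirects the lookup into
inat_escape_table_1_1:

static insn_attr_t example_movdqa_attribute(void)
{
	/* 0x0f escapes into two-byte opcode map 1. */
	insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f);
	/* 0x66 is the last legacy prefix of "66 0f 6f /r". */
	int lpfx_id = inat_get_last_prefix_id(0x66);

	/*
	 * INAT_VARIANT in inat_escape_table_1[0x6f] plus a non-zero
	 * lpfx_id selects inat_escape_table_1_1[0x6f] instead.
	 */
	return inat_get_escape_attribute(0x6f, lpfx_id, esc_attr);
}
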
> diff --git a/xen/arch/x86/insn.c b/xen/arch/x86/insn.c
> new file mode 100644
> index 0000000..5aea2c7
> --- /dev/null
> +++ b/xen/arch/x86/insn.c
> @@ -0,0 +1,576 @@
> +/*
> + * x86 instruction analysis
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2002, 2004, 2009
> + */
> +
> +#include <xen/string.h>
> +#include <asm/inat.h>
> +#include <asm/insn.h>
> +
> +/* Verify that the next sizeof(t) bytes belong to the same instruction */
> +#define validate_next(t, insn, n)	\
> +	((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
> +
> +#define __get_next(t, insn)	\
> +	({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
> +
> +#define __peek_nbyte_next(t, insn, n)	\
> +	({ t r = *(t*)((insn)->next_byte + n); r; })
> +
> +#define get_next(t, insn)	\
> +	({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
> +
> +#define peek_nbyte_next(t, insn, n)	\
> +	({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
> +
> +#define peek_next(t, insn)	peek_nbyte_next(t, insn, 0)
> +
> +/**
> + * insn_init() - initialize struct insn
> + * @insn:	&struct insn to be initialized
> + * @kaddr:	address (in kernel memory) of instruction (or copy thereof)
> + * @x86_64:	!0 for 64-bit kernel or 64-bit app
> + */
> +void insn_init(struct insn *insn, const void *kaddr, int x86_64)
> +{
> +	memset(insn, 0, sizeof(*insn));
> +	insn->kaddr = kaddr;
> +	insn->next_byte = kaddr;
> +	insn->x86_64 = x86_64 ? 1 : 0;
> +	insn->opnd_bytes = 4;
> +	if (x86_64)
> +		insn->addr_bytes = 8;
> +	else
> +		insn->addr_bytes = 4;
> +}
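
A minimal use of this entry point, illustrative only (the buffer contents
are an assumed example; insn_get_length() is the top-level getter used by
hvm_get_insn_length() above):

static int example_insn_length(void)
{
	/*
	 * The buffer must be MAX_INSN_SIZE bytes, since validate_next()
	 * bounds all accesses against kaddr + MAX_INSN_SIZE.
	 */
	uint8_t buf[MAX_INSN_SIZE] = { 0x89, 0xc8 };	/* mov eax, ecx */
	struct insn insn;

	insn_init(&insn, buf, 1 /* 64-bit */);
	insn_get_length(&insn);	/* pulls in prefixes/opcode/ModRM/... */

	return insn.length;	/* 2 for the example above */
}
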
> +
> +/**
> + * insn_get_prefixes - scan x86 instruction prefix bytes
> + * @insn:	&struct insn containing instruction
> + *
> + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
> + * to point to the (first) opcode.  No effect if @insn->prefixes.got
> + * is already set.
> + */
> +void insn_get_prefixes(struct insn *insn)
> +{
> +	struct insn_field *prefixes = &insn->prefixes;
> +	insn_attr_t attr;
> +	insn_byte_t b, lb;
> +	int i, nb;
> +
> +	if (prefixes->got)
> +		return;
> +
> +	nb = 0;
> +	lb = 0;
> +	b = peek_next(insn_byte_t, insn);
> +	attr = inat_get_opcode_attribute(b);
> +	while (inat_is_legacy_prefix(attr)) {
> +		/* Skip if same prefix */
> +		for (i = 0; i < nb; i++)
> +			if (prefixes->bytes[i] == b)
> +				goto found;
> +		if (nb == 4)
> +			/* Invalid instruction */
> +			break;
> +		prefixes->bytes[nb++] = b;
> +		if (inat_is_address_size_prefix(attr)) {
> +			/* address size switches 2/4 or 4/8 */
> +			if (insn->x86_64)
> +				insn->addr_bytes ^= 12;
> +			else
> +				insn->addr_bytes ^= 6;
> +		} else if (inat_is_operand_size_prefix(attr)) {
> +			/* operand size switches 2/4 */
> +			insn->opnd_bytes ^= 6;
> +		}
> +found:
> +		prefixes->nbytes++;
> +		insn->next_byte++;
> +		lb = b;
> +		b = peek_next(insn_byte_t, insn);
> +		attr = inat_get_opcode_attribute(b);
> +	}
> +	/* Set the last prefix */
> +	if (lb && lb != insn->prefixes.bytes[3]) {
> +		if (unlikely(insn->prefixes.bytes[3])) {
> +			/* Swap the last prefix */
> +			b = insn->prefixes.bytes[3];
> +			for (i = 0; i < nb; i++)
> +				if (prefixes->bytes[i] == lb)
> +					prefixes->bytes[i] = b;
> +		}
> +		insn->prefixes.bytes[3] = lb;
> +	}
> +
> +	/* Decode REX prefix */
> +	if (insn->x86_64) {
> +		b = peek_next(insn_byte_t, insn);
> +		attr = inat_get_opcode_attribute(b);
> +		if (inat_is_rex_prefix(attr)) {
> +			insn->rex_prefix.value = b;
> +			insn->rex_prefix.nbytes = 1;
> +			insn->next_byte++;
> +			if (X86_REX_W(b))
> +				/* REX.W overrides opnd_size */
> +				insn->opnd_bytes = 8;
> +		}
> +	}
> +	insn->rex_prefix.got = 1;
> +
> +	/* Decode VEX prefix */
> +	b = peek_next(insn_byte_t, insn);
> +	attr = inat_get_opcode_attribute(b);
> +	if (inat_is_vex_prefix(attr)) {
> +		insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
> +		if (!insn->x86_64) {
> +			/*
> +			 * In 32-bit mode, if the [7:6] bits (mod bits of
> +			 * ModRM) on the second byte are not 11b, it is
> +			 * LDS or LES.
> +			 */
> +			if (X86_MODRM_MOD(b2) != 3)
> +				goto vex_end;
> +		}
> +		insn->vex_prefix.bytes[0] = b;
> +		insn->vex_prefix.bytes[1] = b2;
> +		if (inat_is_vex3_prefix(attr)) {
> +			b2 = peek_nbyte_next(insn_byte_t, insn, 2);
> +			insn->vex_prefix.bytes[2] = b2;
> +			insn->vex_prefix.nbytes = 3;
> +			insn->next_byte += 3;
> +			if (insn->x86_64 && X86_VEX_W(b2))
> +				/* VEX.W overrides opnd_size */
> +				insn->opnd_bytes = 8;
> +		} else {
> +			insn->vex_prefix.nbytes = 2;
> +			insn->next_byte += 2;
> +		}
> +	}
> +vex_end:
> +	insn->vex_prefix.got = 1;
> +
> +	prefixes->got = 1;
> +
> +err_out:
> +	return;
> +}
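
As an illustration of the bookkeeping above (assumed example, not part of
the file), decoding "66 67 89 08" in 64-bit mode:

static void example_prefixes(void)
{
	uint8_t buf[MAX_INSN_SIZE] = { 0x66, 0x67, 0x89, 0x08 };
	struct insn insn;

	insn_init(&insn, buf, 1);
	insn_get_prefixes(&insn);	/* "mov word [eax], cx" */

	/*
	 * Now: insn.prefixes.nbytes == 2 (0x66, 0x67),
	 *      insn.opnd_bytes == 2 (operand-size override),
	 *      insn.addr_bytes == 4 (address-size override),
	 * and insn.next_byte points at the 0x89 opcode byte.
	 */
}
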
> +
> +/**
> + * insn_get_opcode - collect opcode(s)
> + * @insn:	&struct insn containing instruction
> + *
> + * Populates @insn->opcode, updates @insn->next_byte to point past the
> + * opcode byte(s), and sets @insn->attr (except for groups).
> + * If necessary, first collects any preceding (prefix) bytes.
> + * Sets @insn->opcode.value = opcode1.  No effect if @insn->opcode.got
> + * is already 1.
> + */
> +void insn_get_opcode(struct insn *insn)
> +{
> +	struct insn_field *opcode = &insn->opcode;
> +	insn_byte_t op;
> +	int pfx_id;
> +	if (opcode->got)
> +		return;
> +	if (!insn->prefixes.got)
> +		insn_get_prefixes(insn);
> +
> +	/* Get first opcode */
> +	op = get_next(insn_byte_t, insn);
> +	opcode->bytes[0] = op;
> +	opcode->nbytes = 1;
> +
> +	/* Check whether there is a VEX prefix */
> +	if (insn_is_avx(insn)) {
> +		insn_byte_t m, p;
> +		m = insn_vex_m_bits(insn);
> +		p = insn_vex_p_bits(insn);
> +		insn->attr = inat_get_avx_attribute(op, m, p);
> +		if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
> +			insn->attr = 0;	/* This instruction is bad */
> +		goto end;	/* VEX has only 1 byte for opcode */
> +	}
> +
> +	insn->attr = inat_get_opcode_attribute(op);
> +	while (inat_is_escape(insn->attr)) {
> +		/* Get escaped opcode */
> +		op = get_next(insn_byte_t, insn);
> +		opcode->bytes[opcode->nbytes++] = op;
> +		pfx_id = insn_last_prefix_id(insn);
> +		insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
> +	}
> +	if (inat_must_vex(insn->attr))
> +		insn->attr = 0;	/* This instruction is bad */
> +end:
> +	opcode->got = 1;
> +
> +err_out:
> +	return;
> +}
> +
> +/**
> + * insn_get_modrm - collect ModRM byte, if any
> + * @insn:	&struct insn containing instruction
> + *
> + * Populates @insn->modrm and updates @insn->next_byte to point past the
> + * ModRM byte, if any.  If necessary, first collects the preceding bytes
> + * (prefixes and opcode(s)).  No effect if @insn->modrm.got is already 1.
> + */
> +void insn_get_modrm(struct insn *insn)
> +{
> +	struct insn_field *modrm = &insn->modrm;
> +	insn_byte_t pfx_id, mod;
> +	if (modrm->got)
> +		return;
> +	if (!insn->opcode.got)
> +		insn_get_opcode(insn);
> +
> +	if (inat_has_modrm(insn->attr)) {
> +		mod = get_next(insn_byte_t, insn);
> +		modrm->value = mod;
> +		modrm->nbytes = 1;
> +		if (inat_is_group(insn->attr)) {
> +			pfx_id = insn_last_prefix_id(insn);
> +			insn->attr = inat_get_group_attribute(mod, pfx_id,
> +							      insn->attr);
> +			if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
> +				insn->attr = 0;	/* This is bad */
> +		}
> +	}
> +
> +	if (insn->x86_64 && inat_is_force64(insn->attr))
> +		insn->opnd_bytes = 8;
> +	modrm->got = 1;
> +
> +err_out:
> +	return;
> +}
> +
> +
> +/**
> + * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
> + * @insn:	&struct insn containing instruction
> + *
> + * If necessary, first collects the instruction up to and including the
> + * ModRM byte.  No effect if @insn->x86_64 is 0.
> + */
> +int insn_rip_relative(struct insn *insn)
> +{
> +	struct insn_field *modrm = &insn->modrm;
> +
> +	if (!insn->x86_64)
> +		return 0;
> +	if (!modrm->got)
> +		insn_get_modrm(insn);
> +	/*
> +	 * For rip-relative instructions, the mod field (top 2 bits)
> +	 * is zero and the r/m field (bottom 3 bits) is 0x5.
> +	 */
> +	return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
> +}
> +
> +/**
> + * insn_get_sib() - Get the SIB byte of instruction
> + * @insn:	&struct insn containing instruction
> + *
> + * If necessary, first collects the instruction up to and including the
> + * ModRM byte.
> + */
> +void insn_get_sib(struct insn *insn)
> +{
> +	insn_byte_t modrm;
> +
> +	if (insn->sib.got)
> +		return;
> +	if (!insn->modrm.got)
> +		insn_get_modrm(insn);
> +	if (insn->modrm.nbytes) {
> +		modrm = (insn_byte_t)insn->modrm.value;
> +		if (insn->addr_bytes != 2 &&
> +		    X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
> +			insn->sib.value = get_next(insn_byte_t, insn);
> +			insn->sib.nbytes = 1;
> +		}
> +	}
> +	insn->sib.got = 1;
> +
> +err_out:
> +	return;
> +}
> +
> +
> +/**
> + * insn_get_displacement() - Get the displacement of instruction
> + * @insn:	&struct insn containing instruction
> + *
> + * If necessary, first collects the instruction up to and including the
> + * SIB byte.
> + * Displacement value is sign-expanded.
> + */
> +void insn_get_displacement(struct insn *insn)
> +{
> +	insn_byte_t mod, rm, base;
> +
> +	if (insn->displacement.got)
> +		return;
> +	if (!insn->sib.got)
> +		insn_get_sib(insn);
> +	if (insn->modrm.nbytes) {
> +		/*
> +		 * Interpreting the modrm byte:
> +		 * mod = 00 - no displacement fields (exceptions below)
> +		 * mod = 01 - 1-byte displacement field
> +		 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
> +		 * 	address size = 2 (0x67 prefix in 32-bit mode)
> +		 * mod = 11 - no memory operand
> +		 *
> +		 * If address size = 2...
> +		 * mod = 00, r/m = 110 - displacement field is 2 bytes
> +		 *
> +		 * If address size != 2...
> +		 * mod != 11, r/m = 100 - SIB byte exists
> +		 * mod = 00, SIB base = 101 - displacement field is 4 bytes
> +		 * mod = 00, r/m = 101 - rip-relative addressing, displacement
> +		 * 	field is 4 bytes
> +		 */
> +		mod = X86_MODRM_MOD(insn->modrm.value);
> +		rm = X86_MODRM_RM(insn->modrm.value);
> +		base = X86_SIB_BASE(insn->sib.value);
> +		if (mod == 3)
> +			goto out;
> +		if (mod == 1) {
> +			insn->displacement.value = get_next(char, insn);
> +			insn->displacement.nbytes = 1;
> +		} else if (insn->addr_bytes == 2) {
> +			if ((mod == 0 && rm == 6) || mod == 2) {
> +				insn->displacement.value =
> +					 get_next(short, insn);
> +				insn->displacement.nbytes = 2;
> +			}
> +		} else {
> +			if ((mod == 0 && rm == 5) || mod == 2 ||
> +			    (mod == 0 && base == 5)) {
> +				insn->displacement.value = get_next(int, insn);
> +				insn->displacement.nbytes = 4;
> +			}
> +		}
> +	}
> +out:
> +	insn->displacement.got = 1;
> +
> +err_out:
> +	return;
> +}
> +
> +/* Decode moffset16/32/64. Return 0 if failed */
> +static int __get_moffset(struct insn *insn)
> +{
> +	switch (insn->addr_bytes) {
> +	case 2:
> +		insn->moffset1.value = get_next(short, insn);
> +		insn->moffset1.nbytes = 2;
> +		break;
> +	case 4:
> +		insn->moffset1.value = get_next(int, insn);
> +		insn->moffset1.nbytes = 4;
> +		break;
> +	case 8:
> +		insn->moffset1.value = get_next(int, insn);
> +		insn->moffset1.nbytes = 4;
> +		insn->moffset2.value = get_next(int, insn);
> +		insn->moffset2.nbytes = 4;
> +		break;
> +	default:	/* opnd_bytes must be modified manually */
> +		goto err_out;
> +	}
> +	insn->moffset1.got = insn->moffset2.got = 1;
> +
> +	return 1;
> +
> +err_out:
> +	return 0;
> +}
> +
> +/* Decode imm v32(Iz). Return 0 if failed */
> +static int __get_immv32(struct insn *insn)
> +{
> +	switch (insn->opnd_bytes) {
> +	case 2:
> +		insn->immediate.value = get_next(short, insn);
> +		insn->immediate.nbytes = 2;
> +		break;
> +	case 4:
> +	case 8:
> +		insn->immediate.value = get_next(int, insn);
> +		insn->immediate.nbytes = 4;
> +		break;
> +	default:	/* opnd_bytes must be modified manually */
> +		goto err_out;
> +	}
> +
> +	return 1;
> +
> +err_out:
> +	return 0;
> +}
> +
> +/* Decode imm v64(Iv/Ov), Return 0 if failed */
> +static int __get_immv(struct insn *insn)
> +{
> +	switch (insn->opnd_bytes) {
> +	case 2:
> +		insn->immediate1.value = get_next(short, insn);
> +		insn->immediate1.nbytes = 2;
> +		break;
> +	case 4:
> +		insn->immediate1.value = get_next(int, insn);
> +		insn->immediate1.nbytes = 4;
> +		break;
> +	case 8:
> +		insn->immediate1.value = get_next(int, insn);
> +		insn->immediate1.nbytes = 4;
> +		insn->immediate2.value = get_next(int, insn);
> +		insn->immediate2.nbytes = 4;
> +		break;
> +	default:	/* opnd_bytes must be modified manually */
> +		goto err_out;
> +	}
> +	insn->immediate1.got = insn->immediate2.got = 1;
> +
> +	return 1;
> +err_out:
> +	return 0;
> +}
> +
> +/* Decode ptr16:16/32(Ap) */
> +static int __get_immptr(struct insn *insn)
> +{
> +	switch (insn->opnd_bytes) {
> +	case 2:
> +		insn->immediate1.value = get_next(short, insn);
> +		insn->immediate1.nbytes = 2;
> +		break;
> +	case 4:
> +		insn->immediate1.value = get_next(int, insn);
> +		insn->immediate1.nbytes = 4;
> +		break;
> +	case 8:
> +		/* ptr16:64 does not exist (no segment) */
> +		return 0;
> +	default:	/* opnd_bytes must be modified manually */
> +		goto err_out;
> +	}
> +	insn->immediate2.value = get_next(unsigned short, insn);
> +	insn->immediate2.nbytes = 2;
> +	insn->immediate1.got = insn->immediate2.got = 1;
> +
> +	return 1;
> +err_out:
> +	return 0;
> +}
> +
> +/**
> + * insn_get_immediate() - Get the immediates of instruction
> + * @insn:	&struct insn containing instruction
> + *
> + * If necessary, first collects the instruction up to and including the
> + * displacement bytes.
> + * Basically, most immediates are sign-expanded. The unsigned value can be
> + * obtained by bit-masking with ((1 << (nbytes * 8)) - 1).
> + */
> +void insn_get_immediate(struct insn *insn)
> +{
> +	if (insn->immediate.got)
> +		return;
> +	if (!insn->displacement.got)
> +		insn_get_displacement(insn);
> +
> +	if (inat_has_moffset(insn->attr)) {
> +		if (!__get_moffset(insn))
> +			goto err_out;
> +		goto done;
> +	}
> +
> +	if (!inat_has_immediate(insn->attr))
> +		/* no immediates */
> +		goto done;
> +
> +	switch (inat_immediate_size(insn->attr)) {
> +	case INAT_IMM_BYTE:
> +		insn->immediate.value = get_next(char, insn);
> +		insn->immediate.nbytes = 1;
> +		break;
> +	case INAT_IMM_WORD:
> +		insn->immediate.value = get_next(short, insn);
> +		insn->immediate.nbytes = 2;
> +		break;
> +	case INAT_IMM_DWORD:
> +		insn->immediate.value = get_next(int, insn);
> +		insn->immediate.nbytes = 4;
> +		break;
> +	case INAT_IMM_QWORD:
> +		insn->immediate1.value = get_next(int, insn);
> +		insn->immediate1.nbytes = 4;
> +		insn->immediate2.value = get_next(int, insn);
> +		insn->immediate2.nbytes = 4;
> +		break;
> +	case INAT_IMM_PTR:
> +		if (!__get_immptr(insn))
> +			goto err_out;
> +		break;
> +	case INAT_IMM_VWORD32:
> +		if (!__get_immv32(insn))
> +			goto err_out;
> +		break;
> +	case INAT_IMM_VWORD:
> +		if (!__get_immv(insn))
> +			goto err_out;
> +		break;
> +	default:
> +		/* Here, insn must have an immediate, but failed */
> +		goto err_out;
> +	}
> +	if (inat_has_second_immediate(insn->attr)) {
> +		insn->immediate2.value = get_next(char, insn);
> +		insn->immediate2.nbytes = 1;
> +	}
> +done:
> +	insn->immediate.got = 1;
> +
> +err_out:
> +	return;
> +}
> +
> +/**
> + * insn_get_length() - Get the length of instruction
> + * @insn:	&struct insn containing instruction
> + *
> + * If necessary, first collects the instruction up to and including the
> + * immediates bytes.
> + */
> +void insn_get_length(struct insn *insn)
> +{
> +	if (insn->length)
> +		return;
> +	if (!insn->immediate.got)
> +		insn_get_immediate(insn);
> +	insn->length = (unsigned char)((unsigned long)insn->next_byte
> +				     - (unsigned long)insn->kaddr);
> +}
> diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
> index 00a06cc..db89184 100644
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -37,6 +37,9 @@ struct hvm_emulate_ctxt {
>  
>  int hvm_emulate_one(
>      struct hvm_emulate_ctxt *hvmemul_ctxt);
> +int hvm_emulate_one_no_write(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt);
> +void hvm_emulate_one_full(bool_t nowrite);
>  void hvm_emulate_prepare(
>      struct hvm_emulate_ctxt *hvmemul_ctxt,
>      struct cpu_user_regs *regs);
> @@ -45,6 +48,8 @@ void hvm_emulate_writeback(
>  struct segment_register *hvmemul_get_seg_reg(
>      enum x86_segment seg,
>      struct hvm_emulate_ctxt *hvmemul_ctxt);
> +int hvm_get_insn_length(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt);
>  
>  int hvmemul_do_pio(
>      unsigned long port, unsigned long *reps, int size,
> diff --git a/xen/include/asm-x86/inat.h b/xen/include/asm-x86/inat.h
> new file mode 100644
> index 0000000..74a2e31
> --- /dev/null
> +++ b/xen/include/asm-x86/inat.h
> @@ -0,0 +1,221 @@
> +#ifndef _ASM_X86_INAT_H
> +#define _ASM_X86_INAT_H
> +/*
> + * x86 instruction attributes
> + *
> + * Written by Masami Hiramatsu <mhiramat@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + */
> +#include <asm/inat_types.h>
> +
> +/*
> + * Internal bits. Don't use bitmasks directly, because these bits are
> + * unstable. You should use checking functions.
> + */
> +
> +#define INAT_OPCODE_TABLE_SIZE 256
> +#define INAT_GROUP_TABLE_SIZE 8
> +
> +/* Legacy last prefixes */
> +#define INAT_PFX_OPNDSZ	1	/* 0x66 */ /* LPFX1 */
> +#define INAT_PFX_REPE	2	/* 0xF3 */ /* LPFX2 */
> +#define INAT_PFX_REPNE	3	/* 0xF2 */ /* LPFX3 */
> +/* Other Legacy prefixes */
> +#define INAT_PFX_LOCK	4	/* 0xF0 */
> +#define INAT_PFX_CS	5	/* 0x2E */
> +#define INAT_PFX_DS	6	/* 0x3E */
> +#define INAT_PFX_ES	7	/* 0x26 */
> +#define INAT_PFX_FS	8	/* 0x64 */
> +#define INAT_PFX_GS	9	/* 0x65 */
> +#define INAT_PFX_SS	10	/* 0x36 */
> +#define INAT_PFX_ADDRSZ	11	/* 0x67 */
> +/* x86-64 REX prefix */
> +#define INAT_PFX_REX	12	/* 0x4X */
> +/* AVX VEX prefixes */
> +#define INAT_PFX_VEX2	13	/* 2-bytes VEX prefix */
> +#define INAT_PFX_VEX3	14	/* 3-bytes VEX prefix */
> +
> +#define INAT_LSTPFX_MAX	3
> +#define INAT_LGCPFX_MAX	11
> +
> +/* Immediate size */
> +#define INAT_IMM_BYTE		1
> +#define INAT_IMM_WORD		2
> +#define INAT_IMM_DWORD		3
> +#define INAT_IMM_QWORD		4
> +#define INAT_IMM_PTR		5
> +#define INAT_IMM_VWORD32	6
> +#define INAT_IMM_VWORD		7
> +
> +/* Legacy prefix */
> +#define INAT_PFX_OFFS	0
> +#define INAT_PFX_BITS	4
> +#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
> +#define INAT_PFX_MASK	(INAT_PFX_MAX << INAT_PFX_OFFS)
> +/* Escape opcodes */
> +#define INAT_ESC_OFFS	(INAT_PFX_OFFS + INAT_PFX_BITS)
> +#define INAT_ESC_BITS	2
> +#define INAT_ESC_MAX	((1 << INAT_ESC_BITS) - 1)
> +#define INAT_ESC_MASK	(INAT_ESC_MAX << INAT_ESC_OFFS)
> +/* Group opcodes (1-16) */
> +#define INAT_GRP_OFFS	(INAT_ESC_OFFS + INAT_ESC_BITS)
> +#define INAT_GRP_BITS	5
> +#define INAT_GRP_MAX	((1 << INAT_GRP_BITS) - 1)
> +#define INAT_GRP_MASK	(INAT_GRP_MAX << INAT_GRP_OFFS)
> +/* Immediates */
> +#define INAT_IMM_OFFS	(INAT_GRP_OFFS + INAT_GRP_BITS)
> +#define INAT_IMM_BITS	3
> +#define INAT_IMM_MASK	(((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
> +/* Flags */
> +#define INAT_FLAG_OFFS	(INAT_IMM_OFFS + INAT_IMM_BITS)
> +#define INAT_MODRM	(1 << (INAT_FLAG_OFFS))
> +#define INAT_FORCE64	(1 << (INAT_FLAG_OFFS + 1))
> +#define INAT_SCNDIMM	(1 << (INAT_FLAG_OFFS + 2))
> +#define INAT_MOFFSET	(1 << (INAT_FLAG_OFFS + 3))
> +#define INAT_VARIANT	(1 << (INAT_FLAG_OFFS + 4))
> +#define INAT_VEXOK	(1 << (INAT_FLAG_OFFS + 5))
> +#define INAT_VEXONLY	(1 << (INAT_FLAG_OFFS + 6))
> +/* Attribute making macros for attribute tables */
> +#define INAT_MAKE_PREFIX(pfx)	(pfx << INAT_PFX_OFFS)
> +#define INAT_MAKE_ESCAPE(esc)	(esc << INAT_ESC_OFFS)
> +#define INAT_MAKE_GROUP(grp)	((grp << INAT_GRP_OFFS) | INAT_MODRM)
> +#define INAT_MAKE_IMM(imm)	(imm << INAT_IMM_OFFS)
> +
> +/* Attribute search APIs */
> +extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
> +extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
> +extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
> +					     int lpfx_id,
> +					     insn_attr_t esc_attr);
> +extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
> +					    int lpfx_id,
> +					    insn_attr_t esc_attr);
> +extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
> +					  insn_byte_t vex_m,
> +					  insn_byte_t vex_pp);
> +
> +/* Attribute checking functions */
> +static inline int inat_is_legacy_prefix(insn_attr_t attr)
> +{
> +	attr &= INAT_PFX_MASK;
> +	return attr && attr <= INAT_LGCPFX_MAX;
> +}
> +
> +static inline int inat_is_address_size_prefix(insn_attr_t attr)
> +{
> +	return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
> +}
> +
> +static inline int inat_is_operand_size_prefix(insn_attr_t attr)
> +{
> +	return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
> +}
> +
> +static inline int inat_is_rex_prefix(insn_attr_t attr)
> +{
> +	return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
> +}
> +
> +static inline int inat_last_prefix_id(insn_attr_t attr)
> +{
> +	if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
> +		return 0;
> +	else
> +		return attr & INAT_PFX_MASK;
> +}
> +
> +static inline int inat_is_vex_prefix(insn_attr_t attr)
> +{
> +	attr &= INAT_PFX_MASK;
> +	return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
> +}
> +
> +static inline int inat_is_vex3_prefix(insn_attr_t attr)
> +{
> +	return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
> +}
> +
> +static inline int inat_is_escape(insn_attr_t attr)
> +{
> +	return attr & INAT_ESC_MASK;
> +}
> +
> +static inline int inat_escape_id(insn_attr_t attr)
> +{
> +	return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
> +}
> +
> +static inline int inat_is_group(insn_attr_t attr)
> +{
> +	return attr & INAT_GRP_MASK;
> +}
> +
> +static inline int inat_group_id(insn_attr_t attr)
> +{
> +	return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
> +}
> +
> +static inline int inat_group_common_attribute(insn_attr_t attr)
> +{
> +	return attr & ~INAT_GRP_MASK;
> +}
> +
> +static inline int inat_has_immediate(insn_attr_t attr)
> +{
> +	return attr & INAT_IMM_MASK;
> +}
> +
> +static inline int inat_immediate_size(insn_attr_t attr)
> +{
> +	return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
> +}
> +
> +static inline int inat_has_modrm(insn_attr_t attr)
> +{
> +	return attr & INAT_MODRM;
> +}
> +
> +static inline int inat_is_force64(insn_attr_t attr)
> +{
> +	return attr & INAT_FORCE64;
> +}
> +
> +static inline int inat_has_second_immediate(insn_attr_t attr)
> +{
> +	return attr & INAT_SCNDIMM;
> +}
> +
> +static inline int inat_has_moffset(insn_attr_t attr)
> +{
> +	return attr & INAT_MOFFSET;
> +}
> +
> +static inline int inat_has_variant(insn_attr_t attr)
> +{
> +	return attr & INAT_VARIANT;
> +}
> +
> +static inline int inat_accept_vex(insn_attr_t attr)
> +{
> +	return attr & INAT_VEXOK;
> +}
> +
> +static inline int inat_must_vex(insn_attr_t attr)
> +{
> +	return attr & INAT_VEXONLY;
> +}
> +#endif
> diff --git a/xen/include/asm-x86/inat_types.h b/xen/include/asm-x86/inat_types.h
> new file mode 100644
> index 0000000..cb3c20c
> --- /dev/null
> +++ b/xen/include/asm-x86/inat_types.h
> @@ -0,0 +1,29 @@
> +#ifndef _ASM_X86_INAT_TYPES_H
> +#define _ASM_X86_INAT_TYPES_H
> +/*
> + * x86 instruction attributes
> + *
> + * Written by Masami Hiramatsu <mhiramat@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + */
> +
> +/* Instruction attributes */
> +typedef unsigned int insn_attr_t;
> +typedef unsigned char insn_byte_t;
> +typedef signed int insn_value_t;
> +
> +#endif
> diff --git a/xen/include/asm-x86/insn.h b/xen/include/asm-x86/insn.h
> new file mode 100644
> index 0000000..48eb30a
> --- /dev/null
> +++ b/xen/include/asm-x86/insn.h
> @@ -0,0 +1,199 @@
> +#ifndef _ASM_X86_INSN_H
> +#define _ASM_X86_INSN_H
> +/*
> + * x86 instruction analysis
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2009
> + */
> +
> +/* insn_attr_t is defined in inat.h */
> +#include <asm/inat.h>
> +
> +struct insn_field {
> +	union {
> +		insn_value_t value;
> +		insn_byte_t bytes[4];
> +	};
> +	/* !0 if we've run insn_get_xxx() for this field */
> +	unsigned char got;
> +	unsigned char nbytes;
> +};
> +
> +struct insn {
> +	struct insn_field prefixes;	/*
> +					 * Prefixes
> +					 * prefixes.bytes[3]: last prefix
> +					 */
> +	struct insn_field rex_prefix;	/* REX prefix */
> +	struct insn_field vex_prefix;	/* VEX prefix */
> +	struct insn_field opcode;	/*
> +					 * opcode.bytes[0]: opcode1
> +					 * opcode.bytes[1]: opcode2
> +					 * opcode.bytes[2]: opcode3
> +					 */
> +	struct insn_field modrm;
> +	struct insn_field sib;
> +	struct insn_field displacement;
> +	union {
> +		struct insn_field immediate;
> +		struct insn_field moffset1;	/* for 64bit MOV */
> +		struct insn_field immediate1;	/* for 64bit imm or off16/32 */
> +	};
> +	union {
> +		struct insn_field moffset2;	/* for 64bit MOV */
> +		struct insn_field immediate2;	/* for 64bit imm or seg16 */
> +	};
> +
> +	insn_attr_t attr;
> +	unsigned char opnd_bytes;
> +	unsigned char addr_bytes;
> +	unsigned char length;
> +	unsigned char x86_64;
> +
> +	const insn_byte_t *kaddr;	/* kernel address of insn to analyze */
> +	const insn_byte_t *next_byte;
> +};
> +
> +#define MAX_INSN_SIZE	16
> +
> +#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
> +#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
> +#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
> +
> +#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
> +#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
> +#define X86_SIB_BASE(sib) ((sib) & 0x07)
> +
> +#define X86_REX_W(rex) ((rex) & 8)
> +#define X86_REX_R(rex) ((rex) & 4)
> +#define X86_REX_X(rex) ((rex) & 2)
> +#define X86_REX_B(rex) ((rex) & 1)
> +
> +/* VEX bit flags  */
> +#define X86_VEX_W(vex)	((vex) & 0x80)	/* VEX3 Byte2 */
> +#define X86_VEX_R(vex)	((vex) & 0x80)	/* VEX2/3 Byte1 */
> +#define X86_VEX_X(vex)	((vex) & 0x40)	/* VEX3 Byte1 */
> +#define X86_VEX_B(vex)	((vex) & 0x20)	/* VEX3 Byte1 */
> +#define X86_VEX_L(vex)	((vex) & 0x04)	/* VEX3 Byte2, VEX2 Byte1 */
> +/* VEX bit fields */
> +#define X86_VEX3_M(vex)	((vex) & 0x1f)		/* VEX3 Byte1 */
> +#define X86_VEX2_M	1			/* VEX2.M always 1 */
> +#define X86_VEX_V(vex)	(((vex) & 0x78) >> 3)	/* VEX3 Byte2, VEX2 Byte1 */
> +#define X86_VEX_P(vex)	((vex) & 0x03)		/* VEX3 Byte2, VEX2 Byte1 */
> +#define X86_VEX_M_MAX	0x1f			/* VEX3.M Maximum value */
> +
> +extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
> +extern void insn_get_prefixes(struct insn *insn);
> +extern void insn_get_opcode(struct insn *insn);
> +extern void insn_get_modrm(struct insn *insn);
> +extern void insn_get_sib(struct insn *insn);
> +extern void insn_get_displacement(struct insn *insn);
> +extern void insn_get_immediate(struct insn *insn);
> +extern void insn_get_length(struct insn *insn);
> +
> +/* Attribute will be determined after getting ModRM (for opcode groups) */
> +static inline void insn_get_attribute(struct insn *insn)
> +{
> +	insn_get_modrm(insn);
> +}
> +
> +/* Instruction uses RIP-relative addressing */
> +extern int insn_rip_relative(struct insn *insn);
> +
> +/* Init insn for kernel text */
> +static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
> +{
> +#ifdef CONFIG_X86_64
> +	insn_init(insn, kaddr, 1);
> +#else /* CONFIG_X86_32 */
> +	insn_init(insn, kaddr, 0);
> +#endif
> +}
> +
> +static inline int insn_is_avx(struct insn *insn)
> +{
> +	if (!insn->prefixes.got)
> +		insn_get_prefixes(insn);
> +	return (insn->vex_prefix.value != 0);
> +}
> +
> +/* Ensure this instruction is decoded completely */
> +static inline int insn_complete(struct insn *insn)
> +{
> +	return insn->opcode.got && insn->modrm.got && insn->sib.got &&
> +		insn->displacement.got && insn->immediate.got;
> +}
> +
> +static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
> +{
> +	if (insn->vex_prefix.nbytes == 2)	/* 2 bytes VEX */
> +		return X86_VEX2_M;
> +	else
> +		return X86_VEX3_M(insn->vex_prefix.bytes[1]);
> +}
> +
> +static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
> +{
> +	if (insn->vex_prefix.nbytes == 2)	/* 2 bytes VEX */
> +		return X86_VEX_P(insn->vex_prefix.bytes[1]);
> +	else
> +		return X86_VEX_P(insn->vex_prefix.bytes[2]);
> +}
> +
> +/* Get the last prefix id from last prefix or VEX prefix */
> +static inline int insn_last_prefix_id(struct insn *insn)
> +{
> +	if (insn_is_avx(insn))
> +		return insn_vex_p_bits(insn);	/* VEX_p is a SIMD prefix id */
> +
> +	if (insn->prefixes.bytes[3])
> +		return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
> +
> +	return 0;
> +}
> +
> +/* Offset of each field from kaddr */
> +static inline int insn_offset_rex_prefix(struct insn *insn)
> +{
> +	return insn->prefixes.nbytes;
> +}
> +static inline int insn_offset_vex_prefix(struct insn *insn)
> +{
> +	return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
> +}
> +static inline int insn_offset_opcode(struct insn *insn)
> +{
> +	return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
> +}
> +static inline int insn_offset_modrm(struct insn *insn)
> +{
> +	return insn_offset_opcode(insn) + insn->opcode.nbytes;
> +}
> +static inline int insn_offset_sib(struct insn *insn)
> +{
> +	return insn_offset_modrm(insn) + insn->modrm.nbytes;
> +}
> +static inline int insn_offset_displacement(struct insn *insn)
> +{
> +	return insn_offset_sib(insn) + insn->sib.nbytes;
> +}
> +static inline int insn_offset_immediate(struct insn *insn)
> +{
> +	return insn_offset_displacement(insn) + insn->displacement.nbytes;
> +}
> +
> +#endif /* _ASM_X86_INSN_H */
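FWIW, for my own understanding: I take it the decoder is meant to be driven
roughly like this (illustrative sketch only, using just the interfaces
introduced above)?

	struct insn insn;
	uint8_t buf[MAX_INSN_SIZE];

	/* buf must first be filled with up to MAX_INSN_SIZE guest code bytes. */
	insn_init(&insn, buf, 1 /* 64-bit guest */);
	insn_get_length(&insn);	/* pulls in prefixes, opcode, ModRM, SIB, ... */
	/* insn.length now holds the decoded instruction length in bytes. */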

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
                   ` (8 preceding siblings ...)
  2014-07-02 15:20 ` [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Andrew Cooper
@ 2014-07-02 15:21 ` Jan Beulich
  2014-07-02 15:43   ` Razvan Cojocaru
  2014-07-03  7:38   ` Razvan Cojocaru
  9 siblings, 2 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:21 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> @@ -688,6 +689,17 @@ static int hvmemul_write(
>      return X86EMUL_OKAY;
>  }
>  
> +static int hvmemul_write_dummy(
> +    enum x86_segment __attribute__((unused)) seg,
> +    unsigned long __attribute__((unused)) offset,
> +    void __attribute__((unused)) *p_data,
> +    unsigned int __attribute__((unused)) bytes,
> +    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)

We don't mark unused function arguments like this (and if we did,
we'd want you to use __maybe_unused).

> @@ -1239,6 +1251,139 @@ int hvm_emulate_one(
>      return X86EMUL_OKAY;
>  }
>  
> +int hvm_emulate_one_no_write(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +{

This must be pretty redundant with hvm_emulate_one(), and hence
most if not all of the redundancy should be factored out.
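E.g. (untested sketch - hvm_emulate_ops_no_write would be a copy of
hvm_emulate_ops with the write/cmpxchg handlers replaced by discarding
variants such as hvmemul_write_dummy above):

    static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
                                const struct x86_emulate_ops *ops)
    {
        /* ... body of the current hvm_emulate_one(), with the
         * x86_emulate() call using 'ops' instead of &hvm_emulate_ops ... */
    }

    int hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
    {
        return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops);
    }

    int hvm_emulate_one_no_write(struct hvm_emulate_ctxt *hvmemul_ctxt)
    {
        return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops_no_write);
    }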

> +void hvm_emulate_one_full(bool_t nowrite)
> +{
> +    struct hvm_emulate_ctxt ctx[1] = {};
> +    int rc = X86EMUL_RETRY;
> +
> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
> +
> +    while ( rc == X86EMUL_RETRY )
> +    {
> +        if ( nowrite )
> +            rc = hvm_emulate_one_no_write(ctx);
> +        else
> +            rc = hvm_emulate_one(ctx);
> +    }
> +
> +    switch ( rc )
> +    {
> +    case X86EMUL_UNHANDLEABLE:
> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);

Is it certain that #UD is always the right exception here?

> @@ -1278,6 +1423,53 @@ struct segment_register *hvmemul_get_seg_reg(
>      return &hvmemul_ctxt->seg_reg[seg];
>  }
>  
> +int hvm_get_insn_length(
> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +{

There again looks to be quite a bit of redundancy here. Please let's
avoid having n copies of (almost) the same code.

> --- /dev/null
> +++ b/xen/arch/x86/inat-tables.c

I'm not going to look at this in much detail, just a couple of general
notes:
- at least some of the information is redundant with the full x86
  emulator; as before redundancy should be avoided
- many if not all of the arrays here appear to only be used locally,
  and hence ought to be static
- some of the tables are extremely sparse (take
  inat_escape_table_3_3[] as an example); would be nice to collapse
  those
- making future changes/additions to these tables is going to be
  pretty hard with them having neither suitable names nor comments
- coding style (also elsewhere) seems to be Linux's, yet you don't
  mention in the description that they come from Linux (and if they
  don't you'd be asked to convert them to Xen style)

> +/**
> + * insn_init() - initialize struct insn
> + * @insn:	&struct insn to be initialized
> + * @kaddr:	address (in kernel memory) of instruction (or copy thereof)
> + * @x86_64:	!0 for 64-bit kernel or 64-bit app
> + */
> +void insn_init(struct insn *insn, const void *kaddr, int x86_64)
> +{
> +	memset(insn, 0, sizeof(*insn));
> +	insn->kaddr = kaddr;
> +	insn->next_byte = kaddr;
> +	insn->x86_64 = x86_64 ? 1 : 0;

If the argument was bool_t, you wouldn't need the conditional
operator here.

> +	insn->opnd_bytes = 4;
> +	if (x86_64)
> +		insn->addr_bytes = 8;
> +	else
> +		insn->addr_bytes = 4;

Yet here the conditional operator would improve readability imo.

> +void insn_get_prefixes(struct insn *insn)
> +{
> +	struct insn_field *prefixes = &insn->prefixes;
> +	insn_attr_t attr;
> +	insn_byte_t b, lb;
> +	int i, nb;
> +
> +	if (prefixes->got)
> +		return;
> +
> +	nb = 0;
> +	lb = 0;
> +	b = peek_next(insn_byte_t, insn);
> +	attr = inat_get_opcode_attribute(b);
> +	while (inat_is_legacy_prefix(attr)) {
> +		/* Skip if same prefix */
> +		for (i = 0; i < nb; i++)
> +			if (prefixes->bytes[i] == b)
> +				goto found;

This discarding of duplicates won't always work correctly with multiple
redundant segment override prefixes.

> +		if (nb == 4)
> +			/* Invalid instruction */
> +			break;
> +		prefixes->bytes[nb++] = b;
> +		if (inat_is_address_size_prefix(attr)) {
> +			/* address size switches 2/4 or 4/8 */
> +			if (insn->x86_64)
> +				insn->addr_bytes ^= 12;
> +			else
> +				insn->addr_bytes ^= 6;
> +		} else if (inat_is_operand_size_prefix(attr)) {
> +			/* operand size switches 2/4 */
> +			insn->opnd_bytes ^= 6;
> +		}

Neither the address size nor the operand size prefix works this way:
Redundant prefixes don't undo the address/operand size change.
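I.e. the prefix should select the non-default size rather than toggle it,
e.g. (untested sketch, assuming only 32- and 64-bit default segments, which
is all this decoder deals with anyway):

		if (inat_is_address_size_prefix(attr)) {
			/* 0x67 selects the non-default address size */
			insn->addr_bytes = insn->x86_64 ? 4 : 2;
		} else if (inat_is_operand_size_prefix(attr)) {
			/* 0x66 selects the non-default operand size;
			 * a later REX.W still overrides this to 8. */
			insn->opnd_bytes = 2;
		}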

> +found:
> +		prefixes->nbytes++;
> +		insn->next_byte++;
> +		lb = b;
> +		b = peek_next(insn_byte_t, insn);
> +		attr = inat_get_opcode_attribute(b);
> +	}
> +	/* Set the last prefix */
> +	if (lb && lb != insn->prefixes.bytes[3]) {
> +		if (unlikely(insn->prefixes.bytes[3])) {
> +			/* Swap the last prefix */
> +			b = insn->prefixes.bytes[3];
> +			for (i = 0; i < nb; i++)
> +				if (prefixes->bytes[i] == lb)
> +					prefixes->bytes[i] = b;
> +		}
> +		insn->prefixes.bytes[3] = lb;
> +	}
> +
> +	/* Decode REX prefix */
> +	if (insn->x86_64) {
> +		b = peek_next(insn_byte_t, insn);
> +		attr = inat_get_opcode_attribute(b);
> +		if (inat_is_rex_prefix(attr)) {
> +			insn->rex_prefix.value = b;
> +			insn->rex_prefix.nbytes = 1;
> +			insn->next_byte++;
> +			if (X86_REX_W(b))
> +				/* REX.W overrides opnd_size */
> +				insn->opnd_bytes = 8;
> +		}
> +	}
> +	insn->rex_prefix.got = 1;
> +
> +	/* Decode VEX prefix */
> +	b = peek_next(insn_byte_t, insn);
> +	attr = inat_get_opcode_attribute(b);
> +	if (inat_is_vex_prefix(attr)) {

This all doesn't look quite correct: A VEX prefix can't follow a REX one,
and legacy prefixes following a REX prefix invalidate the REX one.

> +void insn_get_opcode(struct insn *insn)
> +{
> +	struct insn_field *opcode = &insn->opcode;
> +	insn_byte_t op;
> +	int pfx_id;
> +	if (opcode->got)
> +		return;
> +	if (!insn->prefixes.got)

insn_get_prefixes() already checks this - please settle on whether you
want to avoid the call, or bail early from the function.

> +static int __get_immptr(struct insn *insn)
> +{
> +	switch (insn->opnd_bytes) {
> +	case 2:
> +		insn->immediate1.value = get_next(short, insn);
> +		insn->immediate1.nbytes = 2;
> +		break;
> +	case 4:
> +		insn->immediate1.value = get_next(int, insn);
> +		insn->immediate1.nbytes = 4;
> +		break;
> +	case 8:
> +		/* ptr16:64 does not exist (no segment) */
> +		return 0;
> +	default:	/* opnd_bytes must be modified manually */
> +		goto err_out;

Considering that err_out: is followed by just "return 0", why does case
8 above not "goto err_out" too, or why are "invalid" and "must be
modified manually" being indicated with the same return value? (And
I consider goto-s to a label that is followed by just a single simple
statement bad practice anyway.)

> +void insn_get_length(struct insn *insn)
> +{
> +	if (insn->length)
> +		return;
> +	if (!insn->immediate.got)
> +		insn_get_immediate(insn);
> +	insn->length = (unsigned char)((unsigned long)insn->next_byte
> +				     - (unsigned long)insn->kaddr);
> +}

That doesn't seem correct in boundary cases, e.g. a 32-bit segment
wrap at an instruction boundary.

> +/* Legacy last prefixes */
> +#define INAT_PFX_OPNDSZ	1	/* 0x66 */ /* LPFX1 */
> +#define INAT_PFX_REPE	2	/* 0xF3 */ /* LPFX2 */
> +#define INAT_PFX_REPNE	3	/* 0xF2 */ /* LPFX3 */

What does "last" here mean? There's nothing in the architecture
requiring the other possible prefixes to come first.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 13:33 ` [PATCH RFC 2/9] xen: Optimize introspection access to guest state Razvan Cojocaru
@ 2014-07-02 15:31   ` Andrew Cooper
  2014-07-07 14:50     ` Razvan Cojocaru
  2014-07-10  8:05     ` Razvan Cojocaru
  2014-07-02 15:37   ` Jan Beulich
  1 sibling, 2 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 15:31 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 02/07/14 14:33, Razvan Cojocaru wrote:
> Speed optimization for introspection purposes: a handful of registers
> are sent along with each mem_event. This requires enlargement of the
> mem_event_request / mem_event_response structures, and additional code
> to fill in relevant values.
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

The public API already has struct hvm_hw_cpu in
xen/include/public/arch-x86/hvm/save.h

It might be better to reuse that rather than defining a new structure to
contain a subset of the information.

> ---
>  xen/arch/x86/hvm/hvm.c                 |   33 +++++++++++++++++
>  xen/arch/x86/hvm/vmx/vmx.c             |    1 +
>  xen/arch/x86/mm/p2m.c                  |   61 ++++++++++++++++++++++++++++++++
>  xen/include/public/arch-x86/hvm/save.h |    4 +++
>  xen/include/public/mem_event.h         |   36 +++++++++++++++++++
>  5 files changed, 135 insertions(+)
>
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 17ff011..f65a5f5 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -6016,6 +6016,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
>      return rc;
>  }
>  
> +static inline void hvm_mem_event_fill_regs(mem_event_request_t *req)
> +{
> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    struct vcpu *v = current;
> +
> +    req->regs.rax = regs->eax;
> +    req->regs.rcx = regs->ecx;
> +    req->regs.rdx = regs->edx;
> +    req->regs.rbx = regs->ebx;
> +    req->regs.rsp = regs->esp;
> +    req->regs.rbp = regs->ebp;
> +    req->regs.rsi = regs->esi;
> +    req->regs.rdi = regs->edi;
> +
> +    req->regs.r8  = regs->r8;
> +    req->regs.r9  = regs->r9;
> +    req->regs.r10 = regs->r10;
> +    req->regs.r11 = regs->r11;
> +    req->regs.r12 = regs->r12;
> +    req->regs.r13 = regs->r13;
> +    req->regs.r14 = regs->r14;
> +    req->regs.r15 = regs->r15;
> +
> +    req->regs.rflags = regs->eflags;
> +    req->regs.rip    = regs->eip;
> +
> +    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
> +    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
> +    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
> +    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
> +}
> +
>  static int hvm_memory_event_traps(long p, uint32_t reason,
>                                    unsigned long value, unsigned long old, 
>                                    bool_t gla_valid, unsigned long gla) 
> @@ -6060,6 +6092,7 @@ static int hvm_memory_event_traps(long p, uint32_t reason,
>          req.gla = old;
>      }
>      
> +    hvm_mem_event_fill_regs(&req);
>      mem_event_put_request(d, &d->mem_event->access, &req);
>      
>      return 1;
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 2caa04a..fed21b6 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>  
>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
> +    c->guest_x86_mode = vmx_guest_x86_mode(v);

guest_x86_mode is a simple function of cr0, eflags, efer and the CS
attributes.  It can be calculated by userspace and doesn't need to be
transmitted individually.
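Roughly (sketch only - constant names as used inside Xen, a userspace
consumer would need its own definitions; note the CS attributes are needed
in addition to the three registers):

    static int derive_x86_mode(uint64_t cr0, uint64_t rflags, uint64_t efer,
                               bool_t cs_l, bool_t cs_db)
    {
        if ( !(cr0 & X86_CR0_PE) )
            return 0;                 /* real mode */
        if ( rflags & X86_EFLAGS_VM )
            return 1;                 /* virtual 8086 mode */
        if ( (efer & EFER_LMA) && cs_l )
            return 8;                 /* 64-bit mode */
        return cs_db ? 4 : 2;         /* 32- or 16-bit protected mode */
    }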

>  
>      __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
>      __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 642ec28..93252d9 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1314,6 +1314,64 @@ void p2m_mem_paging_resume(struct domain *d)
>      }
>  }
>  
> +static inline void p2m_mem_event_fill_regs(mem_event_request_t *req)
> +{
> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    struct segment_register seg;
> +    struct hvm_hw_cpu ctxt;
> +    struct vcpu *v = current;
> +
> +    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
> +
> +    /* Architecture-specific vmcs/vmcb bits */
> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +
> +    req->regs.rax = regs->eax;
> +    req->regs.rcx = regs->ecx;
> +    req->regs.rdx = regs->edx;
> +    req->regs.rbx = regs->ebx;
> +    req->regs.rsp = regs->esp;
> +    req->regs.rbp = regs->ebp;
> +    req->regs.rsi = regs->esi;
> +    req->regs.rdi = regs->edi;
> +
> +#ifdef __x86_64__

There is no need to code for __i386__ inside xen/arch/x86

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-02 13:33 ` [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
@ 2014-07-02 15:35   ` Andrew Cooper
  2014-07-02 15:43     ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 15:35 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 02/07/14 14:33, Razvan Cojocaru wrote:
> Vmx_disable_intercept_for_msr() will now refuse to disable interception of
> MSRs needed by the memory introspection library.
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> ---
>  xen/arch/x86/hvm/vmx/vmcs.c |   19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)
>
> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> index 8ffc562..eb3f030 100644
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>      if ( msr_bitmap == NULL )
>          return;
>  
> +    /* Filter out MSR-s needed by the memory introspection engine */
> +    switch ( msr )
> +    {
> +    case MSR_IA32_SYSENTER_EIP:
> +    case MSR_IA32_SYSENTER_ESP:
> +    case MSR_IA32_SYSENTER_CS:
> +    case MSR_IA32_MC0_CTL:
> +    case MSR_STAR:
> +    case MSR_LSTAR:
> +

Given the performance implications of forcing interception of these
MSRs, this should be gated on mem_access being active for the domain.
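I.e. something like (untested sketch - 'introspection_enabled' being a
hypothetical per-domain flag set when a mem_access listener attaches):

    /* Only veto disabling interception when introspection is in use. */
    if ( v->domain->arch.hvm_domain.introspection_enabled )
    {
        switch ( msr )
        {
        case MSR_IA32_SYSENTER_EIP:
        case MSR_IA32_SYSENTER_ESP:
        case MSR_IA32_SYSENTER_CS:
        case MSR_IA32_MC0_CTL:
        case MSR_STAR:
        case MSR_LSTAR:
            /* Keep intercepting these for the introspection engine. */
            return;
        }
    }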

> +        printk("Warning: cannot disable the interception of MSR "
> +            "0x%08x because it is needed by the memory introspection "
> +            "engine\n", msr);
> +        return;

gdprintk() please, and a rather shorter message.

~Andrew

> +
> +    default:
> +        break;
> +    }
> +
>      /*
>       * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
>       * have the write-low and read-high bitmap offsets the wrong way round.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly
  2014-07-02 13:33 ` [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly Razvan Cojocaru
@ 2014-07-02 15:37   ` Andrew Cooper
  0 siblings, 0 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 15:37 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: Ian Jackson, tim, Ian Campbell

On 02/07/14 14:33, Razvan Cojocaru wrote:
> Moved an enum definition before the typedef that uses it.
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

However, you should CC the tools maintainers for tools patches.  I have
done on this email.

> ---
>  tools/libxc/xenctrl.h |   19 ++++++++++---------
>  1 file changed, 10 insertions(+), 9 deletions(-)
>
> diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
> index af6f249..abd8947 100644
> --- a/tools/libxc/xenctrl.h
> +++ b/tools/libxc/xenctrl.h
> @@ -119,6 +119,16 @@ typedef struct xc_interface_core xc_interface;
>  typedef struct xc_interface_core xc_evtchn;
>  typedef struct xc_interface_core xc_gnttab;
>  typedef struct xc_interface_core xc_gntshr;
> +
> +enum xc_error_code {
> +  XC_ERROR_NONE = 0,
> +  XC_INTERNAL_ERROR = 1,
> +  XC_INVALID_KERNEL = 2,
> +  XC_INVALID_PARAM = 3,
> +  XC_OUT_OF_MEMORY = 4,
> +  /* new codes need to be added to xc_error_level_to_desc too */
> +};
> +
>  typedef enum xc_error_code xc_error_code;
>  
>  
> @@ -1766,15 +1776,6 @@ int xc_hvm_inject_trap(
>   */
>  
>  
> -enum xc_error_code {
> -  XC_ERROR_NONE = 0,
> -  XC_INTERNAL_ERROR = 1,
> -  XC_INVALID_KERNEL = 2,
> -  XC_INVALID_PARAM = 3,
> -  XC_OUT_OF_MEMORY = 4,
> -  /* new codes need to be added to xc_error_level_to_desc too */
> -};
> -
>  #define XC_MAX_ERROR_MSG_LEN 1024
>  typedef struct xc_error {
>    enum xc_error_code code;

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 13:33 ` [PATCH RFC 2/9] xen: Optimize introspection access to guest state Razvan Cojocaru
  2014-07-02 15:31   ` Andrew Cooper
@ 2014-07-02 15:37   ` Jan Beulich
  2014-07-03  8:12     ` Razvan Cojocaru
  1 sibling, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:37 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> Speed optimization for introspection purposes: a handful of registers
> are sent along with each mem_event. This requires enlargement of the
> mem_event_request / mem_event_response structures, and additional code
> to fill in relevant values.

First of all I wonder whether all of the interface changes are really
permissible compatibility-wise.

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -6016,6 +6016,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
>      return rc;
>  }
>  
> +static inline void hvm_mem_event_fill_regs(mem_event_request_t *req)
> +{
> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    struct vcpu *v = current;
> +
> +    req->regs.rax = regs->eax;
> +    req->regs.rcx = regs->ecx;
> +    req->regs.rdx = regs->edx;
> +    req->regs.rbx = regs->ebx;
> +    req->regs.rsp = regs->esp;
> +    req->regs.rbp = regs->ebp;
> +    req->regs.rsi = regs->esi;
> +    req->regs.rdi = regs->edi;
> +
> +    req->regs.r8  = regs->r8;
> +    req->regs.r9  = regs->r9;
> +    req->regs.r10 = regs->r10;
> +    req->regs.r11 = regs->r11;
> +    req->regs.r12 = regs->r12;
> +    req->regs.r13 = regs->r13;
> +    req->regs.r14 = regs->r14;
> +    req->regs.r15 = regs->r15;
> +
> +    req->regs.rflags = regs->eflags;
> +    req->regs.rip    = regs->eip;
> +
> +    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
> +    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
> +    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
> +    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
> +}

This fills far fewer fields than the p2m function further down.
Why?

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>  
>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
> +    c->guest_x86_mode = vmx_guest_x86_mode(v);

This seems unrelated and/or lacking an SVM counterpart.

> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -1314,6 +1314,64 @@ void p2m_mem_paging_resume(struct domain *d)
>      }
>  }
>  
> +static inline void p2m_mem_event_fill_regs(mem_event_request_t *req)
> +{
> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
> +    struct segment_register seg;
> +    struct hvm_hw_cpu ctxt;
> +    struct vcpu *v = current;
> +
> +    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
> +
> +    /* Architecture-specific vmcs/vmcb bits */
> +    hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +
> +    req->regs.rax = regs->eax;
> +    req->regs.rcx = regs->ecx;
> +    req->regs.rdx = regs->edx;
> +    req->regs.rbx = regs->ebx;
> +    req->regs.rsp = regs->esp;
> +    req->regs.rbp = regs->ebp;
> +    req->regs.rsi = regs->esi;
> +    req->regs.rdi = regs->edi;
> +
> +#ifdef __x86_64__

You don't need this anymore.

> +    req->regs.r8  = regs->r8;
> +    req->regs.r9  = regs->r9;
> +    req->regs.r10 = regs->r10;
> +    req->regs.r11 = regs->r11;
> +    req->regs.r12 = regs->r12;
> +    req->regs.r13 = regs->r13;
> +    req->regs.r14 = regs->r14;
> +    req->regs.r15 = regs->r15;
> +#endif
> +
> +    req->regs.rflags = regs->eflags;
> +    req->regs.rip    = regs->eip;
> +
> +    req->regs.dr7 = v->arch.debugreg[7];
> +    req->regs.cr0 = ctxt.cr0;
> +    req->regs.cr2 = ctxt.cr2;
> +    req->regs.cr3 = ctxt.cr3;
> +    req->regs.cr4 = ctxt.cr4;
> +
> +    req->regs.sysenter_cs = ctxt.sysenter_cs;
> +    req->regs.sysenter_esp = ctxt.sysenter_esp;
> +    req->regs.sysenter_eip = ctxt.sysenter_eip;
> +
> +    req->regs.msr_efer = ctxt.msr_efer;
> +    req->regs.msr_star = ctxt.msr_star;
> +    req->regs.msr_lstar = ctxt.msr_lstar;
> +
> +    hvm_get_segment_register(v, x86_seg_fs, &seg);
> +    req->regs.fs_base = seg.base;
> +
> +    hvm_get_segment_register(v, x86_seg_gs, &seg);
> +    req->regs.gs_base = seg.base;

These two segment bases may be sufficient to describe x86-64 state,
but what about a guest in 16- or 32-bit mode?
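For those the selectors, limits and attributes matter at least as much as
the bases - something along these lines (sketch; the req->regs.cs_* fields
are made up here) would be needed for at least CS and SS:

    hvm_get_segment_register(v, x86_seg_cs, &seg);
    req->regs.cs_sel     = seg.sel;
    req->regs.cs_base    = seg.base;
    req->regs.cs_limit   = seg.limit;
    req->regs.cs_arbytes = seg.attr.bytes;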

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 15:21 ` Jan Beulich
@ 2014-07-02 15:43   ` Razvan Cojocaru
  2014-07-02 16:08     ` Jan Beulich
  2014-07-03  7:38   ` Razvan Cojocaru
  1 sibling, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 15:43 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 06:21 PM, Jan Beulich wrote:
>> +    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)
> 
> We don't mark unused function arguments like this (and if we did,
> we'd want you to use __maybe_unused).

OK, thanks. What's the proper way to mark them? Should I go with
__maybe_unused then?

>> +int hvm_emulate_one_no_write(
>> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
>> +{
> 
> This must be pretty redundant with hvm_emulate_one(), and hence
> most if not all of the redundancy should be factored out.

Will do.

>> +void hvm_emulate_one_full(bool_t nowrite)
>> +{
>> +    struct hvm_emulate_ctxt ctx[1] = {};
>> +    int rc = X86EMUL_RETRY;
>> +
>> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
>> +
>> +    while ( rc == X86EMUL_RETRY )
>> +    {
>> +        if ( nowrite )
>> +            rc = hvm_emulate_one_no_write(ctx);
>> +        else
>> +            rc = hvm_emulate_one(ctx);
>> +    }
>> +
>> +    switch ( rc )
>> +    {
>> +    case X86EMUL_UNHANDLEABLE:
>> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> 
> Is it certain that #UD is always the right exception here?

For our purposes, yes.

>> +int hvm_get_insn_length(
>> +    struct hvm_emulate_ctxt *hvmemul_ctxt)
>> +{
> 
> There again looks to be quite a bit of redundancy here. Please let's
> avoid having n copies of (almost) the same code.

Will factor it out.

>> --- /dev/null
>> +++ b/xen/arch/x86/inat-tables.c
> 
> I'm not going to look at this in much detail, just a couple of general
> notes:
> - at least some of the information is redundant with the full x86
>   emulator; as before redundancy should be avoided
> - many if not all of the arrays here appear to only be used locally,
>   and hence ought to be static
> - some of the tables are extremely sparse (take
>   inat_escape_table_3_3[] as an example); would be nice to collapse
>   those
> - making future changes/additions to these tables is going to be
>   pretty hard with them having neither suitable names nor comments
> - coding style (also elsewhere) seems to be Linux'es, yet you don't
>   mention in the description that they come from Linux (and if they
>   don't you'd be asked to convert them to Xen style)

Yes, the files do come from Linux, hence the coding style and the rest
of the issues. We've left them as they are on purpose, to try to reflect
that. I was under the impression that at least some of them say so at
the beginning of the file.

I'll make sure to mention this explicitly in subsequent versions of the series.

Of course, that means that I can't really explain what the original
author intended (related to the rest of your critique).


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-02 15:35   ` Andrew Cooper
@ 2014-07-02 15:43     ` Jan Beulich
  2014-07-09  8:02       ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:43 UTC (permalink / raw)
  To: Razvan Cojocaru, Andrew Cooper, xen-devel; +Cc: tim

>>> On 02.07.14 at 17:35, <andrew.cooper3@citrix.com> wrote:
> On 02/07/14 14:33, Razvan Cojocaru wrote:
>> @@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>>      if ( msr_bitmap == NULL )
>>          return;
>>  
>> +    /* Filter out MSR-s needed by the memory introspection engine */
>> +    switch ( msr )
>> +    {
>> +    case MSR_IA32_SYSENTER_EIP:
>> +    case MSR_IA32_SYSENTER_ESP:
>> +    case MSR_IA32_SYSENTER_CS:
>> +    case MSR_IA32_MC0_CTL:
>> +    case MSR_STAR:
>> +    case MSR_LSTAR:
>> +
> 
> Given the performance implications of forcing interception of these
> MSRs, this should be gated on mem_access being active for the domain.

Absolutely.

>> +        printk("Warning: cannot disable the interception of MSR "
>> +            "0x%08x because it is needed by the memory introspection "
>> +            "engine\n", msr);
>> +        return;
> 
> gdprintk() please, and a rather shorter message.

Not sure about gdprintk() - we neither need the file/line to be printed
here, nor am I sure that v == current (or else the dom/vcpu printed
would be wrong), but this should clearly be XENLOG_DEBUG and
abbreviated as much as possible without making it meaningless.
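E.g. (sketch):

    printk(XENLOG_DEBUG "d%d: not disabling interception of MSR %#x\n",
           v->domain->domain_id, msr);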

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 13:33 ` [PATCH RFC 5/9] xen: Support for VMCALL mem_events Razvan Cojocaru
@ 2014-07-02 15:47   ` Jan Beulich
  2014-07-02 15:54     ` Razvan Cojocaru
  2014-07-02 15:54   ` Andrew Cooper
  1 sibling, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:47 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>  xen/arch/x86/hvm/hvm.c          |    8 ++++++++
>  xen/arch/x86/hvm/vmx/vmx.c      |   15 ++++++++++++++-

This is obviously again missing the SVM side.

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      case EXIT_REASON_VMCALL:
>      {
>          int rc;
> +        unsigned long eax = regs->eax;
> +
>          HVMTRACE_1D(VMMCALL, regs->eax);
> -        rc = hvm_do_hypercall(regs);
> +
> +        if ( regs->eax != 0x494e5452 ) /* Introcore magic */

Urgh?!

> --- a/xen/include/public/hvm/params.h
> +++ b/xen/include/public/hvm/params.h
> @@ -148,6 +148,8 @@
>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>  
> -#define HVM_NR_PARAMS          34
> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34

So why does this (used only as an argument to
hvm_memory_event_traps()) need to be settable? I guess the patch
description is just too brief.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 13:33 ` [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
@ 2014-07-02 15:51   ` Jan Beulich
  2014-07-02 16:00     ` Andrew Cooper
  2014-07-02 16:06     ` Razvan Cojocaru
  0 siblings, 2 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:51 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
> new xc_domain_set_pagefault_info() function to set per-domain page
> fault injection information. This information is then used to call
> hvm_inject_page_fault() at the first VMENTRY where the guest status
> matches and there are no other pending traps.

So the first question that strikes me here: What good can it do to be
able to inject arbitrary page faults, possibly at times where the guest
OS is absolutely not expecting them?

> @@ -430,6 +431,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>      __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
>      __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
>      __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
> +    __vmread(GUEST_CS_AR_BYTES, &cs_arbytes);
> +
> +    c->cs_arbytes = (uint32_t)cs_arbytes;

This again looks like an unrelated change without any explanation.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 13:33 ` [PATCH RFC 5/9] xen: Support for VMCALL mem_events Razvan Cojocaru
  2014-07-02 15:47   ` Jan Beulich
@ 2014-07-02 15:54   ` Andrew Cooper
  2014-07-02 15:59     ` Razvan Cojocaru
  1 sibling, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 15:54 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 02/07/14 14:33, Razvan Cojocaru wrote:
> Added support for VMCALL events (the memory introspection library
> will have the guest trigger VMCALLs, which will then be sent along
> via the mem_event mechanism).
>
> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>

Am I correct in concluding that this is an escape mechanism for
something inside the guest to trap to the toolstack userspace monitoring
the guest?

> ---
>  xen/arch/x86/hvm/hvm.c          |    8 ++++++++
>  xen/arch/x86/hvm/vmx/vmx.c      |   15 ++++++++++++++-
>  xen/include/asm-x86/hvm/hvm.h   |    1 +
>  xen/include/public/hvm/params.h |    4 +++-
>  xen/include/public/mem_event.h  |    1 +
>  5 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index f65a5f5..df696d1 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -6130,6 +6130,14 @@ void hvm_memory_event_msr(unsigned long msr, unsigned long value)
>                             value, ~value, 1, msr);
>  }
>  
> +void hvm_memory_event_vmcall(unsigned long rip, unsigned long eax)
> +{
> +    hvm_memory_event_traps(current->domain->arch.hvm_domain
> +                             .params[HVM_PARAM_MEMORY_EVENT_VMCALL],
> +                           MEM_EVENT_REASON_VMCALL,
> +                           rip, ~rip, 1, eax);
> +}
> +
>  int hvm_memory_event_int3(unsigned long gla) 
>  {
>      uint32_t pfec = PFEC_page_present;
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index fed21b6..b4c12cd 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      case EXIT_REASON_VMCALL:
>      {
>          int rc;
> +        unsigned long eax = regs->eax;
> +
>          HVMTRACE_1D(VMMCALL, regs->eax);
> -        rc = hvm_do_hypercall(regs);
> +
> +        if ( regs->eax != 0x494e5452 ) /* Introcore magic */

This needs to live somewhere in the public API.

> +        {
> +            rc = hvm_do_hypercall(regs);
> +        }
> +        else
> +        {
> +            hvm_memory_event_vmcall(guest_cpu_user_regs()->eip, eax);
> +            update_guest_eip();
> +            break;
> +        }
> +
>          if ( rc != HVM_HCALL_preempted )
>          {
>              update_guest_eip(); /* Safe: VMCALL */
> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
> index 90e69f5..67e365b 100644
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -532,6 +532,7 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old);
>  void hvm_memory_event_cr3(unsigned long value, unsigned long old);
>  void hvm_memory_event_cr4(unsigned long value, unsigned long old);
>  void hvm_memory_event_msr(unsigned long msr, unsigned long value);
> +void hvm_memory_event_vmcall(unsigned long rip, unsigned long eax);
>  /* Called for current VCPU on int3: returns -1 if no listener */
>  int hvm_memory_event_int3(unsigned long gla);
>  
> diff --git a/xen/include/public/hvm/params.h b/xen/include/public/hvm/params.h
> index f830bdd..ea2eee6 100644
> --- a/xen/include/public/hvm/params.h
> +++ b/xen/include/public/hvm/params.h
> @@ -148,6 +148,8 @@
>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>  
> -#define HVM_NR_PARAMS          34
> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
> +
> +#define HVM_NR_PARAMS          35

Nothing prevents the VM from writing whatever value it wishes into this
hvmparam using the setparam hypercall, which would look to stump dom0
userspace trusting the value it finds there.

The current infrastructure for hvmparams is far too lax and I have half
a patch series (which is distinctly low down my todo list in terms of
priority) which tries to tighten the restrictions.  However in the
meantime all new params should have proper restrictions applied.

Having said that, this would appear to interact badly with mem_events in
PV guests, which have recently (or are planning to?) moved away from
being HVM specific.  It would be preferable not to impede that.

~Andrew

>  
>  #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
> diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
> index 24ac67d..5fa2858 100644
> --- a/xen/include/public/mem_event.h
> +++ b/xen/include/public/mem_event.h
> @@ -47,6 +47,7 @@
>  #define MEM_EVENT_REASON_SINGLESTEP  6    /* single step was invoked: gla/gfn are RIP */
>  #define MEM_EVENT_REASON_MSR         7    /* MSR was hit: gfn is MSR value, gla is MSR address;
>                                               does NOT honour HVMPME_onchangeonly */
> +#define MEM_EVENT_REASON_VMCALL      8    /* VMCALL: gfn is RIP, gla is EAX */
>  
>  typedef struct mem_event_regs_st {
>      uint64_t rax;

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 15:47   ` Jan Beulich
@ 2014-07-02 15:54     ` Razvan Cojocaru
  2014-07-02 16:11       ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 15:54 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

>>  xen/arch/x86/hvm/hvm.c          |    8 ++++++++
>>  xen/arch/x86/hvm/vmx/vmx.c      |   15 ++++++++++++++-
> 
> This is obviously again missing the SVM side.

We've done our development for VMX, that's correct.

>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>>      case EXIT_REASON_VMCALL:
>>      {
>>          int rc;
>> +        unsigned long eax = regs->eax;
>> +
>>          HVMTRACE_1D(VMMCALL, regs->eax);
>> -        rc = hvm_do_hypercall(regs);
>> +
>> +        if ( regs->eax != 0x494e5452 ) /* Introcore magic */
> 
> Urgh?!

The magic constant is INTR, and it's used to differentiate between
"regular" and induced VMCALLs. Our application sets EAX up like that to
tell the situations apart.
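
For illustration only (the macro name below is just a placeholder, not
something the patch currently defines), the check from the quoted hunk
would then read along these lines:

    /* Placeholder name; 0x494e5452 is ASCII "INTR". */
    #define MEM_EVENT_VMCALL_MAGIC 0x494e5452U

    /* Inside the EXIT_REASON_VMCALL case: */
    if ( regs->eax == MEM_EVENT_VMCALL_MAGIC )
    {
        /* Introspection-induced VMCALL: forward it as a mem_event. */
        hvm_memory_event_vmcall(guest_cpu_user_regs()->eip, regs->eax);
        update_guest_eip();
        break;
    }

    rc = hvm_do_hypercall(regs);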

>> --- a/xen/include/public/hvm/params.h
>> +++ b/xen/include/public/hvm/params.h
>> @@ -148,6 +148,8 @@
>>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>>  
>> -#define HVM_NR_PARAMS          34
>> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
> 
> So why does this (used only as an argument to
> hvm_memory_event_traps()) need to be settable? I guess the patch
> description is just too brief.

Settable?


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-02 13:33 ` [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
@ 2014-07-02 15:56   ` Jan Beulich
  2014-07-03  8:55     ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 15:56 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> In a scenario where a page fault that triggered a mem_event occurred,
> p2m_mem_access_check() will now be able to either 1) emulate the
> current instruction, 2) skip the current instruction, or 3) emulate
> it, but don't allow it to perform any writes. Since some SSE2
> instructions are problematic to emulate (Firefox uses some),
> support for setting the A and D (accessed and dirty) bits has been
> added (please see p2m_set_ad_bits()).

Sadly that reference is useless - the function doesn't have any
explanation what all this is about either.

> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1372,6 +1372,16 @@ void hvm_emulate_one_full(bool_t nowrite)
>      switch ( rc )
>      {
>      case X86EMUL_UNHANDLEABLE:
> +        printk("Emulation failed @ %04x:%lx: "
> +               "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
> +               hvmemul_get_seg_reg(x86_seg_cs, ctx)->sel,
> +               ctx->insn_buf_eip,
> +               ctx->insn_buf[0], ctx->insn_buf[1],
> +               ctx->insn_buf[2], ctx->insn_buf[3],
> +               ctx->insn_buf[4], ctx->insn_buf[5],
> +               ctx->insn_buf[6], ctx->insn_buf[7],
> +               ctx->insn_buf[8], ctx->insn_buf[9]);
> +
>          hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
>          break;

Since this is non-fatal to the guest, this must get its log level lowered.

> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -121,6 +121,9 @@ struct vcpu *alloc_vcpu(
>      v->domain = d;
>      v->vcpu_id = vcpu_id;
>  
> +    v->sse_pg_dirty.eip = 0;
> +    v->sse_pg_dirty.gla = 0;
> +

All fields start out as zero already. And this is hardly arch-independent
code.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 15:54   ` Andrew Cooper
@ 2014-07-02 15:59     ` Razvan Cojocaru
  0 siblings, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 15:59 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/02/2014 06:54 PM, Andrew Cooper wrote:
> On 02/07/14 14:33, Razvan Cojocaru wrote:
>> Added support for VMCALL events (the memory introspection library
>> will have the guest trigger VMCALLs, which will then be sent along
>> via the mem_event mechanism).
>>
>> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> 
> Am I correct in concluding that this is an escape mechanism for
> something inside the guest to trap to the toolstack userspace monitoring
> the guest?

Yes.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 15:51   ` Jan Beulich
@ 2014-07-02 16:00     ` Andrew Cooper
  2014-07-02 16:58       ` Mihai Donțu
  2014-07-02 16:06     ` Razvan Cojocaru
  1 sibling, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 16:00 UTC (permalink / raw)
  To: Jan Beulich, Razvan Cojocaru; +Cc: tim, xen-devel

On 02/07/14 16:51, Jan Beulich wrote:
>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
>> new xc_domain_set_pagefault_info() function to set per-domain page
>> fault injection information. This information is then used to call
>> hvm_inject_page_fault() at the first VMENTRY where the guest status
>> matches and there are no other pending traps.
> So the first question that strikes me here: What good can it do to be
> able to inject arbitrary page faults, possibly at times where the guest
> OS is absolutely not expecting them?

I would further this by suggesting that the only plausible case where
you could inject a pagefault is as a reply to a mem_event for which the
guest is already paused.

In which case it would be better implemented as part of the mem_event
protocol than as a plain hypercall in its own right.
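
(As a purely illustrative sketch of that direction - the flag name, the
bit value and the reuse of the gla field are invented for the example,
not existing interface - the listener could then request the injection
in its reply to an event for which the vcpu is already paused:)

    /* Hypothetical addition to the mem_event interface. */
    #define MEM_EVENT_FLAG_INJECT_PF  (1 << 7)  /* bit chosen arbitrarily */

    /* Listener side, while filling in the mem_event_response_t: */
    rsp.flags |= MEM_EVENT_FLAG_INJECT_PF;
    rsp.gla    = va_to_fault_in;  /* listener's own variable */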

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults
  2014-07-02 13:34 ` [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults Razvan Cojocaru
@ 2014-07-02 16:04   ` Andrew Cooper
  0 siblings, 0 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 16:04 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: Andrei Lutas, tim

On 02/07/14 14:34, Razvan Cojocaru wrote:
> The Xen emulator is incapable of handling some instructions, which
> leads to the injection of an Invalid Opcode exception (#UD) inside
> the guest once an unsupported instruction is encountered.

This is (as far as I am aware) simply because Xen's current emulation
only expects to emulate instructions which trapped for MMIO or fault
reasons.

There is no conceptual problem with extending Xen's emulation code, as
we are now expecting to have to emulate instructions which would
otherwise never have hit x86_emulate.

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 15:51   ` Jan Beulich
  2014-07-02 16:00     ` Andrew Cooper
@ 2014-07-02 16:06     ` Razvan Cojocaru
  2014-07-02 16:13       ` Jan Beulich
  1 sibling, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 16:06 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 06:51 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
>> new xc_domain_set_pagefault_info() function to set per-domain page
>> fault injection information. This information is then used to call
>> hvm_inject_page_fault() at the first VMENTRY where the guest status
>> matches and there are no other pending traps.
> 
> So the first question that strikes me here: What good can it do to be
> able to inject arbitrary page faults, possibly at times where the guest
> OS is absolutely not expecting them?

The guest, as Andrew Cooper said, is waiting for a mem_event reply.

>> @@ -430,6 +431,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>      __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
>>      __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
>>      __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
>> +    __vmread(GUEST_CS_AR_BYTES, &cs_arbytes);
>> +
>> +    c->cs_arbytes = (uint32_t)cs_arbytes;
> 
> This again looks like an unrelated change without any explanation.

It's used here, to check if we're in user mode before injecting the page
fault:

+static void check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct hvm_hw_cpu ctxt;
+    uint32_t cs_dpl;
+
+    if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 )
+        return;
+
+    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+
+    cs_dpl = (ctxt.cs_arbytes >> 5) & 3;
+
+    if ( cs_dpl == 3 /* Guest is in user mode */
+         && !ctxt.pending_event
+         && ctxt.cr3 == d->fault_info.address_space )
+    {
+        /* Cache */
+        uint64_t virtual_address = d->fault_info.virtual_address;
+        uint32_t write_access = d->fault_info.write_access;
+
+        /* Reset */
+        d->fault_info.address_space = 0;
+        d->fault_info.virtual_address = 0;
+        d->fault_info.write_access = 0;
+
+        hvm_inject_page_fault((write_access << 1) | PFEC_user_mode,
+            virtual_address);
+    }
+}


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 15:43   ` Razvan Cojocaru
@ 2014-07-02 16:08     ` Jan Beulich
  2014-07-02 16:18       ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 16:08 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 17:43, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:21 PM, Jan Beulich wrote:
>>> +    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)
>> 
>> We don't mark unused function arguments like this (and if we did,
>> we'd want you to use __maybe_unused).
> 
> OK, thanks. What's the proper way to mark them? Should I go with
> __maybe_unused then?

No - don't mark them.

>>> +void hvm_emulate_one_full(bool_t nowrite)
>>> +{
>>> +    struct hvm_emulate_ctxt ctx[1] = {};
>>> +    int rc = X86EMUL_RETRY;
>>> +
>>> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
>>> +
>>> +    while ( rc == X86EMUL_RETRY )
>>> +    {
>>> +        if ( nowrite )
>>> +            rc = hvm_emulate_one_no_write(ctx);
>>> +        else
>>> +            rc = hvm_emulate_one(ctx);
>>> +    }
>>> +
>>> +    switch ( rc )
>>> +    {
>>> +    case X86EMUL_UNHANDLEABLE:
>>> +        hvm_inject_hw_exception(TRAP_invalid_op, 
> HVM_DELIVER_NO_ERROR_CODE);
>> 
>> Is it certain that #UD is always the right exception here?
> 
> For our purposes, yes.

That's not really a good answer for code that isn't there to only suit
you.

> Of course, that means that I can't really explain what the original
> author intended (related to the rest of your critique).

Which clearly won't help acceptance of this code, assuming that's
your goal.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 15:54     ` Razvan Cojocaru
@ 2014-07-02 16:11       ` Jan Beulich
  2014-07-02 16:23         ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 16:11 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 17:54, <rcojocaru@bitdefender.com> wrote:
>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>>>      case EXIT_REASON_VMCALL:
>>>      {
>>>          int rc;
>>> +        unsigned long eax = regs->eax;
>>> +
>>>          HVMTRACE_1D(VMMCALL, regs->eax);
>>> -        rc = hvm_do_hypercall(regs);
>>> +
>>> +        if ( regs->eax != 0x494e5452 ) /* Introcore magic */
>> 
>> Urgh?!
> 
> The magic constant is INTR, and it's used to differentiate between
> "regular" and induced VMCALLs. Our application sets EAX up like that to
> tell the situations apart.

But that needs (a) a #define and (b) an explanation.

Plus - what keeps code outside of your app from invoking this?

>>> --- a/xen/include/public/hvm/params.h
>>> +++ b/xen/include/public/hvm/params.h
>>> @@ -148,6 +148,8 @@
>>>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>>>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>>>  
>>> -#define HVM_NR_PARAMS          34
>>> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
>> 
>> So why does this (used only as an argument to
>> hvm_memory_event_traps()) need to be settable? I guess the patch
>> description is just too brief.
> 
> Settable?

You must have a reason to make this a HVM param. That reason is
what I'm asking for.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 16:06     ` Razvan Cojocaru
@ 2014-07-02 16:13       ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-02 16:13 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 18:06, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:51 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
>>> new xc_domain_set_pagefault_info() function to set per-domain page
>>> fault injection information. This information is then used to call
>>> hvm_inject_page_fault() at the first VMENTRY where the guest status
>>> matches and there are no other pending traps.
>> 
>> So the first question that strikes me here: What good can it do to be
>> able to inject arbitrary page faults, possibly at times where the guest
>> OS is absolutely not expecting them?
> 
> The guest, as Andrew Cooper said, is waiting for a mem_event reply.
> 
>>> @@ -430,6 +431,9 @@ static void vmx_vmcs_save(struct vcpu *v, struct 
> hvm_hw_cpu *c)
>>>      __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
>>>      __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
>>>      __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
>>> +    __vmread(GUEST_CS_AR_BYTES, &cs_arbytes);
>>> +
>>> +    c->cs_arbytes = (uint32_t)cs_arbytes;
>> 
>> This again looks like an unrelated change without any explanation.
> 
> It's used here, to check if we're in user mode before injecting the page
> fault:

Okay.

> +static void check_pf_injection(void)
> +{
> +    struct vcpu *curr = current;
> +    struct domain *d = curr->domain;
> +    struct hvm_hw_cpu ctxt;
> +    uint32_t cs_dpl;
> +
> +    if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 )
> +        return;
> +
> +    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
> +    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
> +
> +    cs_dpl = (ctxt.cs_arbytes >> 5) & 3;
> +
> +    if ( cs_dpl == 3 /* Guest is in user mode */

Which is yet another example of trying to determine the CPL by
looking at CS.DPL - SS.DPL is the canonical value for that.
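
A minimal sketch of that, assuming the usual hvm_get_segment_register()
interface and given only to illustrate the point:

    struct segment_register ss;
    unsigned int cpl;

    /* SS.DPL is the architecturally defined CPL. */
    hvm_get_segment_register(curr, x86_seg_ss, &ss);
    cpl = ss.attr.fields.dpl;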

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 16:08     ` Jan Beulich
@ 2014-07-02 16:18       ` Razvan Cojocaru
  2014-07-03  6:24         ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 16:18 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 07:08 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 17:43, <rcojocaru@bitdefender.com> wrote:
>> On 07/02/2014 06:21 PM, Jan Beulich wrote:
>>>> +    struct x86_emulate_ctxt __attribute__((unused)) *ctxt)
>>>
>>> We don't mark unused function arguments like this (and if we did,
>>> we'd want you to use __maybe_unused).
>>
>> OK, thanks. What's the proper way to mark them? Should I go with
>> __maybe_unused then?
> 
> No - don't mark them.

Noted.

>>>> +void hvm_emulate_one_full(bool_t nowrite)
>>>> +{
>>>> +    struct hvm_emulate_ctxt ctx[1] = {};
>>>> +    int rc = X86EMUL_RETRY;
>>>> +
>>>> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
>>>> +
>>>> +    while ( rc == X86EMUL_RETRY )
>>>> +    {
>>>> +        if ( nowrite )
>>>> +            rc = hvm_emulate_one_no_write(ctx);
>>>> +        else
>>>> +            rc = hvm_emulate_one(ctx);
>>>> +    }
>>>> +
>>>> +    switch ( rc )
>>>> +    {
>>>> +    case X86EMUL_UNHANDLEABLE:
>>>> +        hvm_inject_hw_exception(TRAP_invalid_op, 
>> HVM_DELIVER_NO_ERROR_CODE);
>>>
>>> Is it certain that #UD is always the right exception here?
>>
>> For our purposes, yes.
> 
> That's not really a good answer for code that isn't there to only suit
> you.

Of course, all I was trying to say is that as far as this function has
been used (only in conjunction with our application), it has always been
the right exception, and I was hoping that somebody might shed some
light on a scenario of possibly problematic uses of it.

>> Of course, that means that I can't really explain what the original
>> author intended (related to the rest of your critique).
> 
> Which clearly won't help acceptance of this code, assuming that's
> your goal.

Obviously acceptance is the goal, but should the Linux code be cleaned
up and handed in, or is it better to try to enhance the emulator as has
been suggested above?


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 16:11       ` Jan Beulich
@ 2014-07-02 16:23         ` Razvan Cojocaru
  2014-07-03  6:28           ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-02 16:23 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 07:11 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 17:54, <rcojocaru@bitdefender.com> wrote:
>>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>>> @@ -2880,8 +2880,21 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>>>>      case EXIT_REASON_VMCALL:
>>>>      {
>>>>          int rc;
>>>> +        unsigned long eax = regs->eax;
>>>> +
>>>>          HVMTRACE_1D(VMMCALL, regs->eax);
>>>> -        rc = hvm_do_hypercall(regs);
>>>> +
>>>> +        if ( regs->eax != 0x494e5452 ) /* Introcore magic */
>>>
>>> Urgh?!
>>
>> The magic constant is INTR, and it's used to differentiate between
>> "regular" and induced VMCALLs. Our application sets EAX up like that to
>> tell the situations apart.
> 
> But that needs (a) a #define and (b) an explanation.
> 
> Plus - what keeps code outside of your app from invoking this?

Unfortunately, nothing. It can be set outside our application.

> 
>>>> --- a/xen/include/public/hvm/params.h
>>>> +++ b/xen/include/public/hvm/params.h
>>>> @@ -148,6 +148,8 @@
>>>>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>>>>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>>>>  
>>>> -#define HVM_NR_PARAMS          34
>>>> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
>>>
>>> So why does this (used only as an argument to
>>> hvm_memory_event_traps()) need to be settable? I guess the patch
>>> description is just too brief.
>>
>> Settable?
> 
> You must have a reason to make this a HVM param. That reason is
> what I'm asking for.

I see. I want to be able to enable / disable this type of event. I.e.:

if (flags & ENABLE_VMCALL)
    xc_set_hvm_param(xci, domain, HVM_PARAM_MEMORY_EVENT_VMCALL,
                     HVMPME_mode_sync);

from the application, via libxc.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 16:00     ` Andrew Cooper
@ 2014-07-02 16:58       ` Mihai Donțu
  2014-07-02 17:07         ` Andrew Cooper
  0 siblings, 1 reply; 64+ messages in thread
From: Mihai Donțu @ 2014-07-02 16:58 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: tim, Razvan Cojocaru, Jan Beulich, xen-devel

On Wed, 2 Jul 2014 17:00:08 +0100 Andrew Cooper wrote:
> On 02/07/14 16:51, Jan Beulich wrote:
> >>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> >> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
> >> new xc_domain_set_pagefault_info() function to set per-domain page
> >> fault injection information. This information is then used to call
> >> hvm_inject_page_fault() at the first VMENTRY where the guest status
> >> matches and there are no other pending traps.
> > So the first question that strikes me here: What good can it do to
> > be able to inject arbitrary page faults, possibly at times where
> > the guest OS is absolutely not expecting them?

I have not yet had the chance to say: thank you all for your review!

There were times when we wanted to get certain information from the
guest but couldn't because it was swapped out. We now handle that
situation by injecting a #PF and then let the OS respond as it would
under a normal circumstance. After the data is brought in, it traps
again into our application and we get what we need, but yes, it
requires deep knowledge about the guest OS in order to do it without
crashing it. It's doable only if you have the means necessary to
inspect its state fully, which is why some of the submitted patches
exist.
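
To make that flow a bit more concrete, the monitoring application would
do something along these lines (a sketch only - the exact prototype of
the proposed xc_domain_set_pagefault_info() is not spelled out here, so
the arguments merely mirror the d->fault_info fields later checked by
check_pf_injection(), and the domid/cr3/va values are made-up examples):

    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    uint32_t domid = 1;              /* example domain id */
    uint64_t cr3 = 0x1ab000;         /* target address space (example) */
    uint64_t va  = 0x7f0000001000;   /* page we want faulted in (example) */

    /* Request a #PF for va once the guest is back in user mode in the
     * matching address space; the guest OS then pages the data in and
     * the subsequent access traps again as a mem_event. */
    xc_domain_set_pagefault_info(xch, domid, cr3, va, 0 /* read access */);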

> I would further this by suggesting that the only plausible case where
> you could inject a pagefault is as a reply to a mem_event for which
> the guest is already paused.
> 
> In which case it it would be better implemented as part of the
> mem_event protocol than a plain hypercall in its own regard.

-- 
Mihai DONȚU

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 16:58       ` Mihai Donțu
@ 2014-07-02 17:07         ` Andrew Cooper
  2014-07-03  8:23           ` Mihai Donțu
  0 siblings, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-02 17:07 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: tim, Razvan Cojocaru, Jan Beulich, xen-devel

On 02/07/14 17:58, Mihai Donțu wrote:
> On Wed, 2 Jul 2014 17:00:08 +0100 Andrew Cooper wrote:
>> On 02/07/14 16:51, Jan Beulich wrote:
>>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>>> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by libxc's
>>>> new xc_domain_set_pagefault_info() function to set per-domain page
>>>> fault injection information. This information is then used to call
>>>> hvm_inject_page_fault() at the first VMENTRY where the guest status
>>>> matches and there are no other pending traps.
>>> So the first question that strikes me here: What good can it do to
>>> be able to inject arbitrary page faults, possibly at times where
>>> the guest OS is absolutely not expecting them?
> I have not yet had the chance to say: thank you all for your review!

No worries - this certainly is an interesting series to consider.

>
> There were times when we wanted to get certain information from the
> guest but couldn't because it was swapped out. We now handle that
> situation by injecting a #PF and then let the OS respond as it would
> under a normal circumstance. After the data is brought in, it traps
> again into our application and we get what we need, but yes, it
> requires deep knowledge about the guest OS in order to do it without
> crashing it. It's doable only if you have the means necessary to
> inspect its state fully, which is why some of the submitted patches
> exist.

What is the threat model here?

It seems to me that the only safe place to organise this is from a
device driver in the guest.

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 16:18       ` Razvan Cojocaru
@ 2014-07-03  6:24         ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  6:24 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 18:18, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 07:08 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 17:43, <rcojocaru@bitdefender.com> wrote:
>>> Of course, that means that I can't really explain what the original
>>> author intended (related to the rest of your critique).
>> 
>> Which clearly won't help acceptance of this code, assuming that's
>> your goal.
> 
> Obviously acceptance is the goal, but should the Linux code be cleaned
> up and handed in, or is it better to try to enhance the emulator as has
> been suggested above?

How you deal with issues pointed out in the code you want to
be accepted is up to you: You could fix it locally and diverge from
Linux from the beginning, but the better route perhaps would be
to have the code it was cloned from fixed first (or at least in parallel).
All of that, of course, leaves aside the much more generic
question of whether to integrate the new functionality with the
existing emulation code instead.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-02 16:23         ` Razvan Cojocaru
@ 2014-07-03  6:28           ` Jan Beulich
  2014-07-03  7:29             ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  6:28 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 02.07.14 at 18:23, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 07:11 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 17:54, <rcojocaru@bitdefender.com> wrote:
>>>>> --- a/xen/include/public/hvm/params.h
>>>>> +++ b/xen/include/public/hvm/params.h
>>>>> @@ -148,6 +148,8 @@
>>>>>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>>>>>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>>>>>  
>>>>> -#define HVM_NR_PARAMS          34
>>>>> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
>>>>
>>>> So why does this (used only as an argument to
>>>> hvm_memory_event_traps()) need to be settable? I guess the patch
>>>> description is just too brief.
>>>
>>> Settable?
>> 
>> You must have a reason to make this a HVM param. That reason is
>> what I'm asking for.
> 
> I see. I want to be able to enable / disable this type of event. I.e.:
> 
> if (flags & ENABLE_VMCALL)
>     xc_set_hvm_param(xci, domain, HVM_PARAM_MEMORY_EVENT_VMCALL,
>                      HVMPME_mode_sync);
> 
> from the application, via libxc.

But hvm_memory_event_vmcall() simply uses the value, whether or
not it got set. And if the receiver of the event has to deal with
instances it didn't enable anyway, then it needs to do filtering
regardless, and hence there's little point in making the exact value
being passed back up configurable.
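
(Purely as an illustration of that point - the listener side names below
are made up for the example - a consumer which didn't ask for these
events would simply skip them while draining the ring:)

    mem_event_request_t req;   /* as taken off the mem_event ring */

    switch ( req.reason )
    {
    case MEM_EVENT_REASON_VMCALL:
        if ( !vmcall_events_wanted )   /* listener's own policy flag */
            break;                     /* ignore unrequested events */
        handle_vmcall_event(&req);     /* listener's own handler */
        break;
    }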

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 5/9] xen: Support for VMCALL mem_events
  2014-07-03  6:28           ` Jan Beulich
@ 2014-07-03  7:29             ` Razvan Cojocaru
  0 siblings, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  7:29 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/03/2014 09:28 AM, Jan Beulich wrote:
>>>> On 02.07.14 at 18:23, <rcojocaru@bitdefender.com> wrote:
>> On 07/02/2014 07:11 PM, Jan Beulich wrote:
>>>>>> On 02.07.14 at 17:54, <rcojocaru@bitdefender.com> wrote:
>>>>>> --- a/xen/include/public/hvm/params.h
>>>>>> +++ b/xen/include/public/hvm/params.h
>>>>>> @@ -148,6 +148,8 @@
>>>>>>  #define HVM_PARAM_IOREQ_SERVER_PFN 32
>>>>>>  #define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
>>>>>>  
>>>>>> -#define HVM_NR_PARAMS          34
>>>>>> +#define HVM_PARAM_MEMORY_EVENT_VMCALL 34
>>>>>
>>>>> So why does this (used only as an argument to
>>>>> hvm_memory_event_traps()) need to be settable? I guess the patch
>>>>> description is just too brief.
>>>>
>>>> Settable?
>>>
>>> You must have a reason to make this a HVM param. That reason is
>>> what I'm asking for.
>>
>> I see. I want to be able to enable / disable this type of event. I.e.:
>>
>> if (flags & ENABLE_VMCALL)
>>     xc_set_hvm_param(xci, domain, HVM_PARAM_MEMORY_EVENT_VMCALL,
>>                      HVMPME_mode_sync);
>>
>> from the application, via libxc.
> 
> But hvm_memory_event_vmcall() simply uses the value, whether or
> not it got set. And if the receiver of the event has to anyway deal
> with instances it didn't enable, then it needs to do filtering anyway,
> and hence there's little point in making configurable the exact value
> being passed back up.

You're right, I see your point. I'll either handle it in do_hvm_op() as
well or remove it completely.
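
If it stays, the handling would presumably follow the pattern of the
existing HVM_PARAM_MEMORY_EVENT_* cases in the HVMOP_set_param path,
roughly like the sketch below (illustrative only, the exact restrictions
still to be decided):

    /* In the HVMOP_set_param switch, next to the other
     * HVM_PARAM_MEMORY_EVENT_* cases: */
    case HVM_PARAM_MEMORY_EVENT_VMCALL:
        if ( d == current->domain )
        {
            rc = -EPERM;    /* a domain may not set it on itself */
            break;
        }
        if ( a.value & HVMPME_onchangeonly )
            rc = -EINVAL;   /* onchangeonly makes no sense for VMCALL */
        break;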


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 15:21 ` Jan Beulich
  2014-07-02 15:43   ` Razvan Cojocaru
@ 2014-07-03  7:38   ` Razvan Cojocaru
  2014-07-03  8:05     ` Jan Beulich
  1 sibling, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  7:38 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 06:21 PM, Jan Beulich wrote:
>> +void hvm_emulate_one_full(bool_t nowrite)
>> +{
>> +    struct hvm_emulate_ctxt ctx[1] = {};
>> +    int rc = X86EMUL_RETRY;
>> +
>> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
>> +
>> +    while ( rc == X86EMUL_RETRY )
>> +    {
>> +        if ( nowrite )
>> +            rc = hvm_emulate_one_no_write(ctx);
>> +        else
>> +            rc = hvm_emulate_one(ctx);
>> +    }
>> +
>> +    switch ( rc )
>> +    {
>> +    case X86EMUL_UNHANDLEABLE:
>> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> 
> Is it certain that #UD is always the right exception here?

I'll make that configurable (extra parameters to
hvm_emulate_one_full()), would that work for you?
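
Something along these lines is what I have in mind (the signature is
illustrative only, not final):

    /* Let the caller choose what to inject on emulation failure. */
    void hvm_emulate_one_full(bool_t nowrite, unsigned int trapnr,
                              unsigned int errcode);

    /* ... and in the existing switch ( rc ): */
    case X86EMUL_UNHANDLEABLE:
        hvm_inject_hw_exception(trapnr, errcode);
        break;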


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-02 15:20 ` [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Andrew Cooper
@ 2014-07-03  7:42   ` Razvan Cojocaru
  0 siblings, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  7:42 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/02/2014 06:20 PM, Andrew Cooper wrote:
> On 02/07/14 14:33, Razvan Cojocaru wrote:
>> Added support for emulating an instruction with no memory writes and
>> for retrieving the length of the next instruction. Additionally,
>> introduced hvm_emulate_one_full(bool_t nowrite), which acts upon all
>> possible return values from the hvm_emulate_one() functions (RETRY,
>> EXCEPTION, UNHANDLEABLE).
>>
>> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> 
> xen/arch/x86/x86_emulate/ is the core of the emulation in Xen, and this
> looks very much as if it should be part that, rather than wedged on the
> side of the hypervisor.
> 
> Amongst other things, x86_emulate already contains instruction decode
> tables, so it would appear that extending x86_emulate would result in
> less code duplication.

Actually I think I'll break this one into two patches: the emulate
without writes part, which only needs minor modifications according to
Jan's review, and the instruction length part, which requires more careful
consideration.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length
  2014-07-03  7:38   ` Razvan Cojocaru
@ 2014-07-03  8:05     ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  8:05 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 03.07.14 at 09:38, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:21 PM, Jan Beulich wrote:
>>> +void hvm_emulate_one_full(bool_t nowrite)
>>> +{
>>> +    struct hvm_emulate_ctxt ctx[1] = {};
>>> +    int rc = X86EMUL_RETRY;
>>> +
>>> +    hvm_emulate_prepare(ctx, guest_cpu_user_regs());
>>> +
>>> +    while ( rc == X86EMUL_RETRY )
>>> +    {
>>> +        if ( nowrite )
>>> +            rc = hvm_emulate_one_no_write(ctx);
>>> +        else
>>> +            rc = hvm_emulate_one(ctx);
>>> +    }
>>> +
>>> +    switch ( rc )
>>> +    {
>>> +    case X86EMUL_UNHANDLEABLE:
>>> +        hvm_inject_hw_exception(TRAP_invalid_op, 
> HVM_DELIVER_NO_ERROR_CODE);
>> 
>> Is it certain that #UD is always the right exception here?
> 
> I'll make that configurable (extra parameters to
> hvm_emulate_one_full()), would that work for you?

Yes, that would seem reasonable to me.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 15:37   ` Jan Beulich
@ 2014-07-03  8:12     ` Razvan Cojocaru
  2014-07-03  8:54       ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  8:12 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 06:37 PM, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -6016,6 +6016,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
>>      return rc;
>>  }
>>  
>> +static inline void hvm_mem_event_fill_regs(mem_event_request_t *req)
>> +{
>> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
>> +    struct vcpu *v = current;
>> +
>> +    req->regs.rax = regs->eax;
>> +    req->regs.rcx = regs->ecx;
>> +    req->regs.rdx = regs->edx;
>> +    req->regs.rbx = regs->ebx;
>> +    req->regs.rsp = regs->esp;
>> +    req->regs.rbp = regs->ebp;
>> +    req->regs.rsi = regs->esi;
>> +    req->regs.rdi = regs->edi;
>> +
>> +    req->regs.r8  = regs->r8;
>> +    req->regs.r9  = regs->r9;
>> +    req->regs.r10 = regs->r10;
>> +    req->regs.r11 = regs->r11;
>> +    req->regs.r12 = regs->r12;
>> +    req->regs.r13 = regs->r13;
>> +    req->regs.r14 = regs->r14;
>> +    req->regs.r15 = regs->r15;
>> +
>> +    req->regs.rflags = regs->eflags;
>> +    req->regs.rip    = regs->eip;
>> +
>> +    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
>> +    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
>> +    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
>> +    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
>> +}
> 
> This fills far fewer fields than the p2m function further down.
> Why?

That is because hvm_mem_event_fill_regs() is used for events such as CR3
changes or MSR access, and p2m_mem_event_fill_regs() is used for EPT
events, and our application needs full information while handling EPT
callbacks, and not as much for the other events.

Hence I've tried to avoid the unnecessary overhead in that case,
thinking that if somebody needed those values, they would be added then.

>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>  
>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
> 
> This seems unrelated and/or lacking an SVM counterpart.

Yes, it does lack an SVM counterpart. Is SVM support required for acceptance?

It is, however, not unrelated. Our application required that
information, and it is cached in the mem_event (or am I missing something?).


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-02 17:07         ` Andrew Cooper
@ 2014-07-03  8:23           ` Mihai Donțu
  2014-07-03  9:32             ` Andrew Cooper
  0 siblings, 1 reply; 64+ messages in thread
From: Mihai Donțu @ 2014-07-03  8:23 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Tim Deegan, Razvan Cojocaru, Jan Beulich, xen-devel

On Wednesday 02 July 2014 18:07:20 Andrew Cooper wrote:
> On 02/07/14 17:58, Mihai Donțu wrote:
> > On Wed, 2 Jul 2014 17:00:08 +0100 Andrew Cooper wrote:
> >> On 02/07/14 16:51, Jan Beulich wrote:
> >>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
> >>>> Added new XEN_DOMCTL_set_pagefault_info hypercall, used by
> >>>> libxc's new xc_domain_set_pagefault_info() function to set
> >>>> per-domain page fault injection information. This information is
> >>>> then used to call hvm_inject_page_fault() at the first VMENTRY
> >>>> where the guest status matches and there are no other pending
> >>>> traps.
> >>> So the first question that strikes me here: What good can it do to
> >>> be able to inject arbitrary page faults, possibly at times where
> >>> the guest OS is absolutely not expecting them?
> > I have not yet had the chance to say: thank you all for your review!
> 
> No worries - this certainly is an interesting series to consider.
> 
> >
> > There were times when we wanted to get certain information from the
> > guest but couldn't because it was swapped out. We now handle that
> > situation by injecting a #PF and then let the OS respond as it would
> > under a normal circumstance. After the data is brought in, it traps
> > again into our application and we get what we need, but yes, it
> > requires deep knowledge about the guest OS in order to do it without
> > crashing it. It's doable only if you have the means necessary to
> > inspect its state fully, which is why some of the submitted patches
> > exist.
> 
> What is the threat model here?
> 
> It seems to me that the only safe place to organise this is from a
> device driver in the guest.

This patch by itself does not address an in-guest security issue, it
merely helps implement a number of guards. For example, if we want to
audit all attempts to write into the .text area of an application by
other applications (via  process_vm_writev() or equivalent) we need to
first bring in the complete .text sections of all modules. I forgot to
mention before, but this patch can be used to bring in pages from
memory mapped files (executables / shared objects).

This can indeed be done in a much easier fashion directly from the
guest kernel, but we are envisioning a security tool that acts
completely from outside the domain and firmly believe that the amount
of work needed to do this will be worth it.

-- 
Mihai DONȚU

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-03  8:12     ` Razvan Cojocaru
@ 2014-07-03  8:54       ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  8:54 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 03.07.14 at 10:12, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:37 PM, Jan Beulich wrote:
>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -6016,6 +6016,38 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
>>>      return rc;
>>>  }
>>>  
>>> +static inline void hvm_mem_event_fill_regs(mem_event_request_t *req)
>>> +{
>>> +    struct cpu_user_regs *regs = guest_cpu_user_regs();
>>> +    struct vcpu *v = current;
>>> +
>>> +    req->regs.rax = regs->eax;
>>> +    req->regs.rcx = regs->ecx;
>>> +    req->regs.rdx = regs->edx;
>>> +    req->regs.rbx = regs->ebx;
>>> +    req->regs.rsp = regs->esp;
>>> +    req->regs.rbp = regs->ebp;
>>> +    req->regs.rsi = regs->esi;
>>> +    req->regs.rdi = regs->edi;
>>> +
>>> +    req->regs.r8  = regs->r8;
>>> +    req->regs.r9  = regs->r9;
>>> +    req->regs.r10 = regs->r10;
>>> +    req->regs.r11 = regs->r11;
>>> +    req->regs.r12 = regs->r12;
>>> +    req->regs.r13 = regs->r13;
>>> +    req->regs.r14 = regs->r14;
>>> +    req->regs.r15 = regs->r15;
>>> +
>>> +    req->regs.rflags = regs->eflags;
>>> +    req->regs.rip    = regs->eip;
>>> +
>>> +    req->regs.msr_efer = v->arch.hvm_vcpu.guest_efer;
>>> +    req->regs.cr0 = v->arch.hvm_vcpu.guest_cr[0];
>>> +    req->regs.cr3 = v->arch.hvm_vcpu.guest_cr[3];
>>> +    req->regs.cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>> +}
>> 
>> This fills far fewer fields than the p2m function further down.
>> Why?
> 
> That is because hvm_mem_event_fill_regs() is used for events such as CR3
> changes or MSR access, and p2m_mem_event_fill_regs() is used for EPT
> events, and our application needs full information while handling EPT
> callbacks, and not as much for the other events.
> 
> Hence I've tried to avoid the unnecessary overhead in that case,
> thinking that if somebody needed those values, they would be added then.

Fair enough, if only it was visible (or stated) that this doesn't lead
to uninitialized data making it back to the event listener.
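
(Purely as an illustration, even something as simple as the below at the
top of hvm_mem_event_fill_regs(), or a comment to the same effect, would
make that explicit:)

    /* Fields not filled in below are deliberately left zero. */
    memset(&req->regs, 0, sizeof(req->regs));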

>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>>  
>>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
>> 
>> This seems unrelated and/or lacking an SVM counterpart.
> 
> Yes, it does lack an SVM counterpart. Is SVM support required for acceptance?

Not necessarily, but you should state this explicitly rather than leaving
it to be discovered by the readers. And presumably when enabling any
of this, you should check you're on VMX (at least I don't recall having
seen such a check).

> It is, however, not unrelated. Our application required that
> information, and it is cached in the mem_event (or am I missing something?).

The problem is that from the patch (including its description) it's not
clear where the consumer of this is.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-02 15:56   ` Jan Beulich
@ 2014-07-03  8:55     ` Razvan Cojocaru
  2014-07-03  9:02       ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  8:55 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>> In a scenario where a page fault that triggered a mem_event occurred,
>> p2m_mem_access_check() will now be able to either 1) emulate the
>> current instruction, 2) skip the current instruction, or 3) emulate
>> it, but don't allow it to perform any writes. Since some SSE2
>> instructions are problematic to emulate (Firefox uses some),
>> support for setting the A and D (accessed and dirty) bits has been
>> added (please see p2m_set_ad_bits()).
> 
> Sadly that reference is useless - the function doesn't have any
> explanation what all this is about either.

p2m_set_ad_bits() ends up calling the code in
xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
the A/D bits allowing the problematic instructions to run while
bypassing emulation for that specific case.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  8:55     ` Razvan Cojocaru
@ 2014-07-03  9:02       ` Jan Beulich
  2014-07-03  9:12         ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  9:02 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 03.07.14 at 10:55, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>> In a scenario where a page fault that triggered a mem_event occurred,
>>> p2m_mem_access_check() will now be able to either 1) emulate the
>>> current instruction, 2) skip the current instruction, or 3) emulate
>>> it, but don't allow it to perform any writes. Since some SSE2
>>> instructions are problematic to emulate (Firefox uses some),
>>> support for setting the A and D (accessed and dirty) bits has been
>>> added (please see p2m_set_ad_bits()).
>> 
>> Sadly that reference is useless - the function doesn't have any
>> explanation what all this is about either.
> 
> p2m_set_ad_bits() ends up calling the code in
> xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
> hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
> guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
> the A/D bits allowing the problematic instructions to run while
> bypassing emulation for that specific case.

That's the mechanical part one can indeed work out from the patch.
The interesting but unexplained thing here is which "some SSE2
instructions" you refer to, and what's so special about them (you
not also including e.g. AVX here makes me further curious, as in
most cases AVX ones are direct extensions of SSEn ones, and hence
I'd expect them to be similarly problematic).

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  9:02       ` Jan Beulich
@ 2014-07-03  9:12         ` Razvan Cojocaru
  2014-07-03  9:18           ` Andrew Cooper
  2014-07-03  9:22           ` Jan Beulich
  0 siblings, 2 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  9:12 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/03/2014 12:02 PM, Jan Beulich wrote:
>>>> On 03.07.14 at 10:55, <rcojocaru@bitdefender.com> wrote:
>> On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>>> In a scenario where a page fault that triggered a mem_event occurred,
>>>> p2m_mem_access_check() will now be able to either 1) emulate the
>>>> current instruction, 2) skip the current instruction, or 3) emulate
>>>> it, but don't allow it to perform any writes. Since some SSE2
>>>> instructions are problematic to emulate (Firefox uses some),
>>>> support for setting the A and D (accessed and dirty) bits has been
>>>> added (please see p2m_set_ad_bits()).
>>>
>>> Sadly that reference is useless - the function doesn't have any
>>> explanation what all this is about either.
>>
>> p2m_set_ad_bits() ends up calling the code in
>> xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
>> hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
>> guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
>> the A/D bits allowing the problematic instructions to run while
>> bypassing emulation for that specific case.
> 
> That's the mechanical part one can indeed work out from the patch.
> The interesting but unexplained thing here is which "some SSE2
> instructions" you refer to, and what's so special about them (you
> not also including e.g. AVX here makes me further curious, as in
> most cases AVX ones are direct extensions of SSEn ones, and hence
> I'd expect them to be similarly problematic).

An example that kept appearing with Xen 4.3 and Firefox in our test
environment was: divsd xmm0, qword ptr [0x21c290]


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  9:12         ` Razvan Cojocaru
@ 2014-07-03  9:18           ` Andrew Cooper
  2014-07-03  9:22           ` Jan Beulich
  1 sibling, 0 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-03  9:18 UTC (permalink / raw)
  To: Razvan Cojocaru, Jan Beulich; +Cc: tim, xen-devel


On 03/07/2014 10:12, Razvan Cojocaru wrote:
> On 07/03/2014 12:02 PM, Jan Beulich wrote:
>>>>> On 03.07.14 at 10:55, <rcojocaru@bitdefender.com> wrote:
>>> On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>>>> In a scenario where a page fault that triggered a mem_event occurred,
>>>>> p2m_mem_access_check() will now be able to either 1) emulate the
>>>>> current instruction, 2) skip the current instruction, or 3) emulate
>>>>> it, but don't allow it to perform any writes. Since some SSE2
>>>>> instructions are problematic to emulate (Firefox uses some),
>>>>> support for setting the A and D (accessed and dirty) bits has been
>>>>> added (please see p2m_set_ad_bits()).
>>>> Sadly that reference is useless - the function doesn't have any
>>>> explanation what all this is about either.
>>> p2m_set_ad_bits() ends up calling the code in
>>> xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
>>> hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
>>> guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
>>> the A/D bits allowing the problematic instructions to run while
>>> bypassing emulation for that specific case.
>> That's the mechanical part one can indeed work out from the patch.
>> The interesting but unexplained thing here is which "some SSE2
>> instructions" you refer to, and what's so special about them (you
>> not also including e.g. AVX here makes me further curious, as in
>> most cases AVX ones are direct extensions of SSEn ones, and hence
>> I'd expect them to be similarly problematic).
> An example that kept appearing with Xen 4.3 and Firefox in our test
> environment was: divsd xmm0, qword ptr [0x21c290]
>
>
> Thanks,
> Razvan Cojocaru

That is a bug in Xen (at least from your point of view).  It should be 
fixed either by correcting the existing emulation or, as I suspect is 
more likely, by actually adding emulation support for SSE instructions.  
Xen's emulation has traditionally been restricted to the core instruction 
set and anything which could reasonably fault against MMIO regions. As a 
result, its knowledge of newer instruction sets is limited at best.

~Andrew


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  9:12         ` Razvan Cojocaru
  2014-07-03  9:18           ` Andrew Cooper
@ 2014-07-03  9:22           ` Jan Beulich
  2014-07-03  9:34             ` Razvan Cojocaru
  1 sibling, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-03  9:22 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 03.07.14 at 11:12, <rcojocaru@bitdefender.com> wrote:
> On 07/03/2014 12:02 PM, Jan Beulich wrote:
>>>>> On 03.07.14 at 10:55, <rcojocaru@bitdefender.com> wrote:
>>> On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>>>> In a scenario where a page fault that triggered a mem_event occurred,
>>>>> p2m_mem_access_check() will now be able to either 1) emulate the
>>>>> current instruction, 2) skip the current instruction, or 3) emulate
>>>>> it, but don't allow it to perform any writes. Since some SSE2
>>>>> instructions are problematic to emulate (Firefox uses some),
>>>>> support for setting the A and D (accessed and dirty) bits has been
>>>>> added (please see p2m_set_ad_bits()).
>>>>
>>>> Sadly that reference is useless - the function doesn't have any
>>>> explanation what all this is about either.
>>>
>>> p2m_set_ad_bits() ends up calling the code in
>>> xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
>>> hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
>>> guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
>>> the A/D bits allowing the problematic instructions to run while
>>> bypassing emulation for that specific case.
>> 
>> That's the mechanical part one can indeed work out from the patch.
>> The interesting but unexplained thing here is which "some SSE2
>> instructions" you refer to, and what's so special about them (you
>> not also including e.g. AVX here makes me further curious, as in
>> most cases AVX ones are direct extensions of SSEn ones, and hence
>> I'd expect them to be similarly problematic).
> 
> An example that kept appearing with Xen 4.3 and Firefox in our test
> environment was: divsd xmm0, qword ptr [0x21c290]

And what's so special about it? Just that there's a better chance of it
raising #XM? And why would divss (which is SSE, not SSE2) or vdivsd
(AVX) not have similar problems?

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-03  8:23           ` Mihai Donțu
@ 2014-07-03  9:32             ` Andrew Cooper
  2014-07-03  9:40               ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Andrew Cooper @ 2014-07-03  9:32 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Tim Deegan, Razvan Cojocaru, Jan Beulich, xen-devel


On 03/07/2014 09:23, Mihai Donțu wrote:
> On Wednesday 02 July 2014 18:07:20 Andrew Cooper wrote:
>> On 02/07/14 17:58, Mihai Donțu wrote:
>>> On Wed, 2 Jul 2014 17:00:08 +0100 Andrew Cooper wrote:
>>>> On 02/07/14 16:51, Jan Beulich wrote:
>>>>
>>> There were times when we wanted to get certain information from the
>>> guest but couldn't because it was swapped out. We now handle that
>>> situation by injecting a #PF and then let the OS respond as it would
>>> under a normal circumstance. After the data is brought in, it traps
>>> again into our application and we get what we need, but yes, it
>>> requires deep knowledge about the guest OS in order to do it without
>>> crashing it. It's doable only if you have the means necessary to
>>> inspect its state fully, which is why some of the submitted patches
>>> exist.
>> What is the threat model here?
>>
>> It seems to me that the only safe place to organise this is from a
>> device driver in the guest.
> This patch by itself does not address an in-guest security issue, it
> merely helps implement a number of guards. For example, if we want to
> audit all attempts to write into the .text area of an application by
> other applications (via  process_vm_writev() or equivalent) we need to
> first bring in the complete .text sections of all modules. I forgot to
> mention before, but this patch can be used to bring in pages from
> memory mapped files (executables / shared objects).
>
> This can indeed be done in a much easier fashion directly from the
> guest kernel, but we are envisioning a security tool that acts
> completely from outside the domain and firmly believe that the amount
> of work needed to do this will be worth it.
>

Ok.  So you are looking for a way to force arbitrary pages to be paged in?

I can't see how this could ever be safe from outside the VM.  At the very 
best you will have to wait until the correct virtual address space is in 
context (which is not as easy as relying on cr3), probably wait until 
the vcpu is executing userspace code, and even then you are still 
fighting with the guest OS's paging-out algorithm.

This certainly isn't something that can be done safely as a toolstack hypercall.

While I can see what you intend to do with the feature, I still can't 
see how it would actually work given the infrastructure in this series.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  9:22           ` Jan Beulich
@ 2014-07-03  9:34             ` Razvan Cojocaru
  2014-07-03 10:14               ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  9:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tim, xen-devel

On 07/03/2014 12:22 PM, Jan Beulich wrote:
>>>> On 03.07.14 at 11:12, <rcojocaru@bitdefender.com> wrote:
>> On 07/03/2014 12:02 PM, Jan Beulich wrote:
>>>>>> On 03.07.14 at 10:55, <rcojocaru@bitdefender.com> wrote:
>>>> On 07/02/2014 06:56 PM, Jan Beulich wrote:
>>>>>>>> On 02.07.14 at 15:33, <rcojocaru@bitdefender.com> wrote:
>>>>>> In a scenario where a page fault that triggered a mem_event occurred,
>>>>>> p2m_mem_access_check() will now be able to either 1) emulate the
>>>>>> current instruction, 2) skip the current instruction, or 3) emulate
>>>>>> it, but don't allow it to perform any writes. Since some SSE2
>>>>>> instructions are problematic to emulate (Firefox uses some),
>>>>>> support for setting the A and D (accessed and dirty) bits has been
>>>>>> added (please see p2m_set_ad_bits()).
>>>>>
>>>>> Sadly that reference is useless - the function doesn't have any
>>>>> explanation what all this is about either.
>>>>
>>>> p2m_set_ad_bits() ends up calling the code in
>>>> xen/arch/x86/mm/hap/guest_walk.c, namely an "instantiation" of
>>>> hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(), which in turn calls
>>>> guest_walk_tables() (from xen/arch/x86/mm/guest_walk.c), which sets up
>>>> the A/D bits allowing the problematic instructions to run while
>>>> bypassing emulation for that specific case.
>>>
>>> That's the mechanical part one can indeed work out from the patch.
>>> The interesting but unexplained thing here is which "some SSE2
>>> instructions" you refer to, and what's so special about them (you
>>> not also including e.g. AVX here makes me further curious, as in
>>> most cases AVX ones are direct extensions of SSEn ones, and hence
>>> I'd expect them to be similarly problematic).
>>
>> An example that kept appearing with Xen 4.3 and Firefox in our test
>> environment was: divsd xmm0, qword ptr [0x21c290]
> 
> And what's so special about it? Just that there's a better chance of it
> raising #XM? And why would divss (which is SSE, not SSE2) or vdivsd
> (AVX) not have similar problems?

Nothing is particularly special about it, apart from the fact that it
caused our hypervisor to print out:

Emulation failed @ 001b:21107d: f2 0f 5e 05 90 c2 21 00 8b 4d

and Firefox crashed. Indeed, I agree with Andrew that this looks like a
bug in Xen's emulator; however, rather than treating each case
individually and risking random application crashes with instructions we
missed, we decided, for our purposes, to treat this as a class of cases
that can be handled in the same way regardless of the specific instruction.
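
For reference, the idea reduced to a minimal sketch - this is not the
actual patch, it merely shows one generic way of reaching the
guest_walk_tables() path mentioned above, with write intent so that the
walk sets the accessed/dirty bits as a side effect:

static void set_ad_bits_for_va(struct vcpu *v, unsigned long va)
{
    uint32_t pfec = PFEC_page_present | PFEC_write_access;

    /* A successful walk marks A on each level and D on the leaf. */
    (void)paging_gva_to_gfn(v, va, &pfec);
}

The series' p2m_set_ad_bits() presumably goes through the
hap_p2m_ga_to_gfn() instantiation directly, but the effect on the guest
page tables is the same.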


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc
  2014-07-03  9:32             ` Andrew Cooper
@ 2014-07-03  9:40               ` Razvan Cojocaru
  0 siblings, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03  9:40 UTC (permalink / raw)
  To: Andrew Cooper, Mihai Donțu; +Cc: Tim Deegan, Jan Beulich, xen-devel

On 07/03/2014 12:32 PM, Andrew Cooper wrote:
> 
> On 03/07/2014 09:23, Mihai Donțu wrote:
>> On Wednesday 02 July 2014 18:07:20 Andrew Cooper wrote:
>>> On 02/07/14 17:58, Mihai Donțu wrote:
>>>> On Wed, 2 Jul 2014 17:00:08 +0100 Andrew Cooper wrote:
>>>>> On 02/07/14 16:51, Jan Beulich wrote:
>>>>>
>>>> There were times when we wanted to get certain information from the
>>>> guest but couldn't because it was swapped out. We now handle that
>>>> situation by injecting a #PF and then let the OS respond as it would
>>>> under a normal circumstance. After the data is brought in, it traps
>>>> again into our application and we get what we need, but yes, it
>>>> requires deep knowledge about the guest OS in order to do it without
>>>> crashing it. It's doable only if you have the means necessary to
>>>> inspect its state fully, which is why some of the submitted patches
>>>> exist.
>>> What is the threat model here?
>>>
>>> It seems to me that the only safe place to organise this is from a
>>> device driver in the guest.
>> This patch by itself does not address an in-guest security issue, it
>> merely helps implement a number of guards. For example, if we want to
>> audit all attempts to write into the .text area of an application by
>> other applications (via  process_vm_writev() or equivalent) we need to
>> first bring in the complete .text sections of all modules. I forgot to
>> mention before, but this patch can be used to bring in pages from
>> memory mapped files (executables / shared objects).
>>
>> This can indeed be done in a much easier fashion directly from the
>> guest kernel, but we are envisioning a security tool that acts
>> completely from outside the domain and firmly believe that the amount
>> of work needed to do this will be worth it.
>>
> 
> Ok.  So you are looking for a way to force arbitrary pages to be paged in?
> 
> I cant see how this could ever be safe from outside the VM.  At the very
> best you will have to wait until the correct virtual address space is in
> context (which is not as easy as relying on cr3), probably wait until
> the vcpu is executing userspace code, and even then you are still
> fighting with the guest OS's paging-out algorithm.

We're waiting until vmx_vmenter_helper(). Then, we check both cs_dpl
(Jan suggested SS.DPL in an earlier reply) to make sure we're in
userspace code, and cr3:

+static void check_pf_injection(void)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct hvm_hw_cpu ctxt;
+    uint32_t cs_dpl;
+
+    if ( !is_hvm_domain(d) || d->fault_info.virtual_address == 0 )
+        return;
+
+    memset(&ctxt, 0, sizeof(struct hvm_hw_cpu));
+    hvm_funcs.save_cpu_ctxt(curr, &ctxt);
+
+    cs_dpl = (ctxt.cs_arbytes >> 5) & 3;
+
+    if ( cs_dpl == 3 /* Guest is in user mode */
+         && !ctxt.pending_event
+         && ctxt.cr3 == d->fault_info.address_space )
+    {
+        /* Cache */
+        uint64_t virtual_address = d->fault_info.virtual_address;
+        uint32_t write_access = d->fault_info.write_access;
+
+        /* Reset */
+        d->fault_info.address_space = 0;
+        d->fault_info.virtual_address = 0;
+        d->fault_info.write_access = 0;
+
+        hvm_inject_page_fault((write_access << 1) | PFEC_user_mode,
+            virtual_address);
+    }
+}

All the hypercall itself does is set a few flags that are checked in
check_pf_injection().
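
To make the "set a few flags" part concrete, here is a hypothetical
sketch of the hypercall side - the helper name and the exact types are
made up for illustration, only the d->fault_info fields come from the
snippet above:

static void set_pf_injection_info(struct domain *d,
                                  uint64_t address_space,
                                  uint64_t virtual_address,
                                  uint32_t write_access)
{
    d->fault_info.address_space = address_space;     /* expected CR3 */
    d->fault_info.virtual_address = virtual_address; /* VA to fault on */
    /* Later shifted into PFEC_write_access by check_pf_injection(). */
    d->fault_info.write_access = write_access;
}

check_pf_injection() then consumes and clears these fields on the next
VMENTER that happens in user mode in the expected address space.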


Thanks,
Razvan Cojocaru

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply
  2014-07-03  9:34             ` Razvan Cojocaru
@ 2014-07-03 10:14               ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-03 10:14 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: tim, xen-devel

>>> On 03.07.14 at 11:34, <rcojocaru@bitdefender.com> wrote:
> On 07/03/2014 12:22 PM, Jan Beulich wrote:
>>>>> On 03.07.14 at 11:12, <rcojocaru@bitdefender.com> wrote:
>>> An example that kept appearing with Xen 4.3 and Firefox in our test
>>> environment was: divsd xmm0, qword ptr [0x21c290]
>> 
>> And what's so special about it? Just that there's a better chance of it
>> raising #XM? And why would divss (which is SSE, not SSE2) or vdivsd
>> (AVX) not have similar problems?
> 
> Nothing is particularly special about it, apart from the fact that it
> caused our hypervisor to print out:
> 
> Emulation failed @ 001b:21107d: f2 0f 5e 05 90 c2 21 00 8b 4d
> 
> and Firefox crashed. Indeed, I agree with Andrew that this looks like a
> bug in Xen's emulator, however rather than treating each case
> individually and risk random application crashes with instructions we
> missed, we decided to, for our purposes, treat this as a class of cases
> that can be handled in the same way regardless of the specific instruction.

Which is even more of an argument to extend the emulator we
have rather than introducing half a new one. And of course, with
the goal you have, you just won't get away with implementing only the
instructions you have seen being used - you (or someone else) will
have to go through and implement _all_ known instructions. Quite
a bit of a job, I know...

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain
  2014-07-02 13:34 ` [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain Razvan Cojocaru
@ 2014-07-03 10:19   ` Jan Beulich
  2014-07-03 11:27     ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-03 10:19 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: Mihai Dontu, tim, xen-devel

>>> On 02.07.14 at 15:34, <rcojocaru@bitdefender.com> wrote:
> This goes together with the mem-event API changes and marks certain
> pages as being controlled from outside the HV (a user domain in our
> case). This prevents Xen from resetting the permissions in certain
> cases, enforcing the previously expressed intention of receiving a
> memory event every time the owning domain triggers a fault.

These "certain cases" would clearly benefit from being named in a
more precise fashion - it's not been that long ago that for some of
the cases where the access permissions got reset as a side effect
we decided to remove that side effect, and it was discussed to also
be done for at least one more case. I.e. perhaps you would not
need extensive changes like the ones here if that was done?

> This enhancement makes use of an unused bit in the EPT-PTE entry (vmx)
> and adjusts the definitions of get_entry() and set_entry() to carry a
> variable controlling this bit. It would probably have been better to add
> a new access type to p2m_access_t (which I tried), but in testing it
> caused subtle failures in the application using the mem-event API (the
> domains themselves seemed to work just fine though).

The above is even more relevant because the bit being used here is the
only one left at this point, i.e. I'd be rather careful giving this away
without being certain there's no alternative.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain
  2014-07-03 10:19   ` Jan Beulich
@ 2014-07-03 11:27     ` Razvan Cojocaru
  2014-07-03 12:15       ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-03 11:27 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Mihai Dontu, tim, xen-devel

On 07/03/2014 01:19 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 15:34, <rcojocaru@bitdefender.com> wrote:
>> This goes together with the mem-event API changes and marks certain
>> pages as being controlled from outside the HV (a user domain in our
>> case). This prevents Xen from resetting the permissions in certain
>> cases, enforcing the previously expressed intention of receiving a
>> memory event every time the owning domain triggers a fault.
> 
> These "certain cases" would clearly benefit from being named in a
> more precise fashion - it's not been that long ago that for some of
> the cases where the access permissions got reset as a side effect
> we decided to remove that side effect, and it was discussed to also
> be done for at least one more case. I.e. perhaps you would not
> need extensive changes like the ones here if that was done?

Could you please point us to these discussions? It would definitely be
very nice if the problem has been fixed in the meantime.

We're not entirely sure what the trigger for the resets was in our case.
We've so far ruled out live migration and ballooning by disabling them
in the .conf files for the guests.

We hook into the guest fairly early, almost immediately after it starts
running, and occasionally (quite rarely, actually) we would lose the
permissions set on some pages, apparently during the boot process (or
very shortly after).


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain
  2014-07-03 11:27     ` Razvan Cojocaru
@ 2014-07-03 12:15       ` Jan Beulich
  0 siblings, 0 replies; 64+ messages in thread
From: Jan Beulich @ 2014-07-03 12:15 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: Mihai Dontu, tim, xen-devel

>>> On 03.07.14 at 13:27, <rcojocaru@bitdefender.com> wrote:
> On 07/03/2014 01:19 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 15:34, <rcojocaru@bitdefender.com> wrote:
>>> This goes together with the mem-event API changes and marks certain
>>> pages as being controlled from outside the HV (a user domain in our
>>> case). This prevents Xen from resetting the permissions in certain
>>> cases, enforcing the previously expressed intention of receiving a
>>> memory event every time the owning domain triggers a fault.
>> 
>> These "certain cases" would clearly benefit from being named in a
>> more precise fashion - it's not been that long ago that for some of
>> the cases where the access permissions got reset as a side effect
>> we decided to remove that side effect, and it was discussed to also
>> be done for at least one more case. I.e. perhaps you would not
>> need extensive changes like the ones here if that was done?
> 
> Could you please point us to these discussions? It would definitely be
> very nice if the problem has been fixed in the meantime.

http://lists.xenproject.org/archives/html/xen-devel/2014-03/msg03441.html
http://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=e5ae6eefdfbc1816b050d02998f69f0b78d5c814

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 15:31   ` Andrew Cooper
@ 2014-07-07 14:50     ` Razvan Cojocaru
  2014-07-10  8:05     ` Razvan Cojocaru
  1 sibling, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-07 14:50 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/02/2014 06:31 PM, Andrew Cooper wrote:
> On 02/07/14 14:33, Razvan Cojocaru wrote:
>> Speed optimization for introspection purposes: a handful of registers
>> are sent along with each mem_event. This requires enlargement of the
>> mem_event_request / mem_event_response stuctures, and additional code
>> to fill in relevant values.
>>
>> Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
> 
> The public API already has struct hvm_hw_cpu in
> xen/include/public/arch-x86/hvm/save.h
> 
> It might be better to reuse that rather than defining a new structure to
> contain a subset of the information.

While reinspecting the code to address the review comments: the reason
I did not use struct hvm_hw_cpu back when I initially wrote the patch is
that sizeof(struct hvm_hw_cpu) is considerably larger than the size of
our custom struct. Using hvm_hw_cpu would have filled the mem_event ring
buffer fast, with each mem_event carrying an instance of it around.
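
For comparison, the per-event alternative would be for userspace to pull
the whole save record every time, along the lines of the sketch below
(error handling trimmed, vcpu 0 assumed, and the installed header names
may differ slightly):

#include <xenctrl.h>
#include <xen/hvm/save.h>   /* struct hvm_hw_cpu, HVM_SAVE_CODE(CPU) */

static int fetch_full_cpu_ctxt(xc_interface *xch, uint32_t domid,
                               struct hvm_hw_cpu *ctxt)
{
    return xc_domain_hvm_getcontext_partial(xch, domid,
                                            HVM_SAVE_CODE(CPU), 0,
                                            ctxt, sizeof(*ctxt));
}

That is an extra hypercall per mem_event, which is exactly the overhead
that embedding a (much smaller) register subset in the request avoids.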


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-02 15:43     ` Jan Beulich
@ 2014-07-09  8:02       ` Razvan Cojocaru
  2014-07-23  7:56         ` Jan Beulich
  0 siblings, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-09  8:02 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper, xen-devel; +Cc: tim

On 07/02/2014 06:43 PM, Jan Beulich wrote:
>>>> On 02.07.14 at 17:35, <andrew.cooper3@citrix.com> wrote:
>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>> @@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
>>>      if ( msr_bitmap == NULL )
>>>          return;
>>>  
>>> +    /* Filter out MSR-s needed by the memory introspection engine */
>>> +    switch ( msr )
>>> +    {
>>> +    case MSR_IA32_SYSENTER_EIP:
>>> +    case MSR_IA32_SYSENTER_ESP:
>>> +    case MSR_IA32_SYSENTER_CS:
>>> +    case MSR_IA32_MC0_CTL:
>>> +    case MSR_STAR:
>>> +    case MSR_LSTAR:
>>> +
>>
>> Given the performance implications of forcing interception of these
>> MSRs, it would be gated on mem_access being active for the domain.
> 
> Absolutely.

Unfortunately the call to vmx_disable_intercept_for_msr() happens _very_
early, and by the time our application gets to enable mem_access on the
domain, the interception for these MSRs has already been disabled, with
unacceptable consequences.

I've tested this with an "if (
mem_event_check_ring(&d->mem_event->access) )" test.

Also, ideally we'd like to be able to start monitoring an already
started domain, and in that case the mem_access test would be useless
even considering a workaround for the case above.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-02 15:31   ` Andrew Cooper
  2014-07-07 14:50     ` Razvan Cojocaru
@ 2014-07-10  8:05     ` Razvan Cojocaru
  2014-07-10  8:17       ` Andrew Cooper
  1 sibling, 1 reply; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-10  8:05 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/02/2014 06:31 PM, Andrew Cooper wrote:
> On 02/07/14 14:33, Razvan Cojocaru wrote:
>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>> index 2caa04a..fed21b6 100644
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>  
>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
> 
> guest_x86_mode is a linear function of cr0, eflags and efer.  It can be
> calculated by userspace and doesn't need to be transmitted individually.

OK, but 1) I'm not sending eflags into userspace, and 2) I thought Xen's
vmx_guest_x86_mode() function is more trustworthy than an userspace
translation of it, with not much overhead for the HV.

1) also means that I'd replace guest_x86_mode in the mem_event with efer
and not gain more space in the ring buffer.

Hope I understood your comment correctly.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-10  8:05     ` Razvan Cojocaru
@ 2014-07-10  8:17       ` Andrew Cooper
  2014-07-10  8:23         ` Razvan Cojocaru
  2014-07-10 11:57         ` Razvan Cojocaru
  0 siblings, 2 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-10  8:17 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 10/07/2014 09:05, Razvan Cojocaru wrote:
> On 07/02/2014 06:31 PM, Andrew Cooper wrote:
>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>>> index 2caa04a..fed21b6 100644
>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>>  
>>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
>> guest_x86_mode is a linear function of cr0, eflags and efer.  It can be
>> calculated by userspace and doesn't need to be transmitted individually.
> OK, but 1) I'm not sending eflags into userspace,

rflags is in the structure between r15 and dr7.

>  and 2) I thought Xen's
> vmx_guest_x86_mode() function is more trustworthy

It is not a matter of trust.  It is a matter of being correct or not, and it
would be easy for userspace to simply copy what vmx_guest_x86_mode()
already has.

>  than a userspace
> translation of it, with not much overhead for the HV.

Your proposed change would make the results of vmx_guest_x86_mode() part
of the Xen ABI, and therefore hard to refactor in the future if the need
were to arise.

Also, it ties any SVM extension of your work to VT-x internals.

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-10  8:17       ` Andrew Cooper
@ 2014-07-10  8:23         ` Razvan Cojocaru
  2014-07-10 11:57         ` Razvan Cojocaru
  1 sibling, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-10  8:23 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/10/2014 11:17 AM, Andrew Cooper wrote:
> Your proposed change would make the results of vmx_guest_x86_mode() part
> of the Xen ABI, and therefore hard to refactor in the future if the need
> were to arise.
> 
> Also, it ties any SVM extension of your work to VT-x internals.

I see. Yes, that makes sense. I'll take guest_x86_mode out.


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-10  8:17       ` Andrew Cooper
  2014-07-10  8:23         ` Razvan Cojocaru
@ 2014-07-10 11:57         ` Razvan Cojocaru
  2014-07-10 12:16           ` Razvan Cojocaru
  2014-07-10 13:01           ` Andrew Cooper
  1 sibling, 2 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-10 11:57 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/10/2014 11:17 AM, Andrew Cooper wrote:
> On 10/07/2014 09:05, Razvan Cojocaru wrote:
>> On 07/02/2014 06:31 PM, Andrew Cooper wrote:
>>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>>>> index 2caa04a..fed21b6 100644
>>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>>>  
>>>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>>>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
>>> guest_x86_mode is a linear function of cr0, eflags and efer.  It can be
>>> calculated by userspace and doesn't need to be transmitted individually.
>> OK, but 1) I'm not sending eflags into userspace,
> 
> rflags is in the structure between r15 and dr7.
> 
>>  and 2) I thought Xen's
>> vmx_guest_x86_mode() function is more trustworthy
> 
> It is not a matter of trust.  It is a matter of correct or not, and it
> would be easy for userspace to simply copy what vmx_guest_x86_mode()
> already has.

Actually, the point I was trying to make is that I find it safer to use
vmx_guest_x86_mode() in the HV because otherwise I need to duplicate
that code in userspace (which I'm currently trying to do), and if for
some reason the implementation changes, someone needs to change it in
the userspace code as well. Having it only in one place in the HV looked
like a good idea.

As for it being a function of cr0, eflags and efer, it would appear that
it is also a function of cs_arbytes:

static int vmx_guest_x86_mode(struct vcpu *v)
{
    unsigned long cs_ar_bytes;

    if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
        return 0;
    if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
        return 1;
    __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
    if ( hvm_long_mode_enabled(v) &&
         likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
        return 8;
    return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
}

However, in hvm.c, hvm_save_cpu_ctxt():

hvm_get_segment_register(v, x86_seg_cs, &seg);
ctxt.cs_sel = seg.sel;
ctxt.cs_limit = seg.limit;
ctxt.cs_base = seg.base;
ctxt.cs_arbytes = seg.attr.bytes;

Looking further at vmx_get_segment_register() in vmx.c, we get this:

 766     case x86_seg_cs:
 767         __vmread(GUEST_CS_SELECTOR, &sel);
 768         __vmread(GUEST_CS_LIMIT,    &limit);
 769         __vmread(GUEST_CS_BASE,     &reg->base);
 770         __vmread(GUEST_CS_AR_BYTES, &attr);
 771         break;

then:

 832     reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);

This is why my userspace version of vmx_guest_x86_mode() (which uses
hwCpu.cs_arbytes from a struct hvm_hw_cpu hwCpu filled by
xc_domain_hvm_getcontext_partial()) does not work properly (it always
ends up returning 2, for both 32-bit guests - where it should return 4,
and 64-bit guests - where it should return 8).

So this solution would appear to be a bit more involved than the initial
solution. But you're, of course, right that guest_x86_mode should not be
VMX-specific.

Would it be OK if I replaced the call to vmx_guest_x86_mode() with a
call to hvm_funcs.guest_x86_mode(v) (assuming that's possible)?


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-10 11:57         ` Razvan Cojocaru
@ 2014-07-10 12:16           ` Razvan Cojocaru
  2014-07-10 13:01           ` Andrew Cooper
  1 sibling, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-10 12:16 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel; +Cc: tim

On 07/10/2014 02:57 PM, Razvan Cojocaru wrote:
> So this solution would appear to be a bit more involved than the initial
> solution. But you're, of course, right that guest_x86_mode should not be
> VMX-specific.
> 
> Would it be OK if I replaced the call to vmx_guest_x86_mode() with a
> call to hvm_funcs.guest_x86_mode(v) (assuming that's possible)?

Actually, I probably misunderstood this, since "c->guest_x86_mode =
vmx_guest_x86_mode(v);" only happens in vmx.c, and p2m.c does
"req->regs.guest_x86_mode = hvm_guest_x86_mode(current);".

So I think extending this to SVM would simply require a change in svm.c
similar to the one in vmx.c (presumably in svm_vmcb_save(), with the
help of svm_guest_x86_mode()).

How does this tie any SVM extension of our work to VT-x internals?


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 2/9] xen: Optimize introspection access to guest state
  2014-07-10 11:57         ` Razvan Cojocaru
  2014-07-10 12:16           ` Razvan Cojocaru
@ 2014-07-10 13:01           ` Andrew Cooper
  1 sibling, 0 replies; 64+ messages in thread
From: Andrew Cooper @ 2014-07-10 13:01 UTC (permalink / raw)
  To: Razvan Cojocaru, xen-devel; +Cc: tim

On 10/07/14 12:57, Razvan Cojocaru wrote:
> On 07/10/2014 11:17 AM, Andrew Cooper wrote:
>> On 10/07/2014 09:05, Razvan Cojocaru wrote:
>>> On 07/02/2014 06:31 PM, Andrew Cooper wrote:
>>>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>>>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>>>>> index 2caa04a..fed21b6 100644
>>>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>>>> @@ -425,6 +425,7 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
>>>>>      c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
>>>>>  
>>>>>      c->msr_efer = v->arch.hvm_vcpu.guest_efer;
>>>>> +    c->guest_x86_mode = vmx_guest_x86_mode(v);
>>>> guest_x86_mode is a linear function of cr0, eflags and efer.  It can be
>>>> calculated by userspace and doesn't need to be transmitted individually.
>>> OK, but 1) I'm not sending eflags into userspace,
>> rflags is in the structure between r15 and dr7.
>>
>>>  and 2) I thought Xen's
>>> vmx_guest_x86_mode() function is more trustworthy
>> It is not a matter of trust.  It is a matter of correct or not, and it
>> would be easy for userspace to simply copy what vmx_guest_x86_mode()
>> already has.
> Actually, the point I was trying to make is that I find it safer to use
> vmx_guest_x86_mode() in the HV because otherwise I need to duplicate
> that code in userspace (which I'm currently trying to do), and if for
> some reason the implementation changes, someone needs to change it in
> the userspace code as well. Having it only in one place in the HV looked
> like a good idea.
>
> As for it being a function of cr0, eflags and efer, it would appear that
> it is also a function of cs_arbytes:
>
> static int vmx_guest_x86_mode(struct vcpu *v)
> {
>     unsigned long cs_ar_bytes;
>
>     if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
>         return 0;
>     if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
>         return 1;
>     __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
>     if ( hvm_long_mode_enabled(v) &&
>          likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
>         return 8;
>     return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
> }
>
> However, in hvm.c, hvm_save_cpu_ctxt():
>
> hvm_get_segment_register(v, x86_seg_cs, &seg);
> ctxt.cs_sel = seg.sel;
> ctxt.cs_limit = seg.limit;
> ctxt.cs_base = seg.base;
> ctxt.cs_arbytes = seg.attr.bytes;
>
> Looking further at vmx_get_segment_register() in vmx.c, we get this:
>
>  766     case x86_seg_cs:
>  767         __vmread(GUEST_CS_SELECTOR, &sel);
>  768         __vmread(GUEST_CS_LIMIT,    &limit);
>  769         __vmread(GUEST_CS_BASE,     &reg->base);
>  770         __vmread(GUEST_CS_AR_BYTES, &attr);
>  771         break;
>
> then:
>
>  832     reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
>
> This is why my userspace version of vmx_guest_x86_mode() (which uses
> hwCpu.cs_arbytes from a struct hvm_hw_cpu hwCpu filled by
> xc_domain_hvm_getcontext_partial()) does not work properly (it always
> ends up returning 2, for both 32-bit guests - where it should return 4,
> and 64-bit guests - where it should return 8).
>
> So this solution would appear to be a bit more involved than the initial
> solution. But you're, of course, right that guest_x86_mode should not be
> VMX-specific.
>
> Would it be OK if I replaced the call to vmx_guest_x86_mode() with a
> call to hvm_funcs.guest_x86_mode(v) (assuming that's possible)?

That would still turn a Xen internal into a part of the ABI, which
should be avoided.

seg.attr.bytes is our architectural representation of segment selector
state, so you should follow the same method as hvm_hw_cpu.  This means
that you should find the LMA bit in bit 9 of the available cs_arbytes.
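
As a purely illustrative sketch, a userspace equivalent working only on
hvm_hw_cpu fields (as defined in the public save.h header) would mirror
vmx_guest_x86_mode() but use the architectural arbytes packing, i.e.
CS.L at bit 9 and CS.D/B at bit 10:

static int guest_x86_mode_from_ctxt(const struct hvm_hw_cpu *c)
{
    if ( !(c->cr0 & 0x1) )                        /* CR0.PE clear */
        return 0;
    if ( c->rflags & (1ULL << 17) )               /* EFLAGS.VM set */
        return 1;
    if ( (c->msr_efer & (1ULL << 10)) &&          /* EFER.LMA */
         (c->cs_arbytes & (1U << 9)) )            /* CS.L */
        return 8;
    return (c->cs_arbytes & (1U << 10)) ? 4 : 2;  /* CS.D/B */
}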

~Andrew

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-09  8:02       ` Razvan Cojocaru
@ 2014-07-23  7:56         ` Jan Beulich
  2014-07-23  8:03           ` Razvan Cojocaru
  0 siblings, 1 reply; 64+ messages in thread
From: Jan Beulich @ 2014-07-23  7:56 UTC (permalink / raw)
  To: Razvan Cojocaru; +Cc: Andrew Cooper, tim, xen-devel

>>> On 09.07.14 at 10:02, <rcojocaru@bitdefender.com> wrote:
> On 07/02/2014 06:43 PM, Jan Beulich wrote:
>>>>> On 02.07.14 at 17:35, <andrew.cooper3@citrix.com> wrote:
>>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>>> @@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 
> msr, int type)
>>>>      if ( msr_bitmap == NULL )
>>>>          return;
>>>>  
>>>> +    /* Filter out MSR-s needed by the memory introspection engine */
>>>> +    switch ( msr )
>>>> +    {
>>>> +    case MSR_IA32_SYSENTER_EIP:
>>>> +    case MSR_IA32_SYSENTER_ESP:
>>>> +    case MSR_IA32_SYSENTER_CS:
>>>> +    case MSR_IA32_MC0_CTL:
>>>> +    case MSR_STAR:
>>>> +    case MSR_LSTAR:
>>>> +
>>>
>>> Given the performance implications of forcing interception of these
>>> MSRs, it would be gated on mem_access being active for the domain.
>> 
>> Absolutely.
> 
> Unfortunately the call to vmx_disable_intercept_for_msr() happens _very_
> early, and by the time our application gets to enable mem_access on the
> domain, the interception for these MSRs has already been disabled, with
> unacceptable consequences.
> 
> I've tested this with an "if (
> mem_event_check_ring(&d->mem_event->access) )" test.
> 
> Also, ideally we'd like to be able to start monitoring an already
> started domain, and in that case the mem_access test would be useless
> even considering a workaround for the case above.

All understood, but not penalizing non-monitored VMs certainly has
higher priority.

Jan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events
  2014-07-23  7:56         ` Jan Beulich
@ 2014-07-23  8:03           ` Razvan Cojocaru
  0 siblings, 0 replies; 64+ messages in thread
From: Razvan Cojocaru @ 2014-07-23  8:03 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, tim, xen-devel

On 07/23/2014 10:56 AM, Jan Beulich wrote:
>>>> On 09.07.14 at 10:02, <rcojocaru@bitdefender.com> wrote:
>> On 07/02/2014 06:43 PM, Jan Beulich wrote:
>>>>>> On 02.07.14 at 17:35, <andrew.cooper3@citrix.com> wrote:
>>>> On 02/07/14 14:33, Razvan Cojocaru wrote:
>>>>> @@ -700,6 +700,25 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 
>> msr, int type)
>>>>>      if ( msr_bitmap == NULL )
>>>>>          return;
>>>>>  
>>>>> +    /* Filter out MSR-s needed by the memory introspection engine */
>>>>> +    switch ( msr )
>>>>> +    {
>>>>> +    case MSR_IA32_SYSENTER_EIP:
>>>>> +    case MSR_IA32_SYSENTER_ESP:
>>>>> +    case MSR_IA32_SYSENTER_CS:
>>>>> +    case MSR_IA32_MC0_CTL:
>>>>> +    case MSR_STAR:
>>>>> +    case MSR_LSTAR:
>>>>> +
>>>>
>>>> Given the performance implications of forcing interception of these
>>>> MSRs, it would be gated on mem_access being active for the domain.
>>>
>>> Absolutely.
>>
>> Unfortunately the call to vmx_disable_intercept_for_msr() happens _very_
>> early, and by the time our application gets to enable mem_access on the
>> domain, the interception for these MSRs has already been disabled, with
>> unacceptable consequences.
>>
>> I've tested this with an "if (
>> mem_event_check_ring(&d->mem_event->access) )" test.
>>
>> Also, ideally we'd like to be able to start monitoring an already
>> started domain, and in that case the mem_access test would be useless
>> even considering a workaround for the case above.
> 
> All understood, but not penalizing non-monitored VMs has certainly
> higher priority.

Got it. I've already changed the code, but I'm waiting on a few other
things before resubmitting the series. As far as this patch goes, the HV
now only refuses to disable interception for the interesting MSRs if
mem_access is active for the domain, and always enables interception for
them on XEN_DOMCTL_MEM_EVENT_OP_ACCESS in mem_event_domctl().
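
Structurally, the reworked check is roughly the sketch below - the helper
name is mine, for illustration only; the MSR list and the mem_event ring
test are the ones already quoted in this thread:

static bool_t msr_needed_by_introspection(struct domain *d, u32 msr)
{
    /* Same test as used earlier to detect an active mem_access ring. */
    if ( !mem_event_check_ring(&d->mem_event->access) )
        return 0;

    switch ( msr )
    {
    case MSR_IA32_SYSENTER_EIP:
    case MSR_IA32_SYSENTER_ESP:
    case MSR_IA32_SYSENTER_CS:
    case MSR_IA32_MC0_CTL:
    case MSR_STAR:
    case MSR_LSTAR:
        return 1;
    }

    return 0;
}

vmx_disable_intercept_for_msr() bails out early when this returns 1, and
the XEN_DOMCTL_MEM_EVENT_OP_ACCESS path in mem_event_domctl() re-enables
the intercepts, which also covers attaching to an already-running domain.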


Thanks,
Razvan Cojocaru

^ permalink raw reply	[flat|nested] 64+ messages in thread

end of thread, other threads:[~2014-07-23  8:03 UTC | newest]

Thread overview: 64+ messages
2014-07-02 13:33 [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Razvan Cojocaru
2014-07-02 13:33 ` [PATCH RFC 2/9] xen: Optimize introspection access to guest state Razvan Cojocaru
2014-07-02 15:31   ` Andrew Cooper
2014-07-07 14:50     ` Razvan Cojocaru
2014-07-10  8:05     ` Razvan Cojocaru
2014-07-10  8:17       ` Andrew Cooper
2014-07-10  8:23         ` Razvan Cojocaru
2014-07-10 11:57         ` Razvan Cojocaru
2014-07-10 12:16           ` Razvan Cojocaru
2014-07-10 13:01           ` Andrew Cooper
2014-07-02 15:37   ` Jan Beulich
2014-07-03  8:12     ` Razvan Cojocaru
2014-07-03  8:54       ` Jan Beulich
2014-07-02 13:33 ` [PATCH RFC 3/9] xen: Force-enable relevant MSR events; optimize the number of sent MSR events Razvan Cojocaru
2014-07-02 15:35   ` Andrew Cooper
2014-07-02 15:43     ` Jan Beulich
2014-07-09  8:02       ` Razvan Cojocaru
2014-07-23  7:56         ` Jan Beulich
2014-07-23  8:03           ` Razvan Cojocaru
2014-07-02 13:33 ` [PATCH RFC 4/9] xenctrl: Make the headers C++ friendly Razvan Cojocaru
2014-07-02 15:37   ` Andrew Cooper
2014-07-02 13:33 ` [PATCH RFC 5/9] xen: Support for VMCALL mem_events Razvan Cojocaru
2014-07-02 15:47   ` Jan Beulich
2014-07-02 15:54     ` Razvan Cojocaru
2014-07-02 16:11       ` Jan Beulich
2014-07-02 16:23         ` Razvan Cojocaru
2014-07-03  6:28           ` Jan Beulich
2014-07-03  7:29             ` Razvan Cojocaru
2014-07-02 15:54   ` Andrew Cooper
2014-07-02 15:59     ` Razvan Cojocaru
2014-07-02 13:33 ` [PATCH RFC 6/9] xen, libxc: Request page fault injection via libxc Razvan Cojocaru
2014-07-02 15:51   ` Jan Beulich
2014-07-02 16:00     ` Andrew Cooper
2014-07-02 16:58       ` Mihai Donțu
2014-07-02 17:07         ` Andrew Cooper
2014-07-03  8:23           ` Mihai Donțu
2014-07-03  9:32             ` Andrew Cooper
2014-07-03  9:40               ` Razvan Cojocaru
2014-07-02 16:06     ` Razvan Cojocaru
2014-07-02 16:13       ` Jan Beulich
2014-07-02 13:33 ` [PATCH RFC 7/9] xen: Handle resumed instruction based on previous mem_event reply Razvan Cojocaru
2014-07-02 15:56   ` Jan Beulich
2014-07-03  8:55     ` Razvan Cojocaru
2014-07-03  9:02       ` Jan Beulich
2014-07-03  9:12         ` Razvan Cojocaru
2014-07-03  9:18           ` Andrew Cooper
2014-07-03  9:22           ` Jan Beulich
2014-07-03  9:34             ` Razvan Cojocaru
2014-07-03 10:14               ` Jan Beulich
2014-07-02 13:34 ` [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults Razvan Cojocaru
2014-07-02 16:04   ` Andrew Cooper
2014-07-02 13:34 ` [PATCH RFC 9/9] mm: mark pages that have their permissions controlled by a domain Razvan Cojocaru
2014-07-03 10:19   ` Jan Beulich
2014-07-03 11:27     ` Razvan Cojocaru
2014-07-03 12:15       ` Jan Beulich
2014-07-02 15:20 ` [PATCH RFC 1/9] xen: Emulate with no writes; compute current instruction length Andrew Cooper
2014-07-03  7:42   ` Razvan Cojocaru
2014-07-02 15:21 ` Jan Beulich
2014-07-02 15:43   ` Razvan Cojocaru
2014-07-02 16:08     ` Jan Beulich
2014-07-02 16:18       ` Razvan Cojocaru
2014-07-03  6:24         ` Jan Beulich
2014-07-03  7:38   ` Razvan Cojocaru
2014-07-03  8:05     ` Jan Beulich
